In [1]:
import pandas as pd
import numpy as np
import nltk
from os import path
import re
import libs as ft
from sklearn.metrics.pairwise import cosine_similarity
#from pyfasttext import FastText
#from gensim.models.wrappers import FastText
import fasttext as ft


import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Si\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Si\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
def cleansing(x, drop_tag, tag_pos, lemmatizer):
    """
    いらない品詞を除外し，レンマ化して返す．apply関数内で使用，

    Args:
        x (Series): apply関数で呼び出されるSeries
        drop_tag (list): いらない品詞リスト(nltk)
        tag_pos (dict): key -> tag, value -> pos. レンマ化の精度向上に使用．
        lemmatizer (nltk.stem.WordNetLemmatizer): lemmatizer

    Returns:
        (str): output sentence
    """
    words = [word for word in x['headline_text'].split(' ') if word != '']  # 空文字入るとエラーになる
    tags = nltk.pos_tag(words)  # 品詞を取得
    words = [(word, tag_pos[tag]) for word, tag in tags if tag not in drop_tag]  # いらない品詞を除外
    print('words:',words)
    words = [lemmatizer.lemmatize(word, pos=pos) for word, pos in words]
    sentence = ' '.join(words)  # 連結
    return sentence


In [3]:
def preprocess(data):
    """
    前処理の関数．

    Args:
        data (DataFrame): input dataset

    Retruns:
        (DataFrame): output dataset
    """
    # まずは，いらない品詞を落とし，レンマ化する．
    # その後，階層クラスタリングのときに使う用のcsvファイルとモデル学習用のtxtファイルを出力する．
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # いらない品詞
    drop_tag = ['$', 'CC', 'CD', 'DT', 'IN', 'MD', 'POS', 'PRP', 'PRP$', 'RP', 'TO' , 'WP', 'WRB']
    # 品詞とpos(lemma用)の変換辞書
    tag_pos = {'FW': 'n', 'JJ': 'a', 'JJR': 'a', 'JJS': 'a', 'NN': 'n', 'NNP': 'n', 'NNS': 'n', 'RB': 'r', 'RBR': 'r', 'VB': 'v',
               'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v', 'RBS': 'r',}

    #data = data.assign(preprocessed=data.apply(func=cleansing, axis=1, args=(drop_tag, tag_pos, lemmatizer,)))
    data = data.assign(preprocessed=data.apply(func=cleansing, axis=1, args=(drop_tag, tag_pos, lemmatizer)))

    print('after drop and lemmatization')
    print(data.head())
    data.to_csv('data2.csv', sep='\t', index=False)
    data['preprocessed'].to_csv('text2.txt', index=False)
    return data

In [4]:
def get_word_vector(data_name='text.txt', model_name='./pretrained_model/model.bin'):
    """
    fasttextベースで分散表現を取得する関数．これも見てわかると思うので引数は省略．

    Returns:
        (list of list): 単語リストのリスト．[['word_0_0', 'word_0_1'], ['word_1_0', 'word_1_1', 'word_1_2'], ...]みたいな
        (array): 分散表現 次元=(文章数×分散表現の次元数)
    """
    sentences = []
    with open(data_name, mode='r') as f:
        for line in f.readlines():
            line = re.sub('\n', '', line)
            sentences.append(line.split(' '))

    # modelが12GBくらいメモリを食うので終わったら開放する．
    vec_name =  'sentences_vc2.npy'
    if not path.exists(vec_name):
        #model = FastText.load_fasttext_format(model_name)
        model = ft.load_model(model_name)
        dim = model.get_dimension()
        sentences_vec = np.zeros((dim,))

        for words in sentences:
            vec = np.zeros((dim,))
            for word in words:
                if model.get_word_id(word) == -1:
                    print('this word does not exists in corpus: %s at %s' % (word, words))
                vec = np.vstack((vec, model.get_word_vector(word)))
            vec = vec[1:, :].mean(axis=0)
            sentences_vec = np.vstack((sentences_vec, vec))
        sentences_vec = sentences_vec[1:, :]
        del model

        np.save(vec_name, sentences_vec)
    else:
        sentences_vec = np.load(vec_name)
    return sentences, sentences_vec

In [5]:
if __name__ == '__main__':
    np.random.seed(123)
    # ランダムに50個の記事の題名を取得
    data = pd.read_csv('./abcnews-date-text.csv')
    rand_index = np.random.randint(0, data.shape[0], 400)
    data = data.iloc[rand_index, 1]
data

773630                   holman heroics save socceroos skins
277869        cultural background affects ones health report
28030                row brews over radiation therapy delays
1066306    liberal mp isobel redmond to retire at 2018 st...
194278       waste plan review to consider recycling options
                                 ...                        
646698                         act election countdown begins
353514         search underway for people missing in sa bush
773379         pervez musharraf back in pakistan after exile
928084     man to front court accused of shooting man in ...
655905                               minister backs dec head
Name: headline_text, Length: 400, dtype: object

In [6]:
type(data), data.shape

(pandas.core.series.Series, (400,))

In [7]:
if __name__ == '__main__':
    np.random.seed(123)
    # ランダムに50個の記事の題名を取得
    data = pd.read_csv('./abcnews-date-text.csv')
    rand_index = np.random.randint(0, data.shape[0], 400)
    data = data.iloc[rand_index, 1]
    print('=>raw data:')
    print('=>data.head:', data.head())

    # sent = ' '.join(list(data))
    # words = nltk.word_tokenize(sent)
    # tags = nltk.pos_tag(words)
    # tags = sorted(list(set([tag for word, tag in tags])))
    # for i in tags:
        # print(nltk.help.upenn_tagset(i))
    data = preprocess(pd.DataFrame(data))

    sentences, vec = get_word_vector()
    print(vec.shape)
    #get_similar_sentence(data.iloc[0, 1], data, sentences, vec, 5)

=>raw data:
=>data.head: 773630                   holman heroics save socceroos skins
277869        cultural background affects ones health report
28030                row brews over radiation therapy delays
1066306    liberal mp isobel redmond to retire at 2018 st...
194278       waste plan review to consider recycling options
Name: headline_text, dtype: object
words: [('holman', 'n'), ('heroics', 'n'), ('save', 'v'), ('socceroos', 'n'), ('skins', 'n')]
words: [('cultural', 'a'), ('background', 'n'), ('affects', 'v'), ('ones', 'n'), ('health', 'n'), ('report', 'n')]
words: [('row', 'n'), ('brews', 'n'), ('radiation', 'n'), ('therapy', 'n'), ('delays', 'n')]
words: [('liberal', 'a'), ('mp', 'n'), ('isobel', 'n'), ('redmond', 'n'), ('retire', 'v'), ('state', 'n'), ('election', 'n')]
words: [('waste', 'n'), ('plan', 'n'), ('review', 'v'), ('consider', 'v'), ('recycling', 'v'), ('options', 'n')]
words: [('png', 'a'), ('government', 'n'), ('tactics', 'n'), ("'cowardly", 'r')]
words: [('loc

words: [('national', 'a'), ('disability', 'n'), ('insurance', 'n'), ('scheme', 'n'), ('promise', 'n'), ('check', 'n')]
words: [('gillard', 'n'), ('denies', 'v'), ('scenes', 'n'), ('deal', 'n'), ('greens', 'n')]
words: [('australian', 'a'), ('share', 'n'), ('market', 'n'), ('opens', 'v'), ('higher', 'a')]
words: [('bland', 'n'), ('mou', 'n')]
words: [('wagga', 'a'), ('service', 'n'), ('remembers', 'n'), ('army', 'v'), ('training', 'v'), ('accident', 'n')]
words: [('adam', 'n'), ('giles', 'n'), ('plan', 'v'), ('dump', 'v'), ('bail', 'n'), ('youths', 'n'), ('slammed', 'v'), ('lawyers', 'n')]
words: [('interview', 'n'), ('broadbent', 'n')]
words: [('push', 'n'), ('farmers', 'n'), ('get', 'v'), ('storm', 'n'), ('aid', 'n')]
words: [('vic', 'a'), ('poppies', 'n'), ('ruled', 'v')]
words: [('council', 'n'), ('orders', 'n'), ('environmental', 'a'), ('probe', 'n'), ('proposed', 'v'), ('marina', 'n')]
words: [('abc', 'n'), ('entertainment', 'n')]
words: [('matautaavas', 'n'), ('amazing', 'v'), ('

words: [('go', 'v'), ('unbeaten', 'a'), ('ponting', 'n')]
words: [('interview', 'n'), ('isaac', 'n'), ('luke', 'n')]
words: [('asbestos', 'n'), ('still', 'r'), ('massive', 'a'), ('problem', 'n'), ('nsw', 'a'), ('north', 'a'), ('coast', 'n')]
words: [('zimbabwe', 'n'), ('results', 'n'), ('not', 'r'), ('released', 'v'), ('sub', 'a'), ('judice', 'n'), ('grounds', 'n')]
words: [('hewitt', 'n'), ('cleared', 'v'), ('doubts', 'n'), ('remain', 'v')]
words: [('gunfire', 'n'), ('erupts', 'n'), ('south', 'a'), ('sudan', 'n'), ('ceasefire', 'n'), ('talks', 'n'), ('begin', 'v')]
words: [('colorado', 'n'), ('group', 'n'), ('sale', 'n')]
words: [('martyn', 'n'), ('skipper', 'v'), ('warriors', 'n')]
words: [('queensland', 'n'), ('wont', 'n'), ('give', 'v'), ('bid', 'n'), ('eradicate', 'v'), ('cattle', 'n')]
words: [('iron', 'n'), ('firm', 'n'), ('wary', 'n'), ('resources', 'n'), ('tax', 'n')]
words: [('cats', 'n'), ('take', 'v'), ('points', 'n'), ('fighting', 'v'), ('tigers', 'n')]
words: [('energy', 

(401, 300)


In [8]:
data

Unnamed: 0,headline_text,preprocessed
773630,holman heroics save socceroos skins,holman heroic save socceroos skin
277869,cultural background affects ones health report,cultural background affect one health report
28030,row brews over radiation therapy delays,row brew radiation therapy delay
1066306,liberal mp isobel redmond to retire at 2018 st...,liberal mp isobel redmond retire state election
194278,waste plan review to consider recycling options,waste plan review consider recycle option
...,...,...
646698,act election countdown begins,act election countdown begin
353514,search underway for people missing in sa bush,search underway people miss sa bush
773379,pervez musharraf back in pakistan after exile,pervez musharraf back pakistan exile
928084,man to front court accused of shooting man in ...,man front court accuse shoot man face


In [9]:

df_vec = pd.DataFrame(vec)
df_vec

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.025192,0.100360,-0.220985,0.216720,-0.375728,-0.136017,0.469670,-0.395489,0.293971,-0.341170,...,0.104766,-0.089663,-0.246274,0.206871,0.549405,-0.171528,-0.151983,0.137993,0.223821,0.065391
1,-0.042216,0.007807,-0.323374,0.239274,-0.171238,-0.051610,-0.085143,0.053878,-0.024637,0.036788,...,-0.028024,-0.056563,0.112036,-0.002028,-0.073680,-0.021632,-0.097797,0.187835,0.106588,0.096950
2,-0.092368,-0.189523,-0.009720,0.183215,-0.029319,-0.023563,-0.049403,-0.176174,0.098500,0.145629,...,0.094414,0.133662,0.143568,-0.100534,0.055244,-0.067328,-0.073239,0.133813,0.188375,-0.091567
3,-0.069223,0.120941,-0.256029,0.106836,-0.199290,-0.129381,0.045415,-0.080313,0.204904,0.049175,...,0.124354,-0.031129,0.090552,-0.103008,-0.142635,-0.183100,0.087854,0.279785,0.337483,0.008798
4,-0.084492,-0.263232,-0.093723,0.007081,0.005129,-0.178735,0.145290,-0.264792,-0.215830,0.267448,...,0.167543,0.006900,0.185153,0.171885,0.002973,-0.154298,-0.139211,-0.134581,0.106265,0.065847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,-0.092496,-0.161918,0.003437,-0.000338,-0.166313,0.000480,-0.029617,-0.036378,-0.172753,-0.019287,...,0.189887,0.086398,-0.027365,0.004651,-0.029130,-0.032107,-0.197571,-0.112846,0.156353,0.233766
397,-0.211835,-0.118695,-0.201009,0.076824,-0.137734,-0.019259,0.097436,-0.302849,-0.101496,-0.075175,...,0.010582,0.061602,-0.042718,-0.019891,-0.079696,-0.068846,-0.277174,0.194296,0.054312,-0.026135
398,0.097125,-0.312537,0.036275,0.334059,-0.105306,-0.130261,-0.055160,-0.418767,-0.419265,0.156148,...,-0.241456,0.164796,0.060799,-0.306069,0.421134,-0.226880,-0.390187,-0.001768,-0.145094,0.127011
399,-0.071388,-0.058593,-0.179957,0.154407,-0.176266,0.012906,-0.027642,-0.047768,0.063605,0.137224,...,0.031463,-0.054614,0.078948,-0.000712,0.001410,-0.121144,-0.231274,0.182183,0.064684,0.041597


In [10]:
len(sentences), sentences

(401,
 [['preprocessed'],
  ['holman', 'heroic', 'save', 'socceroos', 'skin'],
  ['cultural', 'background', 'affect', 'one', 'health', 'report'],
  ['row', 'brew', 'radiation', 'therapy', 'delay'],
  ['liberal', 'mp', 'isobel', 'redmond', 'retire', 'state', 'election'],
  ['waste', 'plan', 'review', 'consider', 'recycle', 'option'],
  ['png', 'government', 'tactic', "'cowardly"],
  ['local', 'council', 'advocate', 'social', 'change'],
  ['climber', 'scale', 'trump', 'tower', 'suction', 'cap'],
  ['larry', 'nassars', 'huge', 'sentence', 'feel', 'victory', 'too'],
  ['security', 'camera', 'help', 'combat', 'stock', 'theft'],
  ['brazil', 'court', 'appeal', 'athens', 'marathon', 'gold'],
  ['lake', 'blue', 'green', 'algae', 'threat', 'remains'],
  ['custom', 'software', 'stay', 'ellison', 'say'],
  ['litchfield', 'take', 'drl', 'premiership'],
  ['bronco', 'go', 'bang', 'whimper'],
  ['geraldton', 'welcome', 'cctv', 'funding'],
  ['minister', 'reassure', 'qld', 'rail', 'wind', 'back'],
  

In [11]:
#RNN => feature vector
# method1: average of dataframe
## this will ignore time sequence, just like CNN classifier, only get the vector

In [12]:
df_vec

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.025192,0.100360,-0.220985,0.216720,-0.375728,-0.136017,0.469670,-0.395489,0.293971,-0.341170,...,0.104766,-0.089663,-0.246274,0.206871,0.549405,-0.171528,-0.151983,0.137993,0.223821,0.065391
1,-0.042216,0.007807,-0.323374,0.239274,-0.171238,-0.051610,-0.085143,0.053878,-0.024637,0.036788,...,-0.028024,-0.056563,0.112036,-0.002028,-0.073680,-0.021632,-0.097797,0.187835,0.106588,0.096950
2,-0.092368,-0.189523,-0.009720,0.183215,-0.029319,-0.023563,-0.049403,-0.176174,0.098500,0.145629,...,0.094414,0.133662,0.143568,-0.100534,0.055244,-0.067328,-0.073239,0.133813,0.188375,-0.091567
3,-0.069223,0.120941,-0.256029,0.106836,-0.199290,-0.129381,0.045415,-0.080313,0.204904,0.049175,...,0.124354,-0.031129,0.090552,-0.103008,-0.142635,-0.183100,0.087854,0.279785,0.337483,0.008798
4,-0.084492,-0.263232,-0.093723,0.007081,0.005129,-0.178735,0.145290,-0.264792,-0.215830,0.267448,...,0.167543,0.006900,0.185153,0.171885,0.002973,-0.154298,-0.139211,-0.134581,0.106265,0.065847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,-0.092496,-0.161918,0.003437,-0.000338,-0.166313,0.000480,-0.029617,-0.036378,-0.172753,-0.019287,...,0.189887,0.086398,-0.027365,0.004651,-0.029130,-0.032107,-0.197571,-0.112846,0.156353,0.233766
397,-0.211835,-0.118695,-0.201009,0.076824,-0.137734,-0.019259,0.097436,-0.302849,-0.101496,-0.075175,...,0.010582,0.061602,-0.042718,-0.019891,-0.079696,-0.068846,-0.277174,0.194296,0.054312,-0.026135
398,0.097125,-0.312537,0.036275,0.334059,-0.105306,-0.130261,-0.055160,-0.418767,-0.419265,0.156148,...,-0.241456,0.164796,0.060799,-0.306069,0.421134,-0.226880,-0.390187,-0.001768,-0.145094,0.127011
399,-0.071388,-0.058593,-0.179957,0.154407,-0.176266,0.012906,-0.027642,-0.047768,0.063605,0.137224,...,0.031463,-0.054614,0.078948,-0.000712,0.001410,-0.121144,-0.231274,0.182183,0.064684,0.041597


In [13]:
df_ = df_vec.describe()
df_

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
count,401.0,401.0,401.0,401.0,401.0,401.0,401.0,401.0,401.0,401.0,...,401.0,401.0,401.0,401.0,401.0,401.0,401.0,401.0,401.0,401.0
mean,-0.095547,-0.074184,-0.125072,0.171894,-0.133329,0.013321,0.030053,-0.148463,-0.044789,0.105181,...,0.088252,-0.013725,0.07987,0.019978,-0.014581,-0.11568,-0.101476,0.154656,0.103135,0.028297
std,0.114695,0.126309,0.116858,0.111393,0.104305,0.120401,0.110572,0.127969,0.12715,0.117445,...,0.116976,0.130968,0.118389,0.110495,0.11498,0.120257,0.10184,0.126808,0.100834,0.111167
min,-0.450379,-0.632769,-0.443189,-0.215559,-0.519985,-0.343233,-0.332102,-0.573074,-0.718343,-0.374328,...,-0.298984,-0.505446,-0.246274,-0.3306,-0.355333,-0.463338,-0.390187,-0.465604,-0.177589,-0.319669
25%,-0.168978,-0.143553,-0.204127,0.097889,-0.195826,-0.0688,-0.043687,-0.225703,-0.120792,0.035194,...,0.020884,-0.089395,-0.008355,-0.056401,-0.091261,-0.194381,-0.17252,0.089996,0.041636,-0.041916
50%,-0.097062,-0.064573,-0.130745,0.162787,-0.134583,0.012159,0.018698,-0.144323,-0.041444,0.114958,...,0.096158,-0.014218,0.08536,0.015907,-0.014986,-0.114595,-0.101267,0.157398,0.106265,0.031263
75%,-0.018767,0.001076,-0.048366,0.242841,-0.075525,0.097859,0.105859,-0.063553,0.033848,0.186209,...,0.164557,0.056644,0.155184,0.091832,0.05073,-0.038328,-0.040372,0.231205,0.16663,0.099613
max,0.405031,0.314218,0.35687,0.514291,0.207358,0.365577,0.46967,0.333786,0.310687,0.450027,...,0.553675,0.49143,0.599578,0.471221,0.549405,0.21092,0.218374,0.471699,0.36546,0.44684


In [14]:
df_mean = df_.loc['mean']
df_mean

0     -0.095547
1     -0.074184
2     -0.125072
3      0.171894
4     -0.133329
         ...   
295   -0.115680
296   -0.101476
297    0.154656
298    0.103135
299    0.028297
Name: mean, Length: 300, dtype: float64

In [15]:
df_std = df_.loc['std']
df_std

0      0.114695
1      0.126309
2      0.116858
3      0.111393
4      0.104305
         ...   
295    0.120257
296    0.101840
297    0.126808
298    0.100834
299    0.111167
Name: std, Length: 300, dtype: float64

In [18]:
path

<module 'ntpath' from 'C:\\anaconda3\\envs\\fasttext\\lib\\ntpath.py'>

In [17]:
print(path.exists)

<function exists at 0x000001E10517D678>


In [16]:
vec_name =  'sentences_vec.npy'
if not path.exists():
    print('true')

TypeError: exists() missing 1 required positional argument: 'path'

In [None]:
ｃｃ

In [None]:
#read all txts  

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from matplotlib import pyplot as plt

from mpl_toolkits.mplot3d import Axes3D
from pathlib import Path

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from pathlib import Path
from sklearn.metrics import (roc_curve, auc, accuracy_score)
from sklearn.preprocessing import StandardScaler

import nltk

In [None]:
#homePC
#path_0 = Path(r'C:\Users\SI\Python_ML\python_NLTK\NLTK_')
#officePC
path_0 = Path(r'C:\Users\Si\Desktop\python_all\python_NLTK\NLTK_')


author = ['Poe','Twain']
type_ = ['letter', 'story']
test = ['test']

choice = [0,1]

index_i = 0
index_j = 1
path = str(path_0) + str('/') + str(author[choice[index_i]]) + str('-') + str(type_[choice[index_j]])
#path = str(path_0) + str('/') + str(test[0])
print(path)

In [None]:
#new_dir_path_csv = 'csv/' + str(test[choice[index_i]]) + str('_') 
#new_dir_path_png = 'png/' + str(test[choice[index_i]]) + str('_') 

#author_type = str(test[choice[index_i]]) + str('_')

new_dir_path_csv = 'csv/' + str(author[choice[index_i]]) + str('-') + str(type_[choice[index_j]])
new_dir_path_png = 'png/' + str(author[choice[index_i]]) + str('-') + str(type_[choice[index_j]])

author_type = str(author[choice[index_i]]) + str('-') + str(type_[choice[index_j]])

new_dir_path_png_top = 'png/' + author_type + '/top_ranking'
new_dir_path_png_top_no_punc = 'png/' + author_type + '/top_ranking_no_punc'
new_dir_path_png_top_no_punc_no_stopwords = 'png/' + author_type + '/top_ranking_no_punc_no_stopwords'
new_dir_path_png_dispersion_plot = 'png/' + author_type + '/dispersion_plot'
new_dir_path_png_biagram_plot  = 'png/' + author_type + '/biagram_plot'

def my_makedirs(path):
    if not os.path.isdir(path):
        os.makedirs(path)

my_makedirs(new_dir_path_csv)
my_makedirs(new_dir_path_png)
my_makedirs(new_dir_path_png_top)
my_makedirs(new_dir_path_png_top_no_punc)
my_makedirs(new_dir_path_png_top_no_punc_no_stopwords)

my_makedirs(new_dir_path_png_dispersion_plot)
my_makedirs(new_dir_path_png_biagram_plot)

new_dir_path_csv, new_dir_path_png, new_dir_path_png_top, new_dir_path_png_top_no_punc, new_dir_path_png_top_no_punc_no_stopwords,new_dir_path_png_dispersion_plot, new_dir_path_png_biagram_plot

In [None]:
path = Path(path)
path
path.glob('*.txt')
list(path.glob('*.txt'))
print('lists:', path.glob('*.txt'))
#list path
path = list(path.glob('*.txt'))
path

In [None]:
data = pd.read_table(path[0], encoding='cp932')
data

In [None]:
data.columns[0]

In [None]:
data_list = data[data.columns[0]].values.tolist()
len(data_list),data_list

In [None]:
data_list[0]

In [None]:
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

para = data_list[0]

punkt_params = PunktParameters()
punkt_params.abbrev_types = set(['Mr', 'Mrs', 'LLC'])
tokenizer = PunktSentenceTokenizer(punkt_params)
tokens = tokenizer.tokenize(para)

for t in tokens:
    print (t, "\n")

In [None]:
#break the sentences into sub-sentences

In [None]:
import re
from contextlib import redirect_stdout
from io import StringIO

example = 'Mary had a little lamb, Jack went up the hill, Jill followed suit, i woke up suddenly, it was a really bad dream...'

def token_to_sentence(str):
    f = StringIO()
    with redirect_stdout(f):
        regex_of_sentence = re.findall('([\w\s]{0,})[^\w\s]', str)
        regex_of_sentence = [x for x in regex_of_sentence if x is not '']
        for i in regex_of_sentence:
            print(i)
        first_step_to_sentence = (f.getvalue()).split('\n')
    g = StringIO()
    with redirect_stdout(g):
        for i in first_step_to_sentence:
            try:
                regex_to_clear_sentence = re.search('\s([\w\s]{0,})', i)
                print(regex_to_clear_sentence.group(1))
            except:
                print(i)
        sentence = (g.getvalue()).split('\n')
    return sentence

def token_to_words(str):
    f = StringIO()
    with redirect_stdout(f):
        for i in str:
            regex_of_word = re.findall('([\w]{0,})', i)
            regex_of_word = [x for x in regex_of_word if x is not '']
            for word in regex_of_word:
                print(regex_of_word)
        words = (f.getvalue()).split('\n')

In [None]:
sentence = token_to_sentence(data_list[0])
words = token_to_words(data_list[0])

In [None]:
sentence, words

In [None]:
df_sentence = pd.DataFrame(sentence)
df_sentence

In [None]:
df_sentence = df_sentence.rename(columns={0: 'headline_text'})
df_sentence

In [None]:
df_sentence = df_sentence[0:5]
df_sentence

In [None]:
col = pd.DataFrame(df_sentence).columns[0]
col

In [None]:
series = pd.DataFrame(df_sentence)['headline_text']
series, type(series), series.shape

In [None]:
df_sentence = preprocess(pd.DataFrame(series))

In [None]:
sentences, vec = get_word_vector()
print(vec.shape)

In [None]:
vec,sentences

In [None]:
df_sentence = preprocess(pd.DataFrame(series))

In [None]:
print(path.exists)

In [None]:
np.random.seed(123)
# ランダムに50個の記事の題名を取得
#data = pd.read_csv('./abcnews-date-text.csv')

rand_index = np.random.randint(0, data.shape[0], 500)
data = data.iloc[rand_index, 1]
print('raw data')
print(data.head())

# sent = ' '.join(list(data))
# words = nltk.word_tokenize(sent)
# tags = nltk.pos_tag(words)
# tags = sorted(list(set([tag for word, tag in tags])))
# for i in tags:
    # print(nltk.help.upenn_tagset(i))
data = preprocess(pd.DataFrame(data))

sentences, vec = get_word_vector()
print(vec.shape)
#get_similar_sentence(data.iloc[0, 1], data, sentences, vec, 5)

In [None]:
sentence_list = []
for index, sentence in enumerate(data_list):
    print(index, sentence)
    sentence_list.extend(sentence)

In [None]:
df_concat = pd.concat([df_test_list[j] for j in range(len(detector_mode))], axis=axis_)

In [None]:
#break the story into sentences

In [None]:
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

para = '''Either the well was very deep, or she fell very slowly, for she had plenty
of time as she went down to look about her and to wonder what was going to happen
next. First, she tried to look down and make out what she was coming to, but it was
too dark to see anything; then she looked at the sides of the well, and noticed
that they were filled with cupboards and book-shelves; here and there she saw maps
and pictures Mr...hung upon pegs. She took down a jar from one of the shelves as she
passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was
empty: she did not like to drop the jar for fear of killing somebody, so managed to
put it into one of the cupboards as she fell past it.'''

punkt_params = PunktParameters()
#punkt_params.abbrev_types = set(['Mr', 'Mrs', 'LLC'])
tokenizer = PunktSentenceTokenizer(punkt_params)
tokens = tokenizer.tokenize(para)

for t in tokens:
    print (t, "\n")

In [None]:
# ランダムに50個の記事の題名を取得
data = pd.read_csv('./abcnews-date-text.csv')
data

In [None]:
np.random.seed(123)
# ランダムに50個の記事の題名を取得
data = pd.read_csv('./abcnews-date-text.csv')

rand_index = np.random.randint(0, data.shape[0], 500)
data = data.iloc[rand_index, 1]
print('raw data')
print(data.head())

# sent = ' '.join(list(data))
# words = nltk.word_tokenize(sent)
# tags = nltk.pos_tag(words)
# tags = sorted(list(set([tag for word, tag in tags])))
# for i in tags:
    # print(nltk.help.upenn_tagset(i))
data = preprocess(pd.DataFrame(data))

sentences, vec = get_word_vector()
print(vec.shape)
#get_similar_sentence(data.iloc[0, 1], data, sentences, vec, 5)

In [None]:
# feature vector => SVM or clustering