In [1]:
import pandas as pd
import numpy as np
import nltk
from os import path
import re
import libs as ft
from sklearn.metrics.pairwise import cosine_similarity
#from pyfasttext import FastText
#from gensim.models.wrappers import FastText
import fasttext as ft


import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Si\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Si\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from matplotlib import pyplot as plt

from mpl_toolkits.mplot3d import Axes3D
from pathlib import Path

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from pathlib import Path
from sklearn.metrics import (roc_curve, auc, accuracy_score)
from sklearn.preprocessing import StandardScaler

import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters


In [3]:
def cleansing(x, drop_tag, tag_pos, lemmatizer):
    """
    いらない品詞を除外し，レンマ化して返す．apply関数内で使用，

    Args:
        x (Series): apply関数で呼び出されるSeries
        drop_tag (list): いらない品詞リスト(nltk)
        tag_pos (dict): key -> tag, value -> pos. レンマ化の精度向上に使用．
        lemmatizer (nltk.stem.WordNetLemmatizer): lemmatizer

    Returns:
        (str): output sentence
    """
    words = [word for word in x['headline_text'].split(' ') if word != '']  # 空文字入るとエラーになる
    tags = nltk.pos_tag(words)  # 品詞を取得
    words = [(word, tag_pos[tag]) for word, tag in tags if tag not in drop_tag]  # いらない品詞を除外
    print('words:',words)
    words = [lemmatizer.lemmatize(word, pos=pos) for word, pos in words]
    sentence = ' '.join(words)  # 連結
    return sentence

In [4]:
def preprocess(data):
    """
    前処理の関数．

    Args:
        data (DataFrame): input dataset

    Retruns:
        (DataFrame): output dataset
    """
    # まずは，いらない品詞を落とし，レンマ化する．
    # その後，階層クラスタリングのときに使う用のcsvファイルとモデル学習用のtxtファイルを出力する．
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # いらない品詞
    drop_tag = ['$', 'CC', 'CD', 'DT', 'IN', 'MD', 'POS', 'PRP', 'PRP$', 'RP', 'TO' , 'WP', 'WRB','WDT','PDT']
    # 品詞とpos(lemma用)の変換辞書
    tag_pos = {'FW': 'n', 'JJ': 'a', 'JJR': 'a', 'JJS': 'a', 'NN': 'n', 'NNP': 'n', 'NNS': 'n', 'RB': 'r', 'RBR': 'r', 'VB': 'v',
               'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v', 'RBS': 'r',}

    #data = data.assign(preprocessed=data.apply(func=cleansing, axis=1, args=(drop_tag, tag_pos, lemmatizer,)))
    data = data.assign(preprocessed=data.apply(func=cleansing, axis=1, args=(drop_tag, tag_pos, lemmatizer)))

    print('after drop and lemmatization')
    print(data.head())
    data.to_csv('data.csv', sep='\t', index=False)
    data['preprocessed'].to_csv('text.txt', index=False)
    return data

In [5]:
def get_word_vector(data_name='text.txt', model_name='./pretrained_model/model.bin'):
    """
    fasttextベースで分散表現を取得する関数．これも見てわかると思うので引数は省略．

    Returns:
        (list of list): 単語リストのリスト．[['word_0_0', 'word_0_1'], ['word_1_0', 'word_1_1', 'word_1_2'], ...]みたいな
        (array): 分散表現 次元=(文章数×分散表現の次元数)
    """
    sentences = []
    with open(data_name, mode='r') as f:
        for line in f.readlines():
            line = re.sub('\n', '', line)
            sentences.append(line.split(' '))

    # modelが12GBくらいメモリを食うので終わったら開放する．
    vec_name =  'sentences_vc.npy'
    if not path.exists(vec_name):
        #model = FastText.load_fasttext_format(model_name)
        model = ft.load_model(model_name)
        dim = model.get_dimension()
        sentences_vec = np.zeros((dim,))

        for words in sentences:
            vec = np.zeros((dim,))
            for word in words:
                if model.get_word_id(word) == -1:
                    print('this word does not exists in corpus: %s at %s' % (word, words))
                vec = np.vstack((vec, model.get_word_vector(word)))
            vec = vec[1:, :].mean(axis=0)
            sentences_vec = np.vstack((sentences_vec, vec))
        sentences_vec = sentences_vec[1:, :]
        del model

        np.save(vec_name, sentences_vec)
    else:
        sentences_vec = np.load(vec_name)
    return sentences, sentences_vec

In [6]:
import re
from contextlib import redirect_stdout
from io import StringIO

example = 'Mary had a little lamb, Jack went up the hill, Jill followed suit, i woke up suddenly, it was a really bad dream...'

def token_to_sentence(str):
    f = StringIO()
    with redirect_stdout(f):
        regex_of_sentence = re.findall('([\w\s]{0,})[^\w\s]', str)
        regex_of_sentence = [x for x in regex_of_sentence if x is not '']
        for i in regex_of_sentence:
            print(i)
        first_step_to_sentence = (f.getvalue()).split('\n')
    g = StringIO()
    with redirect_stdout(g):
        for i in first_step_to_sentence:
            try:
                regex_to_clear_sentence = re.search('\s([\w\s]{0,})', i)
                print(regex_to_clear_sentence.group(1))
            except:
                print(i)
        sentence = (g.getvalue()).split('\n')
    return sentence

def token_to_words(str):
    f = StringIO()
    with redirect_stdout(f):
        for i in str:
            regex_of_word = re.findall('([\w]{0,})', i)
            regex_of_word = [x for x in regex_of_word if x is not '']
            for word in regex_of_word:
                print(regex_of_word)
        words = (f.getvalue()).split('\n')

In [7]:
#homePC
#path_0 = Path(r'C:\Users\SI\Python_ML\python_NLTK\NLTK_')
#officePC
path_0 = Path(r'C:\Users\Si\Desktop\python_all\python_NLTK\NLTK_')


author = ['Poe','Twain']
type_ = ['letter', 'story']
test = ['test']

choice = [0,1]

index_i = 0
index_j = 1
path_file = str(path_0) + str('/') + str(author[choice[index_i]]) + str('-') + str(type_[choice[index_j]])
#path = str(path_0) + str('/') + str(test[0])
print(path_file)

C:\Users\Si\Desktop\python_all\python_NLTK\NLTK_/Poe-story


In [8]:
path_file = Path(path_file)
path_file
path_file.glob('*.txt')
list(path_file.glob('*.txt'))
print('lists:', path_file.glob('*.txt'))
#list path
path_list = list(path_file.glob('*.txt'))
path_list

lists: <generator object Path.glob at 0x0000021207FD76C8>


[WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-001.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-002.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-003.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-004.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-005.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-006.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-007.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-008.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-009.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-010.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLT

In [9]:
data = pd.read_table(path_list[0], encoding='cp932')
data

Unnamed: 0,THE SWISS BELL-RINGERS
0,"One of the regular allies of the Mirror, a man..."
1,The Swiss Bell-ringers. The readers of the Mi...


In [10]:
data_list = data[data.columns[0]].values.tolist()
len(data_list),data_list

(2,
 ['One of the regular allies of the Mirror, a man of a very humorous critical vein, has taken it into his head to prove the Swiss Bell-ringers to be an automaton. We have argued the point with him till we are tired, and have at last sent to beg a copy of their board-bill with affidavits that their stomachs are not wooden and do kindly entertain rolls and sausages. While these documents are coming, we publish the skeleton of our friend’s hypothesis: ',
  'The Swiss Bell-ringers.  The readers of the Mirror scarce need be told,  as most of them have seen and heard for themselves,  that the Swiss Bell-ringers enter, to the number of seven, white-plumed and fancifully costumed, and each armed with four or five hand-bells of various sizes, which they deposit on a cushioned table before them, retaining one in each hand, which they are continually changing for others in their armory, putting down and taking up with the rapidity of jugglers, and all the while ringing the changes upon them w

In [11]:
sentence = token_to_sentence(data_list[1])

sentence

['Swiss Bell',
 'ringers',
 ' The readers of the Mirror scarce need be told',
 ' as most of them have seen and heard for themselves',
 ' that the Swiss Bell',
 'enter',
 'to the number of seven',
 'white',
 'and fancifully costumed',
 'and each armed with four or five hand',
 'of various sizes',
 'which they deposit on a cushioned table before them',
 'retaining one in each hand',
 'which they are continually changing for others in their armory',
 'putting down and taking up with the rapidity of jugglers',
 'and all the while ringing the changes upon them with a delicate harmony and precision',
 'which are as perfect in a symphony of Haydn as in ',
 'Lucy Long',
 'writer alludes to them now only to say',
 'that they may be heard again to',
 'night',
 'and to correct the erroneous but common idea that these Bell',
 'are real living beings',
 'The writer is firmly convinced that they are ingenious pieces of mechanism',
 'contrived on the principle of Maelzel',
 'Automaton Trumpeter and P

In [12]:
df_sentence = pd.DataFrame(sentence)
df_sentence = df_sentence.rename(columns={0: 'headline_text'})
df_sentence

Unnamed: 0,headline_text
0,Swiss Bell
1,ringers
2,The readers of the Mirror scarce need be told
3,as most of them have seen and heard for thems...
4,that the Swiss Bell
5,enter
6,to the number of seven
7,white
8,and fancifully costumed
9,and each armed with four or five hand


In [13]:
df_sentence = df_sentence[:]
df_sentence

Unnamed: 0,headline_text
0,Swiss Bell
1,ringers
2,The readers of the Mirror scarce need be told
3,as most of them have seen and heard for thems...
4,that the Swiss Bell
5,enter
6,to the number of seven
7,white
8,and fancifully costumed
9,and each armed with four or five hand


In [14]:
col = pd.DataFrame(df_sentence).columns[0]
col

'headline_text'

In [15]:
series = pd.DataFrame(df_sentence)['headline_text']
series, type(series), series.shape

(0                                            Swiss Bell
 1                                               ringers
 2         The readers of the Mirror scarce need be told
 3      as most of them have seen and heard for thems...
 4                                   that the Swiss Bell
 5                                                 enter
 6                                to the number of seven
 7                                                 white
 8                               and fancifully costumed
 9                 and each armed with four or five hand
 10                                     of various sizes
 11    which they deposit on a cushioned table before...
 12                           retaining one in each hand
 13    which they are continually changing for others...
 14    putting down and taking up with the rapidity o...
 15    and all the while ringing the changes upon the...
 16    which are as perfect in a symphony of Haydn as...
 17                            

In [16]:
df_sentence = preprocess(pd.DataFrame(series))

words: [('Swiss', 'a'), ('Bell', 'n')]
words: [('ringers', 'n')]
words: [('readers', 'n'), ('Mirror', 'n'), ('scarce', 'n'), ('need', 'n'), ('be', 'v'), ('told', 'v')]
words: [('most', 'a'), ('have', 'v'), ('seen', 'v'), ('heard', 'v')]
words: [('Swiss', 'n'), ('Bell', 'n')]
words: [('enter', 'n')]
words: [('number', 'n')]
words: [('white', 'a')]
words: [('fancifully', 'r'), ('costumed', 'v')]
words: [('armed', 'v'), ('hand', 'n')]
words: [('various', 'a'), ('sizes', 'n')]
words: [('deposit', 'v'), ('cushioned', 'v'), ('table', 'n')]
words: [('retaining', 'v'), ('hand', 'n')]
words: [('are', 'v'), ('continually', 'r'), ('changing', 'v'), ('others', 'n'), ('armory', 'n')]
words: [('putting', 'v'), ('taking', 'v'), ('rapidity', 'n'), ('jugglers', 'n')]
words: [('while', 'n'), ('ringing', 'v'), ('changes', 'n'), ('delicate', 'a'), ('harmony', 'n'), ('precision', 'n')]
words: [('are', 'v'), ('perfect', 'n'), ('symphony', 'n'), ('Haydn', 'n')]
words: [('Lucy', 'n'), ('Long', 'n')]
words: [(

In [17]:
sentences, vec = get_word_vector()
print(vec.shape, sentences)

this word does not exists in corpus: Swiss at ['Swiss', 'Bell']
this word does not exists in corpus: Bell at ['Swiss', 'Bell']
this word does not exists in corpus: Mirror at ['reader', 'Mirror', 'scarce', 'need', 'be', 'tell']
this word does not exists in corpus: Swiss at ['Swiss', 'Bell']
this word does not exists in corpus: Bell at ['Swiss', 'Bell']
this word does not exists in corpus: Haydn at ['be', 'perfect', 'symphony', 'Haydn']
this word does not exists in corpus: Lucy at ['Lucy', 'Long']
this word does not exists in corpus: Long at ['Lucy', 'Long']
this word does not exists in corpus: Bell at ['correct', 'erroneous', 'common', 'idea', 'Bell']
this word does not exists in corpus: Maelzel at ['contrive', 'principle', 'Maelzel']
this word does not exists in corpus: Automaton at ['Automaton', 'Trumpeter', 'Piano']
this word does not exists in corpus: Trumpeter at ['Automaton', 'Trumpeter', 'Piano']
this word does not exists in corpus: Piano at ['Automaton', 'Trumpeter', 'Piano']
th

In [18]:
vec

array([[-0.02519209,  0.10035963, -0.22098465, ...,  0.13799268,
         0.22382051,  0.06539109],
       [ 0.09067306, -0.02329026, -0.23875067, ...,  0.18957086,
        -0.15946393, -0.18525379],
       [-0.02001002,  0.22038059, -0.2904512 , ...,  0.17952715,
        -0.19910479, -0.00172993],
       ...,
       [-0.18813082, -0.0374893 , -0.26083215, ...,  0.16157034,
         0.10818712,  0.01412216],
       [-0.13694926, -0.23902047, -0.10387862, ...,  0.04191428,
         0.41530865, -0.25459206],
       [-0.13694926, -0.23902047, -0.10387862, ...,  0.04191428,
         0.41530865, -0.25459206]])

In [19]:
print(path.exists)

<function exists at 0x000002127F23D678>
