In [1]:
import pandas as pd
import numpy as np
import nltk
from os import path
import re
import libs as ft
from sklearn.metrics.pairwise import cosine_similarity
#from pyfasttext import FastText
#from gensim.models.wrappers import FastText
import fasttext as ft


import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Si\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Si\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from matplotlib import pyplot as plt

from mpl_toolkits.mplot3d import Axes3D
from pathlib import Path

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from pathlib import Path
from sklearn.metrics import (roc_curve, auc, accuracy_score)
from sklearn.preprocessing import StandardScaler

import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters


In [3]:
def cleansing(x, drop_tag, tag_pos, lemmatizer):
    """
    いらない品詞を除外し，レンマ化して返す．apply関数内で使用，

    Args:
        x (Series): apply関数で呼び出されるSeries
        drop_tag (list): いらない品詞リスト(nltk)
        tag_pos (dict): key -> tag, value -> pos. レンマ化の精度向上に使用．
        lemmatizer (nltk.stem.WordNetLemmatizer): lemmatizer

    Returns:
        (str): output sentence
    """
    words = [word for word in x['headline_text'].split(' ') if word != '']  # 空文字入るとエラーになる
    tags = nltk.pos_tag(words)  # 品詞を取得
    words = [(word, tag_pos[tag]) for word, tag in tags if tag not in drop_tag]  # いらない品詞を除外
    print('words:',words)
    words = [lemmatizer.lemmatize(word, pos=pos) for word, pos in words]
    sentence = ' '.join(words)  # 連結
    return sentence

In [4]:
def preprocess(data):
    """
    前処理の関数．

    Args:
        data (DataFrame): input dataset

    Retruns:
        (DataFrame): output dataset
    """
    # まずは，いらない品詞を落とし，レンマ化する．
    # その後，階層クラスタリングのときに使う用のcsvファイルとモデル学習用のtxtファイルを出力する．
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # いらない品詞
    drop_tag = ['$', 'CC', 'CD', 'DT', 'IN', 'MD', 'POS', 'PRP', 'PRP$', 'RP', 'TO' , 'WP', 'WRB','WDT','PDT']
    # 品詞とpos(lemma用)の変換辞書
    tag_pos = {'FW': 'n', 'JJ': 'a', 'JJR': 'a', 'JJS': 'a', 'NN': 'n', 'NNP': 'n', 'NNS': 'n', 'RB': 'r', 'RBR': 'r', 'VB': 'v',
               'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v', 'RBS': 'r',}

    #data = data.assign(preprocessed=data.apply(func=cleansing, axis=1, args=(drop_tag, tag_pos, lemmatizer,)))
    data = data.assign(preprocessed=data.apply(func=cleansing, axis=1, args=(drop_tag, tag_pos, lemmatizer)))

    print('after drop and lemmatization')
    print(data.head())
    data.to_csv('data.csv', sep='\t', index=False)
    #data['preprocessed'].to_csv('text.txt', index=False)
    data['preprocessed'].to_csv('text.txt', index=False)
    return data

In [5]:
def get_word_vector(data_name='text.txt', model_name='./pretrained_model/model.bin'):
    """
    fasttextベースで分散表現を取得する関数．これも見てわかると思うので引数は省略．

    Returns:
        (list of list): 単語リストのリスト．[['word_0_0', 'word_0_1'], ['word_1_0', 'word_1_1', 'word_1_2'], ...]みたいな
        (array): 分散表現 次元=(文章数×分散表現の次元数)
    """
    sentences = []
    with open(data_name, mode='r') as f:
        for line in f.readlines():
            line = re.sub('\n', '', line)
            sentences.append(line.split(' '))

    # modelが12GBくらいメモリを食うので終わったら開放する．
    vec_name =  'sentences_vc.npy'
    #if not path.exists(vec_name):
        #model = FastText.load_fasttext_format(model_name)
    model = ft.load_model(model_name)
    dim = model.get_dimension()
    sentences_vec = np.zeros((dim,))

    for words in sentences:
        vec = np.zeros((dim,))
        for word in words:
            if model.get_word_id(word) == -1:
                print('this word does not exists in corpus: %s at %s' % (word, words))
            vec = np.vstack((vec, model.get_word_vector(word)))
        vec = vec[1:, :].mean(axis=0)
        sentences_vec = np.vstack((sentences_vec, vec))
    sentences_vec = sentences_vec[1:, :]
    del model

    np.save(vec_name, sentences_vec)
    #else:
        #sentences_vec = np.load(vec_name)
    return sentences, sentences_vec

In [6]:
import re
from contextlib import redirect_stdout
from io import StringIO

example = 'Mary had a little lamb, Jack went up the hill, Jill followed suit, i woke up suddenly, it was a really bad dream...'

def token_to_sentence(str):
    f = StringIO()
    with redirect_stdout(f):
        regex_of_sentence = re.findall('([\w\s]{0,})[^\w\s]', str)
        regex_of_sentence = [x for x in regex_of_sentence if x is not '']
        for i in regex_of_sentence:
            print(i)
        first_step_to_sentence = (f.getvalue()).split('\n')
    g = StringIO()
    with redirect_stdout(g):
        for i in first_step_to_sentence:
            try:
                regex_to_clear_sentence = re.search('\s([\w\s]{0,})', i)
                print(regex_to_clear_sentence.group(1))
            except:
                print(i)
        sentence = (g.getvalue()).split('\n')
    return sentence

def token_to_words(str):
    f = StringIO()
    with redirect_stdout(f):
        for i in str:
            regex_of_word = re.findall('([\w]{0,})', i)
            regex_of_word = [x for x in regex_of_word if x is not '']
            for word in regex_of_word:
                print(regex_of_word)
        words = (f.getvalue()).split('\n')

In [7]:
#homePC
#path_0 = Path(r'C:\Users\SI\Python_ML\python_NLTK\NLTK_')
#officePC
path_0 = Path(r'C:\Users\Si\Desktop\python_all\python_NLTK\NLTK_')


author = ['Poe','Twain']
type_ = ['letter', 'story']
test = ['test']

choice = [0,1]

index_i = 0
index_j = 1
path_file = str(path_0) + str('/') + str(author[choice[index_i]]) + str('-') + str(type_[choice[index_j]])
#path = str(path_0) + str('/') + str(test[0])
print(path_file)

C:\Users\Si\Desktop\python_all\python_NLTK\NLTK_/Poe-story


In [8]:
path_file = Path(path_file)
path_file
path_file.glob('*.txt')
list(path_file.glob('*.txt'))
print('lists:', path_file.glob('*.txt'))
#list path
path_list = list(path_file.glob('*.txt'))
path_list

lists: <generator object Path.glob at 0x0000023AA92876C8>


[WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-001.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-002.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-003.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-004.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-005.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-006.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-007.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-008.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-009.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-010.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLT

In [9]:
data = pd.read_table(path_list[0], encoding='cp932')
data

Unnamed: 0,THE SWISS BELL-RINGERS
0,"One of the regular allies of the Mirror, a man..."
1,We have argued the point with him till we are ...
2,"While these documents are coming, we publish t..."
3,"The readers of the Mirror scarce need be told,..."
4,"The writer alludes to them now only to say, th..."
5,"For this reason, too, they arrange so carefull..."
6,Their very number shows that they were contriv...


In [10]:
data_list = data[data.columns[0]].values.tolist()
len(data_list),data_list

(7,
 ['One of the regular allies of the Mirror, a man of a very humorous critical vein, has taken it into his head to prove the Swiss Bell-ringers to be an automaton. ',
  'We have argued the point with him till we are tired, and have at last sent to beg a copy of their board-bill with affidavits that their stomachs are not wooden and do kindly entertain rolls and sausages.',
  "While these documents are coming, we publish the skeleton of our friend's hypothesis: The Swiss Bell-ringers. ",
  'The readers of the Mirror scarce need be told,  as most of them have seen and heard for themselves,  that the Swiss Bell-ringers enter, to the number of seven, white-plumed and fancifully costumed, and each armed with four or five hand-bells of various sizes, which they deposit on a cushioned table before them, retaining one in each hand, which they are continually changing for others in their armory, putting down and taking up with the rapidity of jugglers, and all the while ringing the changes u

In [11]:
# divided into long sentences
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

para = data_list[1]

punkt_params = PunktParameters()
punkt_params.abbrev_types = set(['Mr', 'Mrs', 'LLC','Miss'])
tokenizer = PunktSentenceTokenizer(punkt_params)
tokens = tokenizer.tokenize(para)

for t in tokens:
    print (t, "\n")
    
df_sentence = pd.DataFrame(tokens)
df_sentence = df_sentence.rename(columns={0: 'headline_text'})
df_sentence

We have argued the point with him till we are tired, and have at last sent to beg a copy of their board-bill with affidavits that their stomachs are not wooden and do kindly entertain rolls and sausages. 



Unnamed: 0,headline_text
0,We have argued the point with him till we are ...


In [12]:
df_sentence = df_sentence[:]
df_sentence

Unnamed: 0,headline_text
0,We have argued the point with him till we are ...


In [13]:
col = pd.DataFrame(df_sentence).columns[0]
col

'headline_text'

In [14]:
series = pd.DataFrame(df_sentence)['headline_text']
series, type(series), series.shape

(0    We have argued the point with him till we are ...
 Name: headline_text, dtype: object,
 pandas.core.series.Series,
 (1,))

In [15]:
df_sentence = preprocess(pd.DataFrame(series))
sentences, vec = get_word_vector()
print(vec.shape, sentences)

words: [('have', 'v'), ('argued', 'v'), ('point', 'n'), ('are', 'v'), ('tired,', 'a'), ('have', 'v'), ('last', 'a'), ('sent', 'n'), ('beg', 'v'), ('copy', 'n'), ('board-bill', 'n'), ('affidavits', 'n'), ('stomachs', 'n'), ('are', 'v'), ('not', 'r'), ('wooden', 'a'), ('do', 'v'), ('kindly', 'r'), ('entertain', 'v'), ('rolls', 'n'), ('sausages.', 'n')]
after drop and lemmatization
                                       headline_text  \
0  We have argued the point with him till we are ...   

                                        preprocessed  
0  have argue point be tired, have last sent beg ...  
this word does not exists in corpus: "have at ['"have', 'argue', 'point', 'be', 'tired,', 'have', 'last', 'sent', 'beg', 'copy', 'board-bill', 'affidavit', 'stomach', 'be', 'not', 'wooden', 'do', 'kindly', 'entertain', 'roll', 'sausages."']
this word does not exists in corpus: tired, at ['"have', 'argue', 'point', 'be', 'tired,', 'have', 'last', 'sent', 'beg', 'copy', 'board-bill', 'affidavit

In [16]:
print(vec.shape, vec)

(2, 300) [[-2.51920950e-02  1.00359626e-01 -2.20984653e-01  2.16720119e-01
  -3.75728428e-01 -1.36016786e-01  4.69669819e-01 -3.95488858e-01
   2.93971002e-01 -3.41169566e-01 -1.44786894e-01  7.74884075e-02
  -1.14528291e-01 -9.62964892e-02  2.69352525e-01 -5.01342714e-01
   1.66222557e-01  2.17696428e-01  1.19996667e-01  2.15265140e-01
   2.96215922e-01 -2.80547261e-01 -1.84577957e-01  1.52415022e-01
  -3.04059654e-01  1.11938834e-01  9.00849104e-02  6.92548826e-02
  -9.44551229e-02  1.00656733e-01  1.44979969e-01  5.78513265e-01
   2.69297175e-02  2.76256263e-01  5.39283529e-02 -7.12620094e-02
  -7.91453570e-02  3.90956819e-01  1.14514515e-01 -7.77416676e-02
   9.29599535e-03 -3.71775478e-01  7.99704939e-02 -3.70187163e-02
   9.73515736e-04 -3.15918356e-01  4.33003038e-01 -3.00064474e-01
  -4.21907753e-02  2.79742539e-01  1.89448118e-01 -5.66520572e-01
  -2.27706507e-01  7.20087960e-02  2.75513172e-01 -1.88710138e-01
   9.33695957e-02 -2.30479017e-01 -2.49538198e-01  1.21192001e-01
 

In [17]:
print(path.exists)

<function exists at 0x0000023A9EDDD678>


In [18]:
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

para = '''Either the well was very deep, or she fell very slowly, for she had plenty
of time as she went down to look about her and to wonder what was going to happen
next. First, she tried to look down and make out what she was coming to, but it was
too dark to see anything; then she looked at the sides of the well, and noticed
that they were filled with cupboards and book-shelves; here and there she saw maps
and pictures Mr...hung upon pegs. She took down a jar from one of the shelves as she
passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was
empty: she did not like to drop the jar for fear of killing somebody, so managed to
put it into one of the cupboards as she fell past it.'''

punkt_params = PunktParameters()
#punkt_params.abbrev_types = set(['Mr', 'Mrs', 'LLC'])
tokenizer = PunktSentenceTokenizer(punkt_params)
tokens = tokenizer.tokenize(para)

for t in tokens:
    print (t, "\n")

Either the well was very deep, or she fell very slowly, for she had plenty
of time as she went down to look about her and to wonder what was going to happen
next. 

First, she tried to look down and make out what she was coming to, but it was
too dark to see anything; then she looked at the sides of the well, and noticed
that they were filled with cupboards and book-shelves; here and there she saw maps
and pictures Mr...hung upon pegs. 

She took down a jar from one of the shelves as she
passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was
empty: she did not like to drop the jar for fear of killing somebody, so managed to
put it into one of the cupboards as she fell past it. 

