In [1]:
import pandas as pd
import numpy as np
import nltk
from os import path
import re
import libs as ft
from sklearn.metrics.pairwise import cosine_similarity
#from pyfasttext import FastText
#from gensim.models.wrappers import FastText
import fasttext as ft


import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Si\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Si\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
def cleansing(x, drop_tag, tag_pos, lemmatizer):
    """
    いらない品詞を除外し，レンマ化して返す．apply関数内で使用，

    Args:
        x (Series): apply関数で呼び出されるSeries
        drop_tag (list): いらない品詞リスト(nltk)
        tag_pos (dict): key -> tag, value -> pos. レンマ化の精度向上に使用．
        lemmatizer (nltk.stem.WordNetLemmatizer): lemmatizer

    Returns:
        (str): output sentence
    """
    words = [word for word in x['headline_text'].split(' ') if word != '']  # 空文字入るとエラーになる
    tags = nltk.pos_tag(words)  # 品詞を取得
    words = [(word, tag_pos[tag]) for word, tag in tags if tag not in drop_tag]  # いらない品詞を除外
    words = [lemmatizer.lemmatize(word, pos=pos) for word, pos in words]
    sentence = ' '.join(words)  # 連結
    return sentence


In [3]:
def preprocess(data):
    """
    前処理の関数．

    Args:
        data (DataFrame): input dataset

    Retruns:
        (DataFrame): output dataset
    """
    # まずは，いらない品詞を落とし，レンマ化する．
    # その後，階層クラスタリングのときに使う用のcsvファイルとモデル学習用のtxtファイルを出力する．
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # いらない品詞
    drop_tag = ['$', 'CC', 'CD', 'DT', 'IN', 'MD', 'POS', 'PRP', 'PRP$', 'RP', 'TO' , 'WP', 'WRB']
    # 品詞とpos(lemma用)の変換辞書
    tag_pos = {'FW': 'n', 'JJ': 'a', 'JJR': 'a', 'JJS': 'a', 'NN': 'n', 'NNP': 'n', 'NNS': 'n', 'RB': 'r', 'RBR': 'r', 'VB': 'v',
               'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v', 'RBS': 'r',}

    #data = data.assign(preprocessed=data.apply(func=cleansing, axis=1, args=(drop_tag, tag_pos, lemmatizer,)))
    data = data.assign(preprocessed=data.apply(func=cleansing, axis=1, args=(drop_tag, tag_pos, lemmatizer)))

    print('after drop and lemmatization')
    print(data.head())
    data.to_csv('data.csv', sep='\t', index=False)
    data['preprocessed'].to_csv('text.txt', index=False)
    return data

In [4]:
def get_word_vector(data_name='text.txt', model_name='./pretrained_model/model.bin'):
    """
    fasttextベースで分散表現を取得する関数．これも見てわかると思うので引数は省略．

    Returns:
        (list of list): 単語リストのリスト．[['word_0_0', 'word_0_1'], ['word_1_0', 'word_1_1', 'word_1_2'], ...]みたいな
        (array): 分散表現 次元=(文章数×分散表現の次元数)
    """
    sentences = []
    with open(data_name, mode='r') as f:
        for line in f.readlines():
            line = re.sub('\n', '', line)
            sentences.append(line.split(' '))

    # modelが12GBくらいメモリを食うので終わったら開放する．
    vec_name =  'sentences_vec.npy'
    if not path.exists(vec_name):
        #model = FastText.load_fasttext_format(model_name)
        model = ft.load_model(model_name)
        dim = model.get_dimension()
        sentences_vec = np.zeros((dim,))

        for words in sentences:
            vec = np.zeros((dim,))
            for word in words:
                if model.get_word_id(word) == -1:
                    print('this word does not exists in corpus: %s at %s' % (word, words))
                vec = np.vstack((vec, model.get_word_vector(word)))
            vec = vec[1:, :].mean(axis=0)
            sentences_vec = np.vstack((sentences_vec, vec))
        sentences_vec = sentences_vec[1:, :]
        del model

        np.save(vec_name, sentences_vec)
    else:
        sentences_vec = np.load(vec_name)
    return sentences, sentences_vec

In [5]:
if __name__ == '__main__':
    np.random.seed(123)
    # ランダムに50個の記事の題名を取得
    data = pd.read_csv('./abcnews-date-text.csv')
    rand_index = np.random.randint(0, data.shape[0], 500)
    data = data.iloc[rand_index, 1]
    print('raw data')
    print(data.head())

    # sent = ' '.join(list(data))
    # words = nltk.word_tokenize(sent)
    # tags = nltk.pos_tag(words)
    # tags = sorted(list(set([tag for word, tag in tags])))
    # for i in tags:
        # print(nltk.help.upenn_tagset(i))
    data = preprocess(pd.DataFrame(data))

    sentences, vec = get_word_vector()
    print(vec.shape)
    #get_similar_sentence(data.iloc[0, 1], data, sentences, vec, 5)

raw data
773630                   holman heroics save socceroos skins
277869        cultural background affects ones health report
28030                row brews over radiation therapy delays
1066306    liberal mp isobel redmond to retire at 2018 st...
194278       waste plan review to consider recycling options
Name: headline_text, dtype: object
after drop and lemmatization
                                             headline_text  \
773630                 holman heroics save socceroos skins   
277869      cultural background affects ones health report   
28030              row brews over radiation therapy delays   
1066306  liberal mp isobel redmond to retire at 2018 st...   
194278     waste plan review to consider recycling options   

                                            preprocessed  
773630                 holman heroic save socceroos skin  
277869      cultural background affect one health report  
28030                   row brew radiation therapy delay  
1066306  libe

In [6]:

df_vec = pd.DataFrame(vec)
df_vec

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.025192,0.100360,-0.220985,0.216720,-0.375728,-0.136017,0.469670,-0.395489,0.293971,-0.341170,...,0.104766,-0.089663,-0.246274,0.206871,0.549405,-0.171528,-0.151983,0.137993,0.223821,0.065391
1,-0.042216,0.007807,-0.323374,0.239274,-0.171238,-0.051610,-0.085143,0.053878,-0.024637,0.036788,...,-0.028024,-0.056563,0.112036,-0.002028,-0.073680,-0.021632,-0.097797,0.187835,0.106588,0.096950
2,-0.092368,-0.189523,-0.009720,0.183215,-0.029319,-0.023563,-0.049403,-0.176174,0.098500,0.145629,...,0.094414,0.133662,0.143568,-0.100534,0.055244,-0.067328,-0.073239,0.133813,0.188375,-0.091567
3,-0.069223,0.120941,-0.256029,0.106836,-0.199290,-0.129381,0.045415,-0.080313,0.204904,0.049175,...,0.124354,-0.031129,0.090552,-0.103008,-0.142635,-0.183100,0.087854,0.279785,0.337483,0.008798
4,-0.084492,-0.263232,-0.093723,0.007081,0.005129,-0.178735,0.145290,-0.264792,-0.215830,0.267448,...,0.167543,0.006900,0.185153,0.171885,0.002973,-0.154298,-0.139211,-0.134581,0.106265,0.065847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,-0.163800,-0.100213,0.032106,0.223514,-0.118627,-0.005642,0.128791,-0.300536,-0.192485,0.086437,...,0.082058,-0.009398,-0.024724,0.204276,-0.183083,-0.199598,-0.194754,-0.159399,0.075623,0.100542
497,-0.161778,0.095010,0.005817,-0.011847,-0.233536,0.143503,0.026152,-0.236101,0.162048,0.082901,...,0.236287,-0.030454,0.090485,-0.074390,0.018952,-0.056369,-0.178116,0.221473,0.064358,0.114536
498,0.020485,-0.073872,-0.113093,-0.011969,-0.097694,-0.017673,0.075138,-0.014310,-0.137561,0.114322,...,0.197980,-0.047467,0.002451,-0.082549,0.098858,-0.091278,-0.003021,-0.030298,0.227170,0.033974
499,-0.049444,-0.041516,-0.072914,0.204023,-0.148864,0.113568,0.015243,-0.066202,0.089106,0.113648,...,0.239481,-0.072251,0.137760,0.068023,-0.064657,0.112621,0.149638,0.336501,0.356723,0.115489


In [7]:
len(sentences), sentences

(501,
 [['preprocessed'],
  ['holman', 'heroic', 'save', 'socceroos', 'skin'],
  ['cultural', 'background', 'affect', 'one', 'health', 'report'],
  ['row', 'brew', 'radiation', 'therapy', 'delay'],
  ['liberal', 'mp', 'isobel', 'redmond', 'retire', 'state', 'election'],
  ['waste', 'plan', 'review', 'consider', 'recycle', 'option'],
  ['png', 'government', 'tactic', "'cowardly"],
  ['local', 'council', 'advocate', 'social', 'change'],
  ['climber', 'scale', 'trump', 'tower', 'suction', 'cap'],
  ['larry', 'nassars', 'huge', 'sentence', 'feel', 'victory', 'too'],
  ['security', 'camera', 'help', 'combat', 'stock', 'theft'],
  ['brazil', 'court', 'appeal', 'athens', 'marathon', 'gold'],
  ['lake', 'blue', 'green', 'algae', 'threat', 'remains'],
  ['custom', 'software', 'stay', 'ellison', 'say'],
  ['litchfield', 'take', 'drl', 'premiership'],
  ['bronco', 'go', 'bang', 'whimper'],
  ['geraldton', 'welcome', 'cctv', 'funding'],
  ['minister', 'reassure', 'qld', 'rail', 'wind', 'back'],
  