# Pre-steps before ML

In [4]:
# Restore configuration values from IPython's %store (values persisted by another
# notebook/kernel session). How each one is used further down in this notebook:
#   DF_INPUT_FILE                  - pickle file that is loaded into `df`
#   EMBEDDING_DIM                  - number of columns of the embedding matrix
#   OOV_TOKEN                      - passed to the Keras Tokenizer as `oov_token`
#   WORD2VEC_BIN_FILE              - word2vec file loaded with binary=True
#   TOKENIZER_FILE                 - output path for the pickled tokenizer
#   NO_EXIST_WORDS_FILE            - output CSV of words without a word2vec vector
#   NO_EXIST_WORDS_COUNT_STAT_FILE - output CSV of those words grouped by count
#   WORD2VEC_MATRIX_FILE           - output path for the pickled embedding matrix
%store -r DF_INPUT_FILE
%store -r EMBEDDING_DIM
%store -r OOV_TOKEN
%store -r WORD2VEC_BIN_FILE
%store -r TOKENIZER_FILE
%store -r NO_EXIST_WORDS_FILE
%store -r NO_EXIST_WORDS_COUNT_STAT_FILE
%store -r WORD2VEC_MATRIX_FILE


In [5]:
import pickle

# Deserialize the object stored at DF_INPUT_FILE into `df`.
# NOTE(review): pickle.load executes arbitrary code from the file - only use
# this on files produced by a trusted pipeline.
with open(DF_INPUT_FILE, mode="rb") as pickled_input:
    df = pickle.load(pickled_input)


In [6]:
from keras.preprocessing.text import Tokenizer
from copy import deepcopy
import json

# Corpus for the tokenizer: every value of the 'gs' column of `df`.
all_sentence_gs = df['gs'].values

# num_words=None puts no cap on the vocabulary size; OOV_TOKEN is handed to
# the tokenizer as its out-of-vocabulary token.
tokenizer = Tokenizer(oov_token=OOV_TOKEN, num_words=None)

# Snapshot of the word index as it is *before* fitting.
# NOTE(review): for a freshly constructed Tokenizer this is presumably empty,
# which would make the merge below a plain copy - confirm.
old_word_index = deepcopy(tokenizer.word_index)

tokenizer.fit_on_texts(all_sentence_gs)

# Merge: words already in the snapshot keep their index; every word the fit
# introduced is appended with the next free index (current size + 1).
for fitted_word in tokenizer.word_index:
    if fitted_word in old_word_index:
        continue
    old_word_index[fitted_word] = 1 + len(old_word_index)

# The merged mapping becomes the tokenizer's word index and is also exposed
# under the shorter name `word_index` for the following cells.
word_index = tokenizer.word_index = old_word_index

# fetch counts for all words except oov_token
# (the tokenizer config carries 'word_counts' as a JSON string)
tokenizer_config = tokenizer.get_config()
words_count = json.loads(tokenizer_config['word_counts'])


In [7]:
import numpy as np
from gensim.models import KeyedVectors
from gensim.test.utils import datapath

# Build the embedding matrix: row `idx` receives the word2vec vector of the
# word whose index in `word_index` is `idx`. Rows that are never assigned
# remain all-zero.
vocab_size = len(word_index) + 1  # +1 because index 0 is reserved
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))

# NOTE(review): gensim's datapath() resolves file names against gensim's own
# bundled test-data directory. Presumably WORD2VEC_BIN_FILE is an absolute
# path, in which case the call is a pass-through - confirm.
word2vec_matrix = KeyedVectors.load_word2vec_format(
    datapath(WORD2VEC_BIN_FILE), binary=True
)

# word -> corpus count, for every word that has no vector in word2vec_matrix
no_exist_words = {}

for word, idx in word_index.items():
    try:
        embedding_matrix[idx] = word2vec_matrix[word]
    except KeyError:
        # No pretrained vector for this word: its row stays zero. The OOV
        # token is skipped because it has no entry in `words_count`.
        if word == OOV_TOKEN:
            continue
        no_exist_words[word] = words_count[word]
    except Exception as e:
        # Any other failure (e.g. a vector whose length does not match
        # EMBEDDING_DIM) would leave the matrix only partially filled.
        # Re-raise so the notebook stops here instead of silently carrying
        # on and persisting an incomplete matrix.
        print(f"halt due to fatal error:{str(e)}")
        raise


In [8]:
# Some simple statistics on the words collected in `no_exist_words`
import pandas as pd

# One row per missing word: 'term' is the word, 'c' is its count.
df_no_exist_words = pd.DataFrame.from_records(
    list(no_exist_words.items()),
    columns=['term', 'c']
)

print(f"df_no_exist_words.shape = {df_no_exist_words.shape}")
print(f"df_no_exist_words.describe: {df_no_exist_words.describe()}")
# numeric_only=True restricts the quantile to the numeric 'c' column. The frame
# also holds the string column 'term', and pandas >= 2.0 no longer drops
# non-numeric columns by default, so the bare call would raise there.
print(f"95% quantile at df_no_exist_words: {df_no_exist_words.quantile(0.95, numeric_only=True)}")
print(f"5% of df_no_exist_words = {df_no_exist_words.shape[0] * 0.05}")



df_no_exist_words.shape = (30968, 2)
df_no_exist_words.describe:                   c
count  30968.000000
mean       6.371351
std       47.744301
min        1.000000
25%        1.000000
50%        1.000000
75%        3.000000
max     3568.000000
95% quantile at df_no_exist_words: c    18.0
Name: 0.95, dtype: float64
5% of df_no_exist_words = 1548.4


In [10]:
%matplotlib inline
df_no_exist_words.c.to_frame("count").plot.box()

TypeError: 'method' object is not subscriptable

In [9]:
import os
import pickle
from pathlib import Path

# Write out the words that were not found in the word2vec vocabulary
df_no_exist_words.to_csv(NO_EXIST_WORDS_FILE)
# ... and, for each distinct count value, how many such words have that count
df_no_exist_words.groupby("c").count().to_csv(NO_EXIST_WORDS_COUNT_STAT_FILE)


# dump the tokenizer
# (context managers guarantee the files are flushed and closed, even if
# pickling fails part-way)
with open(TOKENIZER_FILE, "wb") as tokenizer_out:
    pickle.dump(tokenizer, tokenizer_out)
# dump the embedding matrix
with open(WORD2VEC_MATRIX_FILE, "wb") as matrix_out:
    pickle.dump(embedding_matrix, matrix_out)