In [1]:
import pandas as pd
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
df = pd.concat([train, test])

In [2]:
train_ = df.iloc[:40000]
test_ = df.iloc[40000:]
print(train_.shape)
print(test_.shape)

(40000, 20)
(8522, 20)


In [6]:
train_story_tfidf = pd.read_pickle("../data/train_tfidf.pkl").reset_index(drop=True)
test_story_tfidf = pd.read_pickle("../data/test_tfidf.pkl").reset_index(drop=True)
print(train_story_tfidf.shape)

(120000, 1)


# TF-IDFを試す。

In [3]:
# 前処理用
import re
import os
import pandas as pd
import numpy as np
import emoji
import spacy
import neologdn
import json

# 分かち書き用
import ginza
import ja_ginza_electra
# pandas高速化
from pandarallel import pandarallel
pandarallel.initialize()
import json
import os
import emoji
import mojimoji
import neologdn
from tqdm.auto import tqdm
tqdm.pandas()

print("Load data!")
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Load data!


In [46]:
l = []
for i in train["story"]:
    if type(i) != str:
        print(i)
        l.append(type(i))
    else:
        pass
l

[]

In [24]:
emoji_json_path = "../emoji/emoji_ja.json"
json_open = open(emoji_json_path)
emoji_dict = json.load(json_open)

def wakati_rm_func(x):
    sentence = str(x)
    nlp = spacy.load('ja_ginza_electra')
    sentence = re.sub(r'[!-~]'," ",sentence) # 小文字の記号を削除
    sentence=re.sub(r'[︰-＠]', "", sentence) # 大文字の記号を削除


    # 不要記号削除
    pattern = '[!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〔〕“”◇ᴗ●↓→♪★⊂⊃※△□◎〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％�]'
    sentence =  re.sub(pattern, ' ', sentence)

    # 正規化する
    sentence = neologdn.normalize(sentence)

    # 大文字・小文字変換
    sentence = sentence.replace("\n", "")
    sentence = re.sub(r"https?://[\w/:%#\$&\?\(\)~\.=\+\-]+", "", sentence)
    # 絵文字削除
    sentence = "".join(
        [
            "絵文字" + emoji_dict.get(c, {"short_name": ""}).get("short_name", "")
            if c in emoji.UNICODE_EMOJI["en"]
            else c
            for c in sentence
        ]
    )
    # GinZaで分かち書きをする
    doc = nlp(sentence)
    tmp_words_list = []
    for sent in doc.sents:
        for token in sent:
            if token.pos_ in ["PROPN", "NOUN", "ADJ", "VERB"]:
                tmp_words_list.append(token.orth_)

    result = " ".join(tmp_words_list)
    return result

In [25]:
print(wakati_rm_func("今日も高木さんは可愛いね"))
print(type("今日も高木さんは可愛いね"))

高木 さん 可愛い
<class 'str'>


In [26]:
df = pd.concat([train, test])
# trainに分かち書きを実行する
df["title_wakati"] = df[["title"]].parallel_apply(wakati_rm_func)
df["story_wakati"] = df[["story"]].parallel_apply(wakati_rm_func)
df["keyword_wakati"] = df[["keyword"]].parallel_apply(wakati_rm_func)


In [None]:
train = df.iloc[:40000]
test = df.iloc[40000:]
print(df.shape)

str

In [None]:
print(train.shape)
print(test.shape)

(40000, 20)
(8522, 19)


In [None]:
train_["story_wakati"][:5]

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
39995   NaN
39996   NaN
39997   NaN
39998   NaN
39999   NaN
Name: story_wakati, Length: 40000, dtype: float64

In [None]:
# TI-IDFを計算する
from sklearn.feature_extraction.text import TfidfVectorizer
model = TfidfVectorizer()
X = model.fit_transform(train["story_wakati"])
story_tfidf_train = pd.DataFrame(data= X.toarray(), columns = model.get_feature_names())

model = TfidfVectorizer()
X = model.fit_transform(train["title_wakati"])
title_tfidf_train = pd.DataFrame(data= X.toarray(), columns = model.get_feature_names())

model = TfidfVectorizer()
X = model.fit_transform(train["keyword_wakati"])
keyword_tfidf_train = pd.DataFrame(data= X.toarray(), columns = model.get_feature_names())

pd.concat([story_tfidf_train, title_tfidf_train, keyword_tfidf_train]).to_pickle("data/train_tfidf.pkl")

Unnamed: 0,欠損
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
...,...
39995,1.0
39996,1.0
39997,1.0
39998,1.0


In [None]:
model = TfidfVectorizer()
X = model.fit_transform(test["story_wakati"])
story_tfidf_test = pd.DataFrame(data= X.toarray(), columns = model.get_feature_names())

model = TfidfVectorizer()
X = model.fit_transform(test["title_wakati"])
title_tfidf_test = pd.DataFrame(data= X.toarray(), columns = model.get_feature_names())

model = TfidfVectorizer()
X = model.fit_transform(test["keyword_wakati"])
keyword_tfidf_test = pd.DataFrame(data= X.toarray(), columns = model.get_feature_names())
print(story_tfidf_test.shape)
pd.concat([story_tfidf_test, title_tfidf_test, keyword_tfidf_test]).to_pickle("data/test_tfidf.pkl")