In [1]:
# Mount Google Drive at /content/gdrive so the competition folder under MyDrive
# (entered in a later cell) is reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip install -q tensorflow_text neologdn mojimoji emoji pandarallel

[K     |████████████████████████████████| 4.9 MB 8.5 MB/s 
[K     |████████████████████████████████| 57 kB 5.3 MB/s 
[K     |████████████████████████████████| 125 kB 68.4 MB/s 
[K     |████████████████████████████████| 170 kB 70.3 MB/s 
[?25h  Building wheel for neologdn (setup.py) ... [?25l[?25hdone
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Building wheel for pandarallel (setup.py) ... [?25l[?25hdone


In [3]:
# Work from the competition folder so the relative ./data, ./emoji and ./npy
# paths used in later cells resolve; then list its contents as a sanity check.
%cd /content/gdrive/MyDrive/competition/nishika/narou/
%ls

/content/gdrive/MyDrive/competition/nishika/narou
[0m[01;34mcatboost_info[0m/  [01;34memoji[0m/                    le.ipynb    [01;34mnpy[0m/
[01;34mdata[0m/           generate_universal.ipynb  main.ipynb  try_catboost.ipynb


In [4]:
import pandas as pd
import numpy as np
# Read the train and test tables from the data folder of the working directory.
train_csv_path, test_csv_path = "./data/train.csv", "./data/test.csv"
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [5]:
import tensorflow_hub as hub
import numpy as np
import tensorflow_text
from pandarallel import pandarallel
# Initialise pandarallel (the cell output reports the worker count).
# NOTE(review): no parallel_* call is visible in this notebook - confirm this is still needed.
pandarallel.initialize()
# for avoiding error
# Replaces the default HTTPS context factory with the unverified one, presumably to
# get past a certificate error while downloading the model - TODO confirm.
# NOTE(review): this disables certificate verification for every HTTPS request that
# uses the default context in this kernel, not only for the model download.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

# Load the multilingual Universal Sentence Encoder (version 3) from TF Hub;
# `embed` is called on batches of text in the embedding cells further down.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [15]:
# 前処理用
import re
import os
import pandas as pd
import numpy as np
import neologdn
import json
import emoji
import mojimoji
# Load the emoji lookup table used by clean_sentence. Each entry is looked up by
# emoji character and read for its "short_name" field.
emoji_json_path = "./emoji/emoji_ja.json"
# The context manager closes the file handle once parsing is done; the encoding is
# given explicitly so the result does not depend on the platform default.
with open(emoji_json_path, encoding="utf-8") as json_open:
    emoji_dict = json.load(json_open)

# Markup tags such as <br> or <a href=...>.
_TAG_PATTERN = re.compile(r"<[^>]*?>")
# http/https URLs.
_URL_PATTERN = re.compile(r"https?://[\w/:%#\$&\?\(\)~\.=\+\-]+")
# Symbols and punctuation that get blanked out. The trailing 。、♪ are placed inside
# the character class so each of them is matched on its own; with them after the
# closing bracket the pattern only matched a symbol followed by the literal "。、♪".
_SYMBOL_PATTERN = re.compile(
    r'[!"#$%&\'\\\\()*+,\-./:;<=>?@\[\]\^\_\`{|}~「」〔〕“”〈〉『』【】＆＊・（）＄＃＠？！｀＋￥％︰-＠。、♪]'
)
# Runs of half-width / full-width digits and Latin letters.
_ALNUM_PATTERN = re.compile(r"[0-9０-９a-zA-Zａ-ｚＡ-Ｚ]+")


def _emoji_lookup():
    """Return a container that supports `char in container` for emoji characters.

    Prefers ``emoji.UNICODE_EMOJI["en"]`` when the installed package still has it and
    falls back to ``emoji.EMOJI_DATA`` otherwise, because the package is installed
    without a version pin and newer releases no longer expose ``UNICODE_EMOJI``.
    """
    legacy_tables = getattr(emoji, "UNICODE_EMOJI", None)
    if legacy_tables is not None:
        return legacy_tables["en"]
    return emoji.EMOJI_DATA


def clean_sentence(sentence: str) -> str:
    """Normalise one text field before it is embedded.

    Steps, in order: cast to ``str``, drop markup tags, convert full-width characters
    to half-width (kana excluded), apply ``neologdn`` normalisation, drop URLs,
    replace symbols/punctuation with a space, replace runs of digits and Latin
    letters with a space, and finally replace each emoji character with "絵文字"
    followed by its ``short_name`` from ``emoji_dict`` (empty if unknown).

    Args:
        sentence: Raw text. Any value is accepted because it is passed through ``str()``.

    Returns:
        The cleaned text.
    """
    sentence = str(sentence)
    sentence = _TAG_PATTERN.sub("", sentence)
    sentence = mojimoji.zen_to_han(sentence, kana=False)
    sentence = neologdn.normalize(sentence)
    # URLs are removed before the symbol pass: once ':', '/' and '.' are blanked out
    # the URL pattern can no longer match.
    sentence = _URL_PATTERN.sub("", sentence)
    sentence = _SYMBOL_PATTERN.sub(" ", sentence)
    sentence = _ALNUM_PATTERN.sub(" ", sentence)

    emoji_chars = _emoji_lookup()
    no_entry = {"short_name": ""}
    return "".join(
        "絵文字" + emoji_dict.get(ch, no_entry).get("short_name", "")
        if ch in emoji_chars
        else ch
        for ch in sentence
    )

In [16]:
print("start clean_sentence")

# Clean every text column of both tables; the cleaned text overwrites the raw column.
text_col = ["title", "story", "keyword"]
for frame in (train, test):
    for t_col_name in text_col:
        cleaned = frame[t_col_name].map(clean_sentence)
        frame[t_col_name] = cleaned
print("Done")

start clean_sentence
Done


In [None]:
# from tqdm.notebook import tqdm
# tqdm.pandas()

# for text_col_name in tqdm(text_col):
#     train_text_list = train[text_col_name]
#     test_text_list = test[text_col_name]
#     train_embed = embed(train_text_list)
#     test_embed = embed(test_text_list)
#     np.save(f'./npy/train_{text_col_name}_universal', np.stack(train_embed))
#     np.save(f'./npy/test_{text_col_name}universal', np.stack(test_embed))

## バッチ処理を試す

### story

In [39]:
# Embedding uses a lot of memory, so as a precaution the rows are encoded in batches.
df = pd.concat([train, test])
text_col_name = "story"
stories = df[text_col_name]
nrow_one_loop = 1000

# Encode consecutive positional slices of `nrow_one_loop` rows; the last slice may be shorter.
story_pos_dfs = [
    pd.DataFrame(embed(stories.iloc[min_idx:min_idx + nrow_one_loop]).numpy())
    for min_idx in range(0, len(stories), nrow_one_loop)
]

# ignore_index gives the result a single unique index instead of 0..999 repeated per batch.
story_df = pd.concat(story_pos_dfs, ignore_index=True)
del story_pos_dfs

# Prefix the embedding columns with the source column name. They were previously
# labelled "title_*" here, which collides with the title embeddings.
# NOTE(review): anything downstream that reads these columns by name must use "story_*".
story_df = story_df.add_prefix(f"{text_col_name}_")
print(story_df.shape)

(48522, 512)


In [45]:
# Split the stacked embeddings back into train and test. `story_df` was built from
# pd.concat([train, test]), so the first len(train) rows are the train rows.
n_train = len(train)
# reset_index gives each saved frame its own unique 0..n-1 index.
story_train = story_df.iloc[:n_train].reset_index(drop=True)
story_test = story_df.iloc[n_train:].reset_index(drop=True)
print(story_train.shape)
print(story_test.shape)
story_train.to_pickle("./npy/univ_story_train.pkl")
story_test.to_pickle("./npy/univ_story_test.pkl")

(40000, 512)
(8522, 512)


### keyword

In [46]:
# Embedding uses a lot of memory, so as a precaution the rows are encoded in batches.
df = pd.concat([train, test])
text_col_name = "keyword"
keywords = df[text_col_name]
nrow_one_loop = 1000

# Encode consecutive positional slices of `nrow_one_loop` rows; the last slice may be shorter.
keyword_pos_dfs = [
    pd.DataFrame(embed(keywords.iloc[min_idx:min_idx + nrow_one_loop]).numpy())
    for min_idx in range(0, len(keywords), nrow_one_loop)
]

# ignore_index gives the result a single unique index instead of 0..999 repeated per batch.
keyword_df = pd.concat(keyword_pos_dfs, ignore_index=True)
del keyword_pos_dfs

# Prefix the embedding columns with the source column name. They were previously
# labelled "title_*" here, which collides with the title embeddings.
# NOTE(review): anything downstream that reads these columns by name must use "keyword_*".
keyword_df = keyword_df.add_prefix(f"{text_col_name}_")
print(keyword_df.shape)

# `keyword_df` follows the row order of pd.concat([train, test]), so the first
# len(train) rows are the train rows. reset_index gives each saved frame its own
# unique 0..n-1 index.
n_train = len(train)
keyword_train = keyword_df.iloc[:n_train].reset_index(drop=True)
keyword_test = keyword_df.iloc[n_train:].reset_index(drop=True)
print(keyword_train.shape)
print(keyword_test.shape)
keyword_train.to_pickle("./npy/univ_keyword_train.pkl")
keyword_test.to_pickle("./npy/univ_keyword_test.pkl")

(48522, 512)
(40000, 512)
(8522, 512)


### title

In [47]:
# Embedding uses a lot of memory, so as a precaution the rows are encoded in batches.
df = pd.concat([train, test])
text_col_name = "title"
titles = df[text_col_name]
nrow_one_loop = 1000

# Encode consecutive positional slices of `nrow_one_loop` rows; the last slice may be shorter.
title_pos_dfs = [
    pd.DataFrame(embed(titles.iloc[min_idx:min_idx + nrow_one_loop]).numpy())
    for min_idx in range(0, len(titles), nrow_one_loop)
]

# ignore_index gives the result a single unique index instead of 0..999 repeated per batch.
title_df = pd.concat(title_pos_dfs, ignore_index=True)
del title_pos_dfs

# Prefix the embedding columns with the source column name ("title_0", "title_1", ...)
# in one call instead of renaming one column at a time.
title_df = title_df.add_prefix(f"{text_col_name}_")
print(title_df.shape)

# `title_df` follows the row order of pd.concat([train, test]), so the first
# len(train) rows are the train rows. reset_index gives each saved frame its own
# unique 0..n-1 index.
n_train = len(train)
title_train = title_df.iloc[:n_train].reset_index(drop=True)
title_test = title_df.iloc[n_train:].reset_index(drop=True)
print(title_train.shape)
print(title_test.shape)
title_train.to_pickle("./npy/univ_title_train.pkl")
title_test.to_pickle("./npy/univ_title_test.pkl")

(48522, 512)
(40000, 512)
(8522, 512)
