In [22]:
# lightgbmを試す。
import datetime
import os
import re
import sys
from glob import glob

import numpy as np
import pandas as pd
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from tqdm import tqdm
# Kept after the plain tqdm import (as in the original order) so that the name
# `tqdm` ends up bound to the notebook progress bar.
from tqdm.notebook import tqdm
tqdm.pandas()

### dataのロード

In [23]:
# Read the train, test and sample-submission CSV files from ../data.
df_train, df_test, sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train.csv',
        '../data/test.csv',
        '../data/sample_submission.csv',
    )
)

### ncodeを数値に置き換える

In [24]:
def processing_ncode(input_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``input_df`` with an integer ``ncode_num`` column added.

    Each ``ncode`` string is split positionally: character 0 is ignored,
    characters 1-4 are parsed as a decimal number, and everything from
    character 5 onwards is treated as a suffix of letters A-Z.

    The suffix is ranked in spreadsheet-column order (A, B, ..., Z, AA, AB,
    ...), with "A" ranked 0, and the result is

        ncode_num = number + 9999 * suffix_rank

    The previous implementation read the suffix as plain base 26 with A=0,
    so a leading "A" contributed nothing: "AA" encoded the same as "A",
    "AB" the same as "B", and so on, which produced colliding values.
    Single-letter suffixes yield the same value as before; only multi-letter
    suffixes change.

    Parameters
    ----------
    input_df : pd.DataFrame
        Frame with a string column ``ncode``. It is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``input_df`` with the extra column ``ncode_num``.

    Raises
    ------
    KeyError
        If a suffix contains a character outside A-Z (after upper-casing).
    ValueError
        If characters 1-4 of an ncode are not a decimal number.
    """
    # Multiplier applied per suffix step (unchanged from the original code).
    codes_per_suffix = 9999

    def _suffix_rank(suffix: str) -> int:
        """Zero-based rank of a letter suffix in A..Z, AA..AZ, BA.. order."""
        rank = 0
        for letter in suffix.upper():
            digit = ord(letter) - ord('A') + 1  # bijective base 26: A=1 .. Z=26
            if not 1 <= digit <= 26:
                # Same exception type the original dict lookup raised.
                raise KeyError(letter)
            rank = rank * 26 + digit
        # Shift so that "A" -> 0, keeping single-letter results unchanged.
        # NOTE(review): an empty suffix therefore ranks -1 (before "A");
        # confirm whether suffix-less ncodes can occur in the data.
        return rank - 1

    ncode = input_df['ncode']
    number_part = ncode.map(lambda code: code[1:5]).astype(int)
    suffix_part = ncode.map(lambda code: _suffix_rank(code[5:]))

    output_df = input_df.copy()
    output_df['ncode_num'] = number_part + suffix_part * codes_per_suffix
    return output_df

# Add ncode_num to both splits, then keep only the integer-typed columns as the
# tabular feature block. The full df_train / df_test frames stay available.
df_train, df_test = (processing_ncode(frame) for frame in (df_train, df_test))
df_train_num, df_test_num = (
    frame.select_dtypes("int") for frame in (df_train, df_test)
)


In [25]:
print("download .npy file")
# Load the saved RoBERTa vector arrays for titles and stories of both splits,
# in the order train title, train story, test title, test story.
train_title, train_story, test_title, test_story = (
    np.load(npy_path)
    for npy_path in (
        "../npy/train_title_roberta.npy",
        "../npy/train_story_roberta.npy",
        "../npy/test_title_roberta.npy",
        "../npy/test_story_roberta.npy",
    )
)

download .npy file


In [26]:
## RoBERTa vectors: stack train+test, (optionally) standardise, split back
# NOTE: no PCA is performed in this cell, and the column-wise standardisation
# below is currently disabled, so the concatenate/split round trip leaves the
# arrays unchanged.

# Split point taken from the data instead of a hard-coded row count.
n_train_roberta = len(train_title)
assert len(train_story) == n_train_roberta, "title/story train arrays differ in length"

title = np.concatenate([train_title, test_title])
story = np.concatenate([train_story, test_story])

# title = pd.DataFrame(title).progress_apply(lambda x: (x-x.mean())/x.std(), axis=0)
# story = pd.DataFrame(story).progress_apply(lambda x: (x-x.mean())/x.std(), axis=0)

train_title = title[:n_train_roberta]
train_story = story[:n_train_roberta]

test_title = title[n_train_roberta:]
test_story = story[n_train_roberta:]

# add_prefix turns the default integer column labels 0..N-1 into
# "title_0".."title_N-1" / "story_0".."story_N-1" in a single pass, replacing
# one full-frame rename per column.
train_title_df = pd.DataFrame(train_title).add_prefix("title_")
train_story_df = pd.DataFrame(train_story).add_prefix("story_")
test_title_df = pd.DataFrame(test_title).add_prefix("title_")
test_story_df = pd.DataFrame(test_story).add_prefix("story_")


In [27]:
# Sanity check: row/column counts of the RoBERTa title frames, one per line.
print(train_title_df.shape, test_title_df.shape, sep="\n")

(40000, 768)
(8522, 768)


### Universal Sentence Encoderのロード

In [28]:
# Load the Universal Sentence Encoder title vectors for both splits.
train_title_univ = np.load("../npy/train_title_universal.npy")
test_title_univ = np.load("../npy/test_title_universal.npy")

# Split point taken from the data instead of a hard-coded row count.
n_train_univ = len(train_title_univ)

# Standardise every column using statistics computed over train+test together.
title_univ = np.concatenate([train_title_univ, test_title_univ])
title_univ = pd.DataFrame(title_univ).progress_apply(lambda x: (x-x.mean())/x.std(), axis=0)

train_title_univ = title_univ.iloc[:n_train_univ]
# The test slice would otherwise keep index labels n_train_univ.., which breaks
# index-aligned column-wise concat with frames indexed from 0; reset it here.
test_title_univ = title_univ.iloc[n_train_univ:].reset_index(drop=True)

train_title_univ_df = pd.DataFrame(train_title_univ)
test_title_univ_df = pd.DataFrame(test_title_univ)

  0%|          | 0/512 [00:00<?, ?it/s]

In [29]:
# Sanity check: row/column counts of the Universal Sentence Encoder frames.
print(train_title_univ_df.shape, test_title_univ_df.shape, sep="\n")
# Display only: reset_index returns a new frame and the result is not assigned,
# so test_title_univ_df itself is left unchanged by this line.
test_title_univ_df.reset_index(drop=True)

(40000, 512)
(8522, 512)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,-0.553802,1.179810,-0.309277,-1.192671,-1.702170,-0.560264,0.561032,0.791042,-0.388955,0.508979,...,0.014706,0.895717,0.941399,-0.956885,-0.412435,1.081936,-0.383723,0.637830,1.389404,-0.729709
1,-1.132025,1.277811,1.222784,-1.884186,0.976374,1.266295,-0.887914,0.341196,2.407060,-0.802979,...,0.311123,-1.447018,-2.413758,-0.576763,1.202906,1.380336,-0.493012,-1.203174,-1.263398,1.325745
2,1.183090,-0.219122,-1.421651,-1.695190,-0.322318,0.665059,-1.972587,0.143928,-1.196680,-2.229129,...,-0.782485,-0.230852,0.940889,-0.307891,-2.273559,0.650978,-1.752863,0.776857,0.002681,-0.478403
3,-0.840367,-0.179125,-1.096395,1.042632,1.236529,-1.337485,-1.888302,-0.341608,-0.499339,-0.099666,...,1.319402,-0.544030,0.811962,-1.491224,0.317093,-0.024473,-2.033506,0.843222,-0.778619,-2.595971
4,-0.647651,0.582763,0.033605,1.262320,1.386068,1.370153,-1.153296,0.967934,-0.576824,-0.410353,...,-0.277413,-1.560215,-0.030397,0.650446,-1.039097,2.135223,-0.654983,-0.538560,-1.040894,0.189927
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8517,0.932830,0.831015,0.928344,0.611031,0.391854,1.162015,-2.456127,-2.093475,0.057378,-0.385820,...,0.209197,0.570729,-0.321338,-0.639791,1.430543,0.603228,-1.976362,0.259932,-1.599483,-0.421210
8518,-1.099986,-0.238371,0.211170,0.747204,-0.428654,0.863886,-1.532231,0.270991,0.572865,0.331378,...,-0.592458,0.786496,0.766292,1.575710,0.052530,-1.331911,-1.370003,1.096327,1.379221,-1.319634
8519,-0.025099,-1.460381,0.250847,0.888142,-1.038723,0.666857,0.875960,-1.609395,0.283658,0.708095,...,0.838663,0.589566,1.316377,-0.037597,0.168613,-0.972817,0.081764,1.008672,1.754141,-0.969855
8520,-1.294665,-1.243802,0.541097,-0.890208,1.113529,-0.643970,0.369844,0.844348,-0.006014,-0.071375,...,0.097320,-1.815799,0.230869,1.553532,1.550001,-1.291998,-0.830508,-0.576465,1.606731,-1.913846


In [30]:
# Preview of the column-wise combined frames. Neither result is assigned, and
# a notebook cell only displays its last expression, so the train-side frame
# built on the next line is computed and then discarded.
pd.concat([df_train_num, train_title_df, train_title_univ_df, train_story_df, df_train[["general_firstup"]]], axis=1)
# Test-side preview (this is what the cell displays). Note that
# test_title_univ_df is not included in this concat.
pd.concat([df_test_num, test_title_df, test_story_df], axis=1)

Unnamed: 0,userid,biggenre,genre,novel_type,end,isstop,isr15,isbl,isgl,iszankoku,...,story_758,story_759,story_760,story_761,story_762,story_763,story_764,story_765,story_766,story_767
0,952106,99,9903,2,0,0,0,0,0,0,...,0.057961,-0.023981,-0.080726,-0.151985,-0.023490,-0.032732,-0.015186,-0.406403,-0.003529,-0.261903
1,2197141,1,101,2,0,0,1,0,0,0,...,-0.072030,-0.002367,-0.135964,-0.012726,-0.162319,-0.002631,-0.091109,-0.223682,-0.027789,0.189134
2,2235083,3,302,2,0,0,0,0,0,0,...,0.041886,0.305208,0.076169,0.087082,0.173636,-0.018682,-0.096197,0.043770,0.037074,-0.257967
3,2236089,1,101,1,0,0,0,0,0,0,...,0.060052,0.085969,-0.056349,0.075030,-0.138266,-0.006569,0.235899,-0.077933,0.060647,-0.044415
4,414104,99,9902,2,0,0,0,0,0,0,...,-0.076148,0.033186,-0.314517,0.429918,-0.130872,0.014466,-0.052084,-0.123684,-0.193658,-0.167783
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8517,2263100,99,9903,2,0,0,0,0,0,0,...,-0.081886,-0.076671,-0.005422,-0.040474,0.104257,-0.080240,0.161166,-0.242211,0.207153,-0.419711
8518,2254651,3,305,2,0,0,0,0,0,0,...,0.002651,-0.020552,-0.058386,0.018911,-0.112786,0.007172,0.151786,-0.133407,0.289686,-0.164395
8519,846989,3,302,2,0,0,0,0,0,0,...,-0.095822,0.151296,-0.187615,0.132005,-0.137551,0.034727,0.232804,-0.069443,0.001989,-0.123284
8520,2259847,2,202,2,0,0,0,0,0,0,...,-0.089301,-0.076599,0.160878,0.041910,0.336340,0.025795,-0.096498,-0.501368,0.123389,-0.182003


In [31]:
## Combine all feature blocks column-wise
# NOTE: this cell overwrites df_train (and drops general_firstup), so it cannot
# be re-run on its own; re-run the cells that build df_train first.
df_train = pd.concat([df_train_num, train_title_df, train_title_univ_df, train_story_df, df_train[["general_firstup"]]], axis=1)
df_test = pd.concat([df_test_num, test_title_df, test_title_univ_df.reset_index(drop=True), test_story_df], axis=1)

# axis=1 concat aligns on index labels; mismatched indexes would silently add
# NaN-padded rows, so check that the row counts are unchanged.
assert len(df_train) == len(df_train_num), "train feature blocks are not index-aligned"
assert len(df_test) == len(df_test_num), "test feature blocks are not index-aligned"

## Restrict the training period
# Rows are kept only when their calendar date is strictly after this date
# (the boundary day itself is excluded, as in the original comparison).
TRAIN_AFTER_DATE = pd.Timestamp(2020, 1, 1)
first_upload = pd.to_datetime(df_train["general_firstup"], format="%Y-%m-%d %H:%M:%S")
is_recent = first_upload.dt.normalize() > TRAIN_AFTER_DATE
df_train = df_train[is_recent].drop(columns=["general_firstup"])
print(df_train.shape)
print(df_test.shape)

(28837, 2063)
(8522, 2062)


## 作成したデータを保存する

In [32]:

# Write the assembled train/test frames to ./data as pickle files.
# (`os` is imported in the notebook's top import cell.)
os.makedirs("./data", exist_ok=True)

print(df_train.shape)
print(df_test.shape)

df_train.to_pickle("./data/train.pkl")
df_test.to_pickle("./data/test.pkl")


(28837, 2063)
(8522, 2062)
