In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Import the json module
import json
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.parsing.preprocessing import remove_stopwords
import nltk
from nltk.stem import SnowballStemmer
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# English Snowball stemmer instance; no active code in the cells shown here calls it.
snowball = SnowballStemmer('english')

In [None]:
# Load the competition CSV files: the training set first, then the test set.
train_df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

In [None]:
def text_cleaning(text):
    """Normalise raw text to lowercase ASCII alphanumeric tokens.

    The input is coerced to ``str`` and lower-cased; every run of characters
    other than ``A-Z``, ``a-z`` and ``0-9`` (punctuation, whitespace,
    non-ASCII letters, ...) is replaced by a single space, and leading and
    trailing spaces are stripped.

    Args:
        text: Value to clean. Non-string values (for example numbers or
            ``None``) are converted with ``str()`` before cleaning.

    Returns:
        str: The cleaned text, possibly empty.
    """
    # Coerce before any processing: iterating a non-string value directly
    # would raise TypeError. A separate punctuation pass is unnecessary
    # because the pattern below already treats punctuation as a separator.
    return re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()

In [None]:
# Clean the excerpt column of both frames, overwriting the raw text.
for frame in (train_df, test_df):
    frame['excerpt'] = frame['excerpt'].apply(text_cleaning)

In [None]:
def create_taggedDocument_from_text(row):
    """Build a gensim ``TaggedDocument`` for one data row.

    Args:
        row: Mapping-like row providing ``'excerpt'`` (the text to tokenise)
            and ``'id'`` (used as the document's single tag).

    Returns:
        TaggedDocument: ``words`` is the list of tokens produced by
        ``nltk.word_tokenize`` and ``tags`` is ``[row['id']]``.
    """
    # Tokens are used as-is: no stop-word removal or stemming is applied.
    tokens = list(nltk.word_tokenize(row['excerpt']))
    return TaggedDocument(words=tokens, tags=[row['id']])

In [None]:
# Attach a TaggedDocument to every row (row-wise apply) in both frames.
for frame in (train_df, test_df):
    frame['taggedDocument'] = frame.apply(create_taggedDocument_from_text, axis=1)

In [None]:
# Doc2Vec training corpus: all training documents followed by all test documents.
training_docs = [
    *train_df['taggedDocument'].values.tolist(),
    *test_df['taggedDocument'].values.tolist(),
]

In [None]:
# Train the Doc2Vec model on the combined corpus (hyper-parameters can be tuned here).
doc2vec_params = dict(
    epochs=50,           # number of passes over the corpus
    alpha=0.0025,        # initial learning rate
    min_alpha=0.000001,  # learning rate decays towards this value during training
    sample=0.001,        # threshold for down-sampling high-frequency words
    min_count=5,         # ignore words whose total frequency is below 5
    window=15,           # maximum distance between the current and predicted word
    negative=5,          # number of noise words drawn for negative sampling
    ns_exponent=0.75,    # exponent shaping the negative-sampling distribution
    dbow_words=0,        # no skip-gram word-vector training (only relevant for dm=0)
    dm=1,                # training algorithm: distributed memory (PV-DM); dm=0 would be DBOW
)
dvmodel = Doc2Vec(documents=training_docs, **doc2vec_params)

In [None]:
# Extract the learned Doc2Vec document vectors as model features.
def _docvecs_frame(model, ids):
    """Collect the document vectors for ``ids`` into a feature frame.

    Args:
        model: Trained Doc2Vec model; vectors are looked up as ``model.dv[tag]``.
        ids: Iterable of document tags to look up.

    Returns:
        pandas.DataFrame: One row per element of ``ids`` (in order) with an
        ``'id'`` column followed by one integer-labelled column per vector
        component. If ``ids`` is empty, only the ``'id'`` column is present.
    """
    ids = list(ids)
    vectors = [model.dv[doc_id] for doc_id in ids]
    # Stack once into a 2-D array instead of inserting one DataFrame column
    # per document and transposing: per-column inserts are slow and fragment
    # the frame. Stacking also keeps the vectors' original dtype.
    data = np.vstack(vectors) if vectors else None
    return pd.DataFrame(data, index=pd.Index(ids, name='id')).reset_index()


train_docvecs_df = _docvecs_frame(dvmodel, train_df["id"])
test_docvecs_df = _docvecs_frame(dvmodel, test_df["id"])

In [None]:
# Hold out 30% of the training rows for validation; the fixed random_state
# makes the split repeatable.
features = train_docvecs_df.drop('id', axis=1)
train_X, val_X, train_y, val_y = train_test_split(
    features,
    train_df['target'],
    test_size=0.3,
    random_state=71,
)

# Wrap both splits as LightGBM datasets; the validation set references the
# training set.
lgb_train = lgb.Dataset(train_X.values, train_y.values)
lgb_eval = lgb.Dataset(val_X.values, val_y.values, reference=lgb_train)

In [None]:
# LightGBM settings: regression objective, evaluated with RMSE.
params = dict(objective='regression', metric='rmse')

# NOTE(review): early_stopping_rounds / verbose_eval as lgb.train keyword
# arguments appear to be accepted only by LightGBM releases before 4.0 (newer
# ones use callbacks) - confirm against the installed version.
lgbModel = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,       # maximum number of boosting iterations
    valid_sets=lgb_eval,
    early_stopping_rounds=500,  # stop after 500 rounds without validation improvement
    verbose_eval=100,           # log the evaluation result every 100 iterations
)

In [None]:
# Validation RMSE, predicting with the best iteration recorded during training.
y_pred = lgbModel.predict(val_X.values, num_iteration=lgbModel.best_iteration)
mse = metrics.mean_squared_error(val_y.values, y_pred)
rmse = np.sqrt(mse)
rmse

In [None]:
# Predict the test set from its document vectors; the 'id' column is dropped
# because it is not a feature.
test_features = test_docvecs_df.drop('id', axis=1).values
predicted = lgbModel.predict(test_features, num_iteration=lgbModel.best_iteration)

In [None]:
# Show the test-set predictions as the cell output.
predicted

In [None]:
# Start from the sample submission and overwrite its 'target' column with the
# test-set predictions.
# NOTE(review): assumes the sample submission rows are in the same order as
# test.csv - confirm.
my_submission = pd.read_csv(
    "../input/commonlitreadabilityprize/sample_submission.csv"
).assign(target=predicted)

In [None]:
# Write the submission without the index column; the file name used here is
# 'submission.csv'.
my_submission.to_csv(path_or_buf='submission.csv', index=False)