In [None]:
# Install Pyphen and textstat from local wheel files under ../input (no index access
# is needed); --no-deps stops pip from trying to resolve further dependencies.
# Pyphen is presumably a textstat dependency, given the shared wheel directory.
!pip install -q --no-deps ../input/textstat-pypi/Pyphen-0.9.3-py2.py3-none-any.whl
!pip install -q --no-deps ../input/textstat-pypi/textstat-0.7.0-py3-none-any.whl

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import missingno
import re
import string
import spacy
import matplotlib.pyplot as plt
nlp = spacy.load('en_core_web_sm',disable=['parser', 'ner'])
from sklearn.feature_extraction.text import CountVectorizer
from textwrap import wrap
import nltk
from textblob import TextBlob
from gensim.models import word2vec
import gensim
import statistics
from sklearn.model_selection import KFold,StratifiedKFold
import textstat
from gensim.models import KeyedVectors


In [None]:
# Locations of the competition's train / test CSV files.
TRAIN_CSV = "../input/commonlitreadabilityprize/train.csv"
TEST_CSV = "../input/commonlitreadabilityprize/test.csv"

# Load both splits into DataFrames.
train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [None]:
# Configure textstat for English text (original note: "count syllables").
textstat.set_lang('en')

In [None]:
# Matches any token that contains at least one digit (e.g. "2", "b2b", "1990s").
_DIGIT_WORD_RE = re.compile(r"\w*\d\w*")
# Matches any single ASCII punctuation character.
_PUNCT_RE = re.compile("[%s]" % re.escape(string.punctuation))


def _nan_if_zero(counts):
    """Return ``counts`` with zeros replaced by NaN so ratios become NaN, not inf."""
    return counts.where(counts != 0)


def _clean_excerpt(text):
    """Lower-case ``text`` and strip digit-bearing tokens, punctuation and newlines."""
    text = text.lower()
    text = _DIGIT_WORD_RE.sub("", text)
    text = _PUNCT_RE.sub("", text)
    # Replace (rather than delete) newlines: deleting them glued the last word of a
    # line to the first word of the next one ("world\nsecond" -> "worldsecond").
    return text.replace("\n", " ")


def preprocessing(df):
    """Add hand-crafted text features to ``df`` and normalise ``df["excerpt"]``.

    ``df`` is modified in place (new columns are added and ``excerpt`` is
    overwritten with its cleaned form) and is also returned. Because the
    upper-case count relies on the raw text, calling this twice on the same
    frame does not reproduce the first call's features.

    Columns computed from the raw excerpt:
        len_sentence      -- number of characters in the excerpt.
        count_sentence    -- number of '.', '!' and '?' characters.
        len_per_sentence  -- len_sentence / count_sentence.
        uppers            -- number of ASCII upper-case letters.

    Columns computed from the cleaned excerpt:
        count_words   -- number of whitespace-separated tokens.
        len_per_word  -- len_sentence / count_words; approximate, because the
                         numerator is the raw character count.
        count_nouns   -- number of TextBlob noun phrases.
        len_per_noun  -- total length of the last word of every noun phrase,
                         divided by count_nouns.
        reading_ease  -- textstat.flesch_reading_ease.
        dale_chall    -- textstat.dale_chall_readability_score.

    Ratios whose denominator is 0 are NaN.

    Args:
        df: DataFrame with a string column ``excerpt``.

    Returns:
        The same DataFrame object, with the columns above added.
    """
    # ---- features that need the raw (uncleaned) text ----
    df["len_sentence"] = df["excerpt"].apply(len)
    df["count_sentence"] = df["excerpt"].apply(
        lambda text: text.count(".") + text.count("!") + text.count("?")
    )
    df["len_per_sentence"] = df["len_sentence"] / _nan_if_zero(df["count_sentence"])
    df["uppers"] = df["excerpt"].apply(lambda text: len(re.findall(r"[A-Z]", text)))

    # ---- normalise the text; everything below sees the cleaned excerpt ----
    df["excerpt"] = df["excerpt"].apply(_clean_excerpt)

    df["count_words"] = df["excerpt"].apply(lambda text: len(text.split()))
    df["len_per_word"] = df["len_sentence"] / _nan_if_zero(df["count_words"])

    # Noun-phrase extraction is the expensive step, so run it once per row and
    # derive both noun features from the same result.
    noun_phrases = df["excerpt"].apply(lambda text: list(TextBlob(text).noun_phrases))
    df["count_nouns"] = noun_phrases.apply(len)
    last_word_chars = noun_phrases.apply(
        lambda phrases: sum(len(phrase.split()[-1]) for phrase in phrases)
    )
    df["len_per_noun"] = last_word_chars / _nan_if_zero(df["count_nouns"])

    # NOTE(review): the readability scores are computed on the cleaned text, from
    # which sentence punctuation has already been removed -- confirm that this is
    # intended.
    df["reading_ease"] = df["excerpt"].apply(textstat.flesch_reading_ease)
    df["dale_chall"] = df["excerpt"].apply(textstat.dale_chall_readability_score)

    return df

In [None]:
# Run the same feature engineering / text cleaning over both splits.
train, test = map(preprocessing, (train, test))

In [None]:
# Pairwise correlations between the numeric columns of `train`. The numeric columns
# are selected explicitly because `train` still contains string columns (e.g. the
# excerpt text), and DataFrame.corr() raises on non-numeric data in pandas >= 2.0
# instead of silently dropping it.
train.select_dtypes(include="number").corr()

In [None]:
# Pre-trained word embeddings in word2vec format: the GoogleNews vectors (binary
# file) and the fastText wiki-news vectors (text .vec file).
model = KeyedVectors.load_word2vec_format(
    '../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin',
    binary=True,
)
model_2 = KeyedVectors.load_word2vec_format(
    '../input/fasttext-wikinews/wiki-news-300d-1M.vec'
)

In [None]:
def words2wvs(words,wv):
    """Collect the embedding of every word in ``words`` that ``wv`` knows.

    Args:
        words: iterable of string tokens; empty strings are skipped.
        wv: object exposing ``get_vector(word)`` that raises ``KeyError`` for
            out-of-vocabulary words (e.g. a gensim ``KeyedVectors``).

    Returns:
        ``np.ndarray`` of shape ``(n_found, dim)``, one row per word found, in
        input order. When no word is found the array has shape
        ``(0, wv.vector_size)`` if ``wv`` exposes ``vector_size`` (so a mean over
        axis 0 still has ``dim`` entries), otherwise it is an empty 1-D array.

    Raises:
        Any exception other than ``KeyError`` raised by ``wv.get_vector``.
    """
    vectors = []
    for word in words:
        if not word:
            continue
        try:
            vectors.append(wv.get_vector(word))
        except KeyError:
            # Out-of-vocabulary word: skip it. Only KeyError is swallowed so that
            # real failures (and KeyboardInterrupt) are not hidden.
            continue

    if vectors:
        return np.array(vectors)

    dim = getattr(wv, "vector_size", None)
    if dim is None:
        return np.array([])
    return np.empty((0, dim))

In [None]:
def mk_wv(df):
    """Build one document-embedding row per excerpt.

    For each row of ``df`` the word vectors found in ``model`` are averaged, the
    same is done with ``model_2``, and the two mean vectors are added
    element-wise.

    Args:
        df: DataFrame with a string column ``excerpt``; tokens are obtained with
            ``str.split()``.

    Returns:
        DataFrame with one row per row of ``df`` and one column per embedding
        dimension. It carries ``df``'s index so that a later
        ``pd.concat(..., axis=1)`` lines the rows up even when ``df`` does not
        have a default RangeIndex.
    """
    def _mean_vector(text, wv):
        # Average of the vectors of the in-vocabulary tokens of `text`.
        return np.mean(words2wvs(text.split(), wv), axis=0)

    summed = [
        _mean_vector(text, model) + _mean_vector(text, model_2)
        for text in df["excerpt"]
    ]
    return pd.DataFrame(summed, index=df.index)

In [None]:
# Names of the engineered feature columns: every column of `train` except the raw
# text, the labels and the metadata columns.
# "bins" is filtered out as well: a later cell adds that target-derived column to
# `train`, so re-running this cell afterwards would otherwise leak the target into
# the feature list (and `test` has no such column).
non_feature_cols = ["excerpt", "target", "standard_error", "id",
                    "url_legal", "license"]
feat = [
    col
    for col in train.drop(columns=non_feature_cols).columns
    if col != "bins"
]

In [None]:
# Regression target.
train_y = train["target"]

# Design matrices: the engineered feature columns side by side with the summed
# word-embedding columns produced by mk_wv.
train_X = pd.concat([train[feat], mk_wv(train)], axis=1)
test_X = pd.concat([test[feat], mk_wv(test)], axis=1)

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,accuracy_score

In [None]:
# Discretise the continuous target so that the CV folds can be stratified on it.
# The number of bins is floor(1 + log2(number of training rows)).
num_bins = int(np.floor(np.log2(len(train)) + 1))
train.loc[:, 'bins'] = pd.cut(train['target'], bins=num_bins, labels=False)

# NumPy arrays of the bin labels and of the regression target.
bins = train['bins'].to_numpy()
target = train['target'].to_numpy()
def rmse_score(y_true,y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
def get_preds_lgbm(X,y,X_test,bins=bins,nfolds=5):
    """Train LightGBM with stratified K-fold CV and average the test predictions.

    One model is trained per fold with early stopping on the held-out part; the
    fold's validation RMSE and the running mean RMSE are printed.

    Args:
        X: DataFrame of training features.
        y: regression target, positionally aligned with the rows of ``X``
            (Series or array; index labels are ignored).
        X_test: DataFrame of test features with the same columns as ``X``.
        bins: labels used only to stratify the folds. The default is the
            module-level ``bins`` as it was when this function was defined.
        nfolds: number of folds.

    Returns:
        ``(preds, importance)`` where ``preds`` is the mean over folds of the
        predictions for ``X_test`` and ``importance`` is a list with one
        single-column (``'importance'``) DataFrame per fold, indexed by the
        columns of ``X``.
    """
    importance = []
    scores = []
    preds = np.zeros(X_test.shape[0])
    # Fold indices are positions, so index a plain array: `y[idx]` on a Series
    # would be a label-based lookup.
    y_values = np.asarray(y)
    lgbm_params = {'objective': 'regression',
                   'random_state': 42,
                   'metric': 'rmse',
                   'max_depth': 25,
                   'learning_rate': 0.045,
                   'num_leaves': 25,
                   }
    kfold = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=42)
    for k, (train_idx, valid_idx) in enumerate(kfold.split(X, bins)):
        X_train, y_train = X.iloc[train_idx, :], y_values[train_idx]
        X_valid, y_valid = X.iloc[valid_idx, :], y_values[valid_idx]
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
        # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
        # arguments of older LightGBM releases; LightGBM 4.x expects callbacks
        # instead -- confirm the installed version before upgrading.
        lgbm_model = lgb.train(lgbm_params, lgb_train,
                               valid_sets=[lgb_train, lgb_eval],
                               num_boost_round=3000,
                               early_stopping_rounds=10,
                               verbose_eval=1500)
        prediction = lgbm_model.predict(X_valid)
        score = rmse_score(y_valid, prediction)
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)
        preds += lgbm_model.predict(X_test)
        # Running mean over the folds finished so far.
        print("mean rmse",np.mean(scores))
        # Index by the columns of the frame that was actually trained on, not by
        # a global variable.
        importance.append(pd.DataFrame(lgbm_model.feature_importance(),
                                       index=X.columns, columns=['importance']))
    return preds / nfolds, importance

In [None]:
# Run the cross-validated training: `res` receives the fold-averaged predictions for
# test_X, `importance` the list of per-fold feature-importance frames.
res,importance = get_preds_lgbm(train_X,train_y,test_X)

In [None]:
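# Experiment note (presumably max_depth / num_leaves -- confirm): 25 25
#
# Recorded score (presumably the mean CV RMSE of that run -- confirm): 0.6765291493911305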

In [None]:
# Load the sample submission file; it is used as the template for the output.
sub = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")

In [None]:
# Overwrite the template's target column with the fold-averaged predictions.
# Assumes sample_submission.csv lists its rows in the same order as test.csv -- TODO confirm.
sub["target"]=res

In [None]:
# Write the submission file, leaving out the DataFrame index.
sub.to_csv("submission.csv",index=False)

In [None]:
# Add up the per-fold importance frames and display the 30 features with the
# largest total importance.
sum(importance).sort_values(by="importance",ascending=False).head(30)