**I wrote this notebook with reference to https://www.kaggle.com/ruchi798/commonlit-readability-prize-eda-baseline and https://www.kaggle.com/syurenuko/clrp-word2vec-lightgbm-baseline**

**datasets: https://www.kaggle.com/alvaromunoz/textstat**

**datasets: https://www.kaggle.com/nltkdata/stopwords**

**How to configure the NLTK data directory from code: https://stackoverflow.com/questions/3522372/how-to-config-nltk-data-directory-from-code**

In [None]:
# Install Pyphen and textstat from the wheel files under ../input/textstat
# (local wheels rather than PyPI -- presumably because the kernel runs without
# internet access; Pyphen goes first, presumably as a textstat dependency).
!pip install ../input/textstat/Pyphen-0.10.0-py3-none-any.whl
!pip install ../input/textstat/textstat-0.7.0-py3-none-any.whl

In [None]:
import os
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import re
import nltk
import textstat
import time
import spacy
import gensim


nltk.data.path.append("/kaggle/input/stopwords/stopwords")

from pandas import DataFrame
from nltk.corpus import stopwords
from nltk import pos_tag
from collections import Counter
from wordcloud import WordCloud,STOPWORDS
from spacy import displacy
from nltk.tokenize import sent_tokenize, word_tokenize 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import KFold
import lightgbm as lgb
import optuna
#nltk.download('stopwords')

In [None]:
# Load the competition CSV files: training data, test data, and the
# sample submission that is later filled in with predictions.
train_df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
sample_submission = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

In [None]:
# Scratch walk-through of the preprocessing steps on a single excerpt.
# Series.min() on a string column yields the lexicographically smallest value.
excerpt1 = train_df['excerpt'].min()

# Replace every non-letter character with a space, then lower-case.
e = re.sub("[^a-zA-Z]", " ", excerpt1)
e = e.lower()

# Split into word tokens.
e = nltk.word_tokenize(e)

# Build the stop-word set once instead of once per token.
english_stopwords = set(stopwords.words("english"))
e = [word for word in e if word not in english_stopwords]

# Lemmatize the remaining tokens and join them back into one string.
lemma = nltk.WordNetLemmatizer()
e = [lemma.lemmatize(word) for word in e]
e = " ".join(e)

In [None]:
#====== Preprocessing function ======
def preprocess(data):
    """Clean every excerpt in ``data`` and return the cleaned strings.

    Each excerpt has its non-letter characters replaced by spaces, is
    lower-cased, tokenized with ``nltk.word_tokenize``, filtered against the
    English stop-word list, lemmatized with ``nltk.WordNetLemmatizer`` and
    re-joined with single spaces.

    Args:
        data: Object indexable by ``'excerpt'`` (e.g. a DataFrame) whose
            ``'excerpt'`` entry is an iterable of strings.

    Returns:
        list: One cleaned string per excerpt, in input order.
    """
    # Loop-invariant helpers: build them once rather than once per token
    # (stop-word set) or once per excerpt (lemmatizer).
    english_stopwords = set(stopwords.words("english"))
    lemma = nltk.WordNetLemmatizer()

    excerpt_processed = []
    for e in data['excerpt']:

        # keep alphabetic characters only
        e = re.sub("[^a-zA-Z]", " ", e)

        # convert to lower case
        e = e.lower()

        # tokenize words
        e = nltk.word_tokenize(e)

        # remove stopwords
        e = [word for word in e if word not in english_stopwords]

        # lemmatization
        e = [lemma.lemmatize(word) for word in e]
        e = " ".join(e)

        excerpt_processed.append(e)

    return excerpt_processed

In [None]:
# Store the cleaned text next to the raw excerpts as a new column in each frame.
train_df["excerpt_preprocessed"] = preprocess(train_df)
test_df["excerpt_preprocessed"] = preprocess(test_df)

In [None]:
def avg_feature_vector(sentence, model, num_features):
    """Return the mean embedding of the words of ``sentence`` found in ``model``.

    The sentence is split on whitespace after newlines, commas and periods
    are replaced by spaces. Words for which ``model[word]`` raises
    ``KeyError`` are ignored and do not count towards the average.

    Args:
        sentence: Text to embed.
        model: Object supporting ``model[word]`` lookup that raises
            ``KeyError`` for unknown words.
        num_features: Length of each embedding vector.

    Returns:
        numpy.ndarray: Vector of length ``num_features``. It is all zeros
        when the sentence is empty or none of its words are in ``model``.
    """
    words = sentence.replace('\n', " ").replace(',', ' ').replace('.', " ").split()
    feature_vec = np.zeros((num_features,), dtype="float32")  # initialize the feature vector
    n_found = 0
    for word in words:
        try:
            vector = model[word]
        except KeyError:
            # out-of-vocabulary word: skip it
            continue
        feature_vec = np.add(feature_vec, vector)
        n_found += 1
    # Guard against 0/0 (NaN vector) when every word was out of vocabulary.
    if n_found > 0:
        feature_vec = np.divide(feature_vec, n_found)
    return feature_vec

# **Word2Vec embeddings**

In [None]:
# Load the pretrained GoogleNews word2vec vectors (binary word2vec format) with gensim.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('../input/nlpword2vecembeddingspretrained/GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
# Feature matrices: one averaged 300-dimensional word2vec vector per excerpt.
word2vec_train = np.zeros((len(train_df.index), 300), dtype="float32")
word2vec_test = np.zeros((len(test_df.index), 300), dtype="float32")

# enumerate() walks the column positionally, so each row is filled correctly
# even if the frame's index is not the default 0..n-1 range (label lookup
# with series[i] would fail or pick the wrong row in that case).
for i, excerpt in enumerate(train_df["excerpt_preprocessed"]):
    word2vec_train[i] = avg_feature_vector(excerpt, word2vec_model, 300)

for i, excerpt in enumerate(test_df["excerpt_preprocessed"]):
    word2vec_test[i] = avg_feature_vector(excerpt, word2vec_model, 300)

## Optuna + LightGBM
1. optimize params
1. cross validation

In [None]:
# Regression target as a NumPy array, taken from the 'target' column of train_df.
target = train_df['target'].to_numpy()

# Fixed LightGBM settings.
# NOTE(review): this module-level dict does not appear to be read by the code
# below -- objective() and the cross-validation loop each build their own
# dict with the same values. Confirm before relying on it.
params = {
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'regression',
    'seed': 42,
    'n_jobs': -1
}


def objective(trial):
    """Optuna objective: train LightGBM with sampled hyperparameters, return hold-out RMSE.

    The data (``word2vec_train`` / ``target``) is split with fixed seeds into
    a hold-out part (25 %) and a remainder, and the remainder is split again
    into training and validation parts (30 % validation). The model is
    trained with early stopping on the validation part and scored on the
    hold-out part.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        float: RMSE of the trained model on the hold-out part.
    """
    params = {
        'boosting_type': 'gbdt',
        'metric': 'rmse',
        'objective': 'regression',
        'seed': 42,
        'n_jobs': -1,
        # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform;
        # parameter names are unchanged, so study.best_params keeps the same keys.
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    # Fixed random_state keeps the three-way split identical across trials.
    train_x, test_x, train_y, test_y = train_test_split(word2vec_train, target, test_size=0.25, random_state=42)
    train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.3, random_state=42)

    train_set = lgb.Dataset(train_x, train_y)
    valid_set = lgb.Dataset(valid_x, valid_y, reference=train_set)

    # Early stopping goes through the callback: the early_stopping_rounds
    # keyword of lgb.train() was removed in LightGBM 4.0, while the callback
    # is available in older releases as well.
    model = lgb.train(params,
                      train_set,
                      num_boost_round=10000,
                      valid_sets=[train_set, valid_set],
                      callbacks=[lgb.early_stopping(stopping_rounds=100)])

    for_param_pred = model.predict(test_x)
    rmse = np.sqrt(mse(test_y, for_param_pred))
    return rmse

# Search for the hyperparameters that minimise the RMSE returned by objective(),
# running 100 trials.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)


# Accumulator for the test predictions averaged over all folds.
pred = np.zeros(test_df.shape[0])

fold = KFold(n_splits=5, shuffle=True, random_state=42)
cv = list(fold.split(word2vec_train, target))
rmses = []  # validation RMSE of each fold

# Fixed settings plus the best hyperparameters found by the Optuna study.
# They are the same for every fold, so the dict is built once.
for_inference_param = {
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'regression',
    'seed': 42,
    'n_jobs': -1,
}
for_inference_param.update(study.best_params)

for tr_idx, val_idx in cv:
    x_tr, x_va = word2vec_train[tr_idx], word2vec_train[val_idx]
    y_tr, y_va = target[tr_idx], target[val_idx]

    train_set = lgb.Dataset(x_tr, y_tr)
    val_set = lgb.Dataset(x_va, y_va, reference=train_set)

    # Training. Early stopping goes through the callback: the
    # early_stopping_rounds keyword of lgb.train() was removed in
    # LightGBM 4.0, while the callback also exists in older releases.
    model = lgb.train(for_inference_param,
                      train_set,
                      num_boost_round=10000,
                      valid_sets=[train_set, val_set],
                      callbacks=[lgb.early_stopping(stopping_rounds=100)])

    y_pred = model.predict(x_va)
    rmse = np.sqrt(mse(y_va, y_pred))
    rmses.append(rmse)

    # Inference: each fold contributes an equal share of the final prediction.
    # Dividing by the actual number of folds keeps this in sync with n_splits.
    test_pred = model.predict(word2vec_test)
    pred += test_pred / len(cv)

# submission file

In [None]:
# Overwrite the 'target' column with the fold-averaged predictions and write
# the submission file. Item assignment is used instead of attribute
# assignment: `df.target = ...` only sets a plain attribute (not a column)
# when the column does not already exist.
sample_submission["target"] = pred
sample_submission.to_csv('submission.csv', index=False)

In [None]:
# Bare expression as the last line of the cell: shows the frame as the cell output.
sample_submission