



This notebook trains a LightGBM regression model on Word2vec sentence embeddings and uses it for inference. The model is very light, so it can be run on a CPU.

Word2vec represents each word as a 300-dimensional vector. Each excerpt is represented by averaging the 300-dimensional vectors of its words, which gives a single 300-dimensional vector per excerpt.

The LightGBM parameters have hardly been tuned, so there is likely room to improve the score by adjusting them.

In [None]:
import numpy as np
import pandas as pd
import gensim

import lightgbm as lgb

from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [None]:
# Folder holding the competition CSV files.
data_dir = '../input/commonlitreadabilityprize/'

# Read the three competition tables in one pass (same order as the names below).
train, test, sample_submission = (
    pd.read_csv(data_dir + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Regression labels as a plain NumPy array, aligned row-for-row with `train`.
target = train['target'].to_numpy()

# Embedding by Word2vec

In [None]:
# Pretrained Google News word vectors stored in the binary word2vec format.
word2vec_path = '../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Shape of the embedding matrix held by the loaded model.
print(word2vec_model.vectors.shape)

In [None]:
def avg_feature_vector(sentence, model, num_features):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    The sentence is tokenised crudely: newlines, commas and periods are
    replaced by spaces and the result is split on whitespace. Tokens for which
    ``model[token]`` raises ``KeyError`` are skipped and do not count towards
    the average.

    Parameters
    ----------
    sentence : str
        Text to embed.
    model : mapping-like
        Object supporting ``model[word]`` that returns a vector of length
        ``num_features`` and raises ``KeyError`` for unknown words.
    num_features : int
        Dimensionality of the word vectors (and of the returned vector).

    Returns
    -------
    numpy.ndarray
        Vector of shape ``(num_features,)``. If no token is found in ``model``
        (including the empty sentence) the zero vector is returned instead of
        dividing by zero.
    """
    words = sentence.replace('\n', " ").replace(',', ' ').replace('.', " ").split()
    # Initialise the feature vector (accumulator for the known word vectors).
    feature_vec = np.zeros((num_features,), dtype="float32")
    known_count = 0
    for word in words:
        try:
            word_vec = model[word]
        except KeyError:
            # Out-of-vocabulary token: ignore it.
            continue
        feature_vec = np.add(feature_vec, word_vec)
        known_count += 1
    # Guard on the number of words actually summed, not on len(words):
    # a sentence made only of unknown tokens would otherwise divide by zero.
    if known_count > 0:
        feature_vec = np.divide(feature_vec, known_count)
    return feature_vec

In [None]:
# Initialise the feature matrices: one 300-dimensional float32 row per excerpt.
word2vec_train = np.zeros((train.shape[0], 300), dtype="float32")
word2vec_test = np.zeros((test.shape[0], 300), dtype="float32")

# Fill each matrix row by row with the averaged word vectors of the matching excerpt.
for embeddings, frame in ((word2vec_train, train), (word2vec_test, test)):
    for row in range(frame.shape[0]):
        embeddings[row] = avg_feature_vector(frame["excerpt"][row], word2vec_model, 300)

In [None]:
# Sanity check: show the shapes of the train features, the labels and the test features.
for array in (word2vec_train, target, word2vec_test):
    print(array.shape)

# Training & Inference
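LightGBM trained with 5-fold cross-validation (KFold, n_splits=5); the test predictions of the five fold models are averaged.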

In [None]:
# LightGBM settings shared by every fold.
params = {
    # Booster type and training objective / evaluation metric.
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'regression',
    # Fixed seed for reproducibility.
    'seed': 42,
    'learning_rate': 0.01,
    # Threading and logging options.
    "n_jobs": -1,
    "verbose": -1
}

# Zero-initialised buffer, one slot per test row, for the test-set predictions.
pred = np.zeros(len(test.index))

In [None]:
# 5-fold split, shuffled with a fixed seed so the folds are the same on every run.
# KFold is already imported in the notebook's import cell, so it is not re-imported here.
fold = KFold(n_splits=5, shuffle=True, random_state=42)

# Materialise the (train indices, validation indices) pairs so they can be reused.
cv = list(fold.split(word2vec_train, target))

In [None]:
# Start from a clean buffer so that re-running this cell does not stack a second
# round of fold predictions on top of the first one.
pred = np.zeros(word2vec_test.shape[0])
# Weight each fold model by the actual number of folds instead of a hard-coded 5.
n_folds = len(cv)

rmses = []
for tr_idx, val_idx in cv:
    x_tr, x_va = word2vec_train[tr_idx], word2vec_train[val_idx]
    y_tr, y_va = target[tr_idx], target[val_idx]

    train_set = lgb.Dataset(x_tr, y_tr)
    val_set = lgb.Dataset(x_va, y_va, reference=train_set)

    # Training: up to 10000 boosting rounds with early stopping (patience of 100 rounds).
    # NOTE(review): the `early_stopping_rounds` / `verbose_eval` keyword arguments are
    # not accepted by newer LightGBM releases (callbacks are used instead) - confirm
    # the installed version before upgrading.
    model = lgb.train(params, train_set, num_boost_round=10000, early_stopping_rounds=100,
                      valid_sets=[train_set, val_set], verbose_eval=-1)

    # Score this fold's model on its held-out part.
    y_pred = model.predict(x_va)
    rmse = np.sqrt(mean_squared_error(y_va, y_pred))
    rmses.append(rmse)

    # Inference: add this fold's share of the averaged test prediction.
    test_pred = model.predict(word2vec_test)
    pred += test_pred / n_folds

print("\n", "Mean Fold RMSE:", np.mean(rmses))

In [None]:
# Store the averaged predictions in the `target` column. Bracket assignment is used
# because attribute-style assignment would only set a Python attribute (not a column)
# if `target` were ever missing from the frame.
sample_submission['target'] = pred

# Write the submission file without the index column.
sample_submission.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame as this cell's output for a final visual check.
sample_submission