In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import re

import seaborn as sns
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from tqdm import tqdm

In [None]:
from sklearn import metrics
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.dummy import DummyClassifier

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR, LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBRegressor, XGBRFRegressor

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import ngrams

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from string import punctuation

In [None]:
import warnings
# Suppress every warning for the rest of the session. NOTE(review): this also
# hides deprecation and convergence warnings from the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# set plot rc parameters

# jtplot.style(grid=False)
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['axes.facecolor'] = '#464646'
#plt.rcParams['axes.edgecolor'] = '#FFFFFF'
plt.rcParams['figure.figsize'] = 10, 7
plt.rcParams['text.color'] = '#666666'
plt.rcParams['axes.labelcolor'] = '#333333'
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['axes.titlesize'] = 16
plt.rcParams['xtick.color'] = '#666666'
plt.rcParams['xtick.labelsize'] = 14
plt.rcParams['ytick.color'] = '#666666'
plt.rcParams['ytick.labelsize'] = 14

# plt.rcParams['font.size'] = 16

sns.color_palette('dark')
%matplotlib inline

tqdm.pandas()

## Load Data

In [None]:
# Read the competition files in order: training set, test set, sample submission.
dftrain, dftest, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/commonlitreadabilityprize/train.csv',
        '../input/commonlitreadabilityprize/test.csv',
        '../input/commonlitreadabilityprize/sample_submission.csv',
    )
)

## EDA

In [None]:
# Preview the first rows of the sample submission to see the expected output format.
sample_submission.head()

In [None]:
# (rows, columns) of the training and test frames.
dftrain.shape, dftest.shape

In [None]:
# Preview the first rows of the training data.
dftrain.head()

In [None]:
def _count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())


# Per-excerpt word counts for the training and test sets.
wctrain = dftrain['excerpt'].apply(_count_words)
wctest = dftest['excerpt'].apply(_count_words)

In [None]:
# Largest whitespace-token count among the training excerpts.
wctrain.max()

In [None]:
# Display the whitespace-token count of each test excerpt.
wctest

In [None]:
# Built once when this cell runs rather than on every call: clean_text is
# applied to each excerpt, and none of these objects depend on the input.
_DIGITS_RE = re.compile(r'[0-9]+')
_PUNCT_TABLE = str.maketrans('', '', punctuation)
_STOPWORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def clean_text(sentence):
    """Normalise a piece of text into a list of stemmed, stop-word-free tokens.

    Steps, in order: lowercase the text, replace each run of digits with a
    single space, delete every character in ``string.punctuation``, tokenize
    with NLTK's ``word_tokenize``, drop English stop words, and apply the
    Porter stemmer to the remaining tokens.

    Parameters
    ----------
    sentence : str
        Raw text to clean.

    Returns
    -------
    list of str
        The stemmed tokens in their original order. This is a token list,
        not a re-joined string.
    """
    # Digits become a space (so neighbouring words stay separated), whereas
    # punctuation is deleted outright (e.g. "don't" -> "dont").
    text = _DIGITS_RE.sub(' ', sentence.lower()).strip()
    text = text.translate(_PUNCT_TABLE)

    return [
        _STEMMER.stem(token)
        for token in word_tokenize(text)
        if token not in _STOPWORDS
    ]

In [None]:
# Tokenise/clean every training excerpt; progress_apply shows a tqdm progress bar
# (it is registered by the earlier tqdm.pandas() call).
dftrain['clean_text'] = dftrain['excerpt'].progress_apply(clean_text)

In [None]:
# Apply the same cleaning to the test excerpts (plain apply, no progress bar).
dftest['clean_text'] = dftest['excerpt'].apply(clean_text)

## Vectorize text data

In [None]:
# One entry per training excerpt; each entry is the token list produced by clean_text.
X = dftrain['clean_text'].to_list()

In [None]:
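# Disabled alternative: TF-IDF features instead of Doc2Vec embeddings.
# NOTE: clean_text returns token lists, so the tokens would likely need to be
# joined back into strings (e.g. ' '.join(tokens)) before re-enabling this.
# tfidf = TfidfVectorizer()
# X = tfidf.fit_transform(dftrain['clean_text'])
# Xtest = tfidf.transform(dftest['clean_text'])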

In [None]:
# Hold out 25% of the training documents for validation; random_state pins the split.
Xtrain, Xcv, Ytrain, Ycv = train_test_split(X, dftrain['target'], test_size=0.25, random_state=21)

In [None]:
# Tag each training document with its position in Xtrain. Only the training
# split is used to fit the embedding model, so validation text is not seen.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(Xtrain)]

# seed=21 mirrors the random_state of the train/validation split so the
# embedding initialisation is pinned. With workers > 1, thread scheduling can
# still cause small run-to-run differences; use workers=1 if exact
# repeatability matters more than training speed.
model = Doc2Vec(documents, vector_size=700, window=4, min_count=1, workers=4, seed=21)

In [None]:
# Embed every tokenised document with the trained Doc2Vec model,
# in the order train -> validation -> test.
_infer = model.infer_vector
train_list = list(map(_infer, Xtrain))
cv_list = list(map(_infer, Xcv))
test_list = list(map(_infer, dftest['clean_text'].to_list()))

In [None]:
# Stack each list of inferred vectors into a single NumPy array.
train_arr, cv_arr, test_arr = (
    np.array(vectors) for vectors in (train_list, cv_list, test_list)
)

## Train models

In [None]:
def print_summary(model, Xtrain, Ytrain, Xcv, Ycv):
    """Print the RMSE of a fitted model on the training and validation sets.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    Xtrain, Ytrain
        Training features and their true targets.
    Xcv, Ycv
        Validation features and their true targets.

    Returns
    -------
    None
        The two scores are written to stdout only.
    """
    def _rmse(actual, predicted):
        # Root of sklearn's mean squared error.
        return np.sqrt(metrics.mean_squared_error(actual, predicted))

    # Run both predictions first, then score, then print.
    predicted_train = model.predict(Xtrain)
    predicted_cv = model.predict(Xcv)
    scores = (_rmse(Ytrain, predicted_train), _rmse(Ycv, predicted_cv))

    for template, score in zip(('Training RMSE: {}', 'Validation RMSE: {}'), scores):
        print(template.format(score))

### XGBoost

In [None]:
# Gradient-boosted regressor with default hyperparameters, fitted on the
# Doc2Vec vectors of the training split.
xgb = XGBRegressor()
xgb.fit(train_arr, Ytrain)

In [None]:
# Training vs. validation RMSE for the XGBRegressor model.
print_summary(xgb, train_arr, Ytrain, cv_arr, Ycv)

In [None]:
# XGBoost's random-forest regressor with default hyperparameters, fitted on
# the same training vectors for comparison.
xgbrf = XGBRFRegressor()
xgbrf.fit(train_arr, Ytrain)

In [None]:
# Training vs. validation RMSE for the XGBRFRegressor model.
print_summary(xgbrf, train_arr, Ytrain, cv_arr, Ycv)

## Prediction

In [None]:
# Test-set predictions come from the XGBRegressor model (xgb), not xgbrf.
Ytest = xgb.predict(test_arr)

In [None]:
# Pair each test id with its predicted target.
submission = pd.DataFrame({'id': dftest['id'], 'target': Ytest})

In [None]:
# Write the predictions to submission.csv in the working directory, without the index column.
submission.to_csv('submission.csv', index=False)