In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import string
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [None]:
# Load the competition's training and test splits (in that order) from the
# Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/commonlitreadabilityprize/train.csv',
        '../input/commonlitreadabilityprize/test.csv',
    )
)

# Exploratory Data Analysis (EDA)

In [None]:
# Preview the training frame as loaded, before any columns are dropped.
train

In [None]:
# Preview the test frame as loaded, before any columns are dropped.
test

In [None]:
# Drop the url_legal and license columns from both frames.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it no longer raises KeyError once the
# columns have already been removed.
unused_columns = ['url_legal', 'license']
train = train.drop(columns=unused_columns, errors='ignore')
test = test.drop(columns=unused_columns, errors='ignore')

In [None]:
# Set the seaborn theme, then draw standard_error against target with a fitted
# regression line.
sns.set_theme(palette='coolwarm', style='darkgrid', context='notebook')

sns.lmplot(data=train, y='standard_error', x='target')

In [None]:
# Hexbin joint distribution of target vs. standard_error.
# jointplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(...) only leaves a stray empty figure behind; the size is
# set through `height` instead. `palette` only takes effect together with `hue`,
# so the hexbin colours are set through `cmap`, which is forwarded to the joint plot.
sns.jointplot(x='target', y='standard_error', data=train, kind='hex',
              height=7, cmap='rainbow')

# Text Preprocessing

In [None]:
def process_text(text):
    """
    Clean one or more text documents and return them as a single string.

    Each document has its digits and punctuation removed, is split on
    whitespace, lowercased, and stripped of NLTK English stopwords.

    Parameters
    ----------
    text : str or iterable of str
        A single document, or any iterable of documents (list, pandas
        Series, ...). A bare string is treated as one document instead of
        being iterated character by character.

    Returns
    -------
    str
        The remaining words of all documents separated by single spaces;
        each document's words are followed by one trailing space.
    """
    # A plain str is itself iterable; wrap it so the loop sees one document.
    if isinstance(text, str):
        text = [text]

    # Load the stopword list once, as a set, instead of re-reading the corpus
    # for every single word.
    english_stopwords = frozenset(stopwords.words('english'))
    punctuation = frozenset(string.punctuation)

    cleaned_docs = []
    for doc in text:
        # Remove digits and punctuation in one pass over the characters.
        kept_chars = ''.join(
            char for char in doc
            if not char.isdigit() and char not in punctuation
        )
        # Tokenize on whitespace, lowercase each token, drop the stopwords.
        tokens = (token.lower() for token in kept_chars.split())
        kept_words = [word for word in tokens if word not in english_stopwords]
        cleaned_docs.append(" ".join(kept_words) + " ")

    # Join once at the end rather than growing a string with += in the loop.
    return "".join(cleaned_docs)

In [None]:
# Clean every training excerpt and merge the result into one string for the
# word cloud.
train_excerpts = train['excerpt']
words = process_text(train_excerpts)

In [None]:
# Word cloud of the cleaned training text.
stw = set(STOPWORDS)  # the wordcloud package's built-in stopword list
cloud_options = dict(
    width=1200,
    height=1000,
    background_color='grey',
    colormap='rainbow',
    stopwords=stw,
    min_font_size=10,
)
wordcloud = WordCloud(**cloud_options).generate(words)

# Draw the cloud on an explicit figure/axes pair with the axes hidden.
fig, ax = plt.subplots(figsize=(12, 10), facecolor='green')
ax.imshow(wordcloud)
ax.axis("off")
fig.tight_layout(pad=0)

plt.show()

In [None]:
# Model input is the excerpt text; the regression target is the `target` column.
X, y = train['excerpt'], train['target']


In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
split_parts = train_test_split(X, y, random_state=100, test_size=0.3)
X_train, X_test, y_train, y_test = split_parts

# Gradient Boosting Regressor

In [None]:
def excerpt_tokens(excerpt):
    """Return the cleaned word tokens of a single excerpt.

    CountVectorizer calls a callable analyzer once per raw document and treats
    each item of the returned sequence as one feature. process_text returns a
    single string, so passing it directly as the analyzer turns every
    *character* into a feature. Wrapping the document in a list and splitting
    the cleaned string yields word-level features instead.
    """
    return process_text([excerpt]).split()


gbr = GradientBoostingRegressor()
grid = {
    'n_estimators': [500, 1000],
    'learning_rate': [.001, 0.01],
    'max_depth': [1, 2],
    'subsample': [.5, .75],
    'random_state': [1],
}

search = GridSearchCV(estimator=gbr, param_grid=grid, scoring='neg_mean_squared_error', n_jobs=1)

# The grid search is the final pipeline step: fitting the pipeline vectorizes
# the text first and then tunes the regressor on the resulting TF-IDF matrix.
pipeline = Pipeline([
    ('cv', CountVectorizer(analyzer=excerpt_tokens)),
    ('tfidf', TfidfTransformer()),
    ('regressor', search),
])



In [None]:
# Fit on the training split (this runs the grid search) and score the
# held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

mse = metrics.mean_squared_error(y_test, y_pred)  # computed once, reused for RMSE
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))
# The label promises R squared, so report r2_score. explained_variance_score
# ignores a constant offset in the predictions and only equals R^2 when the
# mean residual is zero.
print('R Square:', metrics.r2_score(y_test, y_pred))

In [None]:
# Refit on the complete training data. The grid search is the pipeline's last
# step, so this repeats the whole hyper-parameter search.
pipeline.fit(X, y=y)

In [None]:
# Predict a target value for every test excerpt.
test_excerpts = test['excerpt']
sub = pipeline.predict(test_excerpts)

In [None]:
# Inspect the predictions returned by pipeline.predict for the test excerpts.
sub

In [None]:
# Start from the sample submission file and overwrite its id and target columns
# with the test ids and the model's predictions.
submission = (
    pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')
    .assign(id=test['id'], target=sub)
)

In [None]:
# Write the submission to the current directory without the index column.
submission.to_csv(path_or_buf="submission.csv", index=False)