In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# NLP tooling used by the text-cleaning helpers and the GloVe featuriser below.
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk import pos_tag
# NLTK English stop-word list; consulted by sent2vec.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available locally - confirm.
stop_words = stopwords.words('english')

import spacy
# spaCy pipeline shared by removeStopwords (it reads each token's is_stop flag).
nlp = spacy.load('en_core_web_lg')
from spacy import displacy
import string
import re
from collections import Counter

In [None]:
# Read the competition CSVs: the training frame (its 'excerpt' and 'target'
# columns are used below), the test frame, and the sample submission whose
# 'target' column is overwritten with predictions at the end of the notebook.
train_df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
submission = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

In [None]:
# Quick look at the first rows of the training frame.
train_df.head()

In [None]:
def removeStopwords(text):
    """Drop spaCy stop-word tokens from ``text``.

    The text is parsed with the module-level ``nlp`` pipeline and every token
    whose ``is_stop`` flag is false is kept, each preceded by a single space.
    The result additionally starts with one extra space, so a non-empty result
    begins with two spaces and an input with no surviving tokens yields ``' '``.

    Parameters
    ----------
    text : str
        Raw text to filter.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (with leading padding).
    """
    kept = [' ' + str(token) for token in nlp(text) if not token.is_stop]
    return ' ' + ''.join(kept)

In [None]:
def removePunctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Characters are removed outright (not replaced by a space), so words that
    were separated only by punctuation are fused together.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    drop_table = {ord(symbol): None for symbol in string.punctuation}
    return text.translate(drop_table)

In [None]:
def removeLinks(text):
    """Delete URLs from ``text``.

    Two forms are recognised: anything starting with ``http://`` or
    ``https://`` and anything starting with ``www.``; in both cases the match
    extends up to the next whitespace character.  Matches are removed without
    inserting a replacement, so surrounding spaces are left untouched.

    Parameters
    ----------
    text : str
        Text that may contain links.

    Returns
    -------
    str
        ``text`` with every matched link removed.
    """
    # Raw string: '\S' and '\.' are regex escapes, not Python string escapes.
    # In a plain literal they are invalid escape sequences and emit a warning.
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [None]:
def removeNumbers(text):
    """Strip every run of digit characters from ``text``.

    Digits are deleted without a replacement, so ``'abc123def'`` becomes
    ``'abcdef'``.
    """
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

In [None]:
def clean(text):
    """Normalise one excerpt for downstream tokenisation.

    Steps, in order: lower-case, strip links, drop spaCy stop words, delete
    punctuation, delete digits.

    Links are stripped *before* punctuation is deleted: removePunctuations
    removes ':', '/' and '.', which are exactly the characters the link
    pattern in removeLinks needs in order to match.  Running removeLinks
    after it (as this function previously did) made link removal a no-op and
    left fused fragments such as 'httpsexamplecom' in the text.

    Parameters
    ----------
    text : str
        Raw excerpt.

    Returns
    -------
    str
        The cleaned excerpt.
    """
    text = text.lower()
    text = removeLinks(text)
    text = removeStopwords(text)
    text = removePunctuations(text)
    text = removeNumbers(text)
    return text

In [None]:
# Add the cleaned text as an 'excerpt_clean' column on both frames.
for frame in (train_df, test_df):
    frame['excerpt_clean'] = frame['excerpt'].apply(clean)

In [None]:
# Count the distinct whitespace-separated tokens in the cleaned training text.
results = Counter()
for token_list in train_df['excerpt_clean'].str.lower().str.split():
    results.update(token_list)
print(len(results))

In [None]:
# Display the cleaned training excerpts.
train_df['excerpt_clean']

In [None]:
# Preview what Snowball stemming does to the tokens of one cleaned excerpt
# (the row labelled 1), printing each token next to its stem.
stemmer = SnowballStemmer(language = 'english')
tokens = train_df['excerpt_clean'][1].split()
for token in tokens:
    print(token + '-->' + stemmer.stem(token))

In [None]:
def stemWord(text):
    """Stem every whitespace-separated token of ``text`` with Snowball (English).

    Each stem is emitted with one leading space, so a non-empty result starts
    with a space; text without tokens yields the empty string.

    Parameters
    ----------
    text : str
        Text whose tokens should be stemmed.

    Returns
    -------
    str
        The stems, each prefixed by a single space.
    """
    snowball = SnowballStemmer(language='english')
    return ''.join(' ' + snowball.stem(word) for word in text.split())

In [None]:
# Replace the cleaned training text with its stemmed form.
# The column is overwritten from itself, so re-running this cell stems text
# that has already been stemmed.
train_df['excerpt_clean'] = train_df['excerpt_clean'].apply(stemWord)

In [None]:
# Replace the cleaned test text with its stemmed form.
# The column is overwritten from itself, so re-running this cell stems text
# that has already been stemmed.
test_df['excerpt_clean'] = test_df['excerpt_clean'].apply(stemWord)

In [None]:
# Recount the distinct tokens now that the training text has been stemmed.
results = Counter()
for token_list in train_df['excerpt_clean'].str.lower().str.split():
    results.update(token_list)
print(len(results))

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Model inputs: cleaned/stemmed text as features, 'target' as the label.
X, y = train_df['excerpt_clean'], train_df['target']

# Test-set text; turned into GloVe features further down.
X_test = test_df['excerpt_clean']

# Sanity check: features and labels must have the same length.
print(len(X), len(y))
print(len(X_test))

In [None]:
# Corpus the Keras tokenizer is fitted on and that is converted to sequences.
text = train_df['excerpt_clean']

In [None]:
# Settings for the Keras tokenizer and sequence padding.
# vocab_size is passed to Tokenizer as num_words.
# NOTE(review): 16662 is hard-coded - presumably taken from a vocabulary count
# printed earlier; confirm it still matches after any change to the cleaning.
vocab_size = 16662
# Not referenced by any other cell visible in this notebook.
embedding_dim = 64
# Every sequence is padded / truncated to this many token ids.
max_length = 50
# 'tranc_type' (sic) is passed as pad_sequences' `truncating` argument.
tranc_type = 'post'
# Passed as pad_sequences' `padding` argument.
pad_type = 'post'
# Token the tokenizer substitutes for out-of-vocabulary words.
oov_tok = '<OOV>'

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the tokenizer on the training text only; the test text is later encoded
# with this same fitted tokenizer.
tokenizer = Tokenizer(num_words = vocab_size,oov_token=oov_tok)
tokenizer.fit_on_texts(text)
# Word -> integer id mapping learned by the tokenizer.
word_index = tokenizer.word_index

In [None]:
# Encode the training text as integer id sequences, then pad / truncate each
# one to max_length so they form a rectangular array.
training_sequences = tokenizer.texts_to_sequences(text)
training_padded = pad_sequences(training_sequences, maxlen = max_length,truncating=tranc_type,padding = pad_type)
# Regression targets as a NumPy array.
training_label_final = np.array(train_df['target'])

In [None]:
# Encode and pad the test text with the tokenizer fitted on the training text.
testing_sequences = tokenizer.texts_to_sequences(test_df['excerpt_clean'])
testing_padded = pad_sequences(testing_sequences , maxlen = max_length, truncating=tranc_type,padding = pad_type)

In [None]:
# Inspect the padded training sequences.
training_padded 

In [None]:
# Shape of the padded training array.
training_padded.shape

In [None]:
# Shape of the padded test array.
testing_padded.shape

In [None]:
from sklearn.model_selection import train_test_split
# Hold out a validation set from the padded sequences. No test_size is given,
# so train_test_split's default split proportion applies; the seed is fixed.
X_train,X_val,y_train,y_val = train_test_split(training_padded,training_label_final,random_state = 42)

Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse

# Baseline: ordinary least squares on the padded token-id sequences,
# scored on the held-out validation split.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

y_pred = lr_model.predict(X_val)
val_mse = mse(y_val, y_pred)
print("MSE:", val_mse)
print("RMSE:", np.sqrt(val_mse))

Gradient Boosting Model

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error as mse

# Gradient boosting with default hyperparameters (fixed seed), scored on the
# held-out validation split.
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)

y_pred = gb_model.predict(X_val)
val_mse = mse(y_val, y_pred)
print("MSE:", val_mse)
print("RMSE:", np.sqrt(val_mse))

Gradient Boosting With Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Randomised hyperparameter search for gradient boosting on the padded
# token-id features.  Both the estimator and the search are seeded: the grid
# samples subsample < 1 (stochastic boosting) and the search draws random
# parameter combinations, so without seeds every run picks a different winner.
gb_model = GradientBoostingRegressor(random_state=42)

param_grid = {'n_estimators': range(20,81,10), 'max_depth':range(5,16,2), 'min_samples_split':range(200,1001,200),'max_features':range(7,20,2),
             'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}


random_search = RandomizedSearchCV(gb_model,param_grid, cv = 5,scoring = 'neg_mean_squared_error',verbose=1,random_state=42)
# NOTE(review): the search is fitted on the full padded training array, which
# includes the rows later held out as X_val - confirm this is intended.
random_search.fit(training_padded,training_label_final)

In [None]:
# Estimator refit with the best parameter combination found by the search.
random_search.best_estimator_

In [None]:
# Gradient boosting with hand-entered hyperparameters.
# NOTE(review): these values look copied from an earlier random_search run -
# confirm they still match random_search.best_params_.
# random_state is fixed because subsample < 1 makes each fit stochastic.
model = GradientBoostingRegressor(max_depth = 13, max_features = 19,min_samples_split = 600,
                                  n_estimators = 70,subsample=0.75,random_state = 42)
model.fit(X_train,y_train)
y_pred = model.predict(X_val)
# Compute the validation MSE once and derive RMSE from it.
val_mse = mse(y_val,y_pred)
print("MSE:", val_mse)
print("RMSE:", np.sqrt(val_mse))

GloVe Embeddings

In [None]:
# Load the pre-trained GloVe vectors into a {word: float32 vector} dict.
# Each line of the file is "<word> <v1> <v2> ... <vN>".
embeddings_index = {}
GLOVE_DIR = '../input/glove6b/glove.6B.50d.txt'
# The context manager closes the file even if a line fails to parse; the
# explicit encoding avoids depending on the platform's default codec.
with open(GLOVE_DIR, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Sanity check: dimensionality of one loaded embedding vector.
embeddings_index['dog'].shape

In [None]:
def sent2vec(s, dim=50):
    """Turn a piece of text into a single unit-length GloVe vector.

    The text is lower-cased and tokenised with NLTK; stop words and tokens
    that are not purely alphabetic are discarded.  The embeddings of the
    remaining tokens found in ``embeddings_index`` are summed and the sum is
    scaled to unit Euclidean length.

    Parameters
    ----------
    s : object
        Text to embed; converted with ``str`` first.
    dim : int, optional
        Length of the zero vector returned when nothing can be embedded.
        Should equal the dimensionality of the loaded embeddings (default 50).

    Returns
    -------
    numpy.ndarray
        The normalised sum of the token embeddings, or ``np.zeros(dim)`` when
        no token has an embedding or the summed vector has zero length.
    """
    tokens = word_tokenize(str(s).lower())
    # Explicit membership test instead of a bare `except`, so unrelated errors
    # are no longer silently swallowed.
    vectors = [
        embeddings_index[tok]
        for tok in tokens
        if tok not in stop_words and tok.isalpha() and tok in embeddings_index
    ]
    if not vectors:
        return np.zeros(dim)
    v = np.sum(vectors, axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Avoid dividing by zero, which would put NaNs into the feature matrix.
        return np.zeros(dim)
    return v / norm

In [None]:
# New 80/20 split on the cleaned text (not the padded sequences).
# This rebinds X_train, X_val and y_train from the earlier padded-sequence
# split; the validation labels go to y_valid, so the earlier y_val still
# refers to the previous split and must not be paired with this X_val.
X_train,X_val,y_train,y_valid = train_test_split(train_df['excerpt_clean'],train_df['target'],test_size = 0.2,random_state = 42)

In [None]:
# Embed every excerpt of each split as one GloVe sentence vector.
X_train_glove = list(map(sent2vec, X_train))
X_val_glove = list(map(sent2vec, X_val))
X_test_glove = list(map(sent2vec, X_test))

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Randomised hyperparameter search for gradient boosting on the GloVe
# sentence vectors.  Both the estimator and the search are seeded: the grid
# samples subsample < 1 (stochastic boosting) and the search draws random
# parameter combinations, so without seeds every run picks a different winner.
gb_model = GradientBoostingRegressor(random_state=42)

param_grid = {'n_estimators': range(20,81,10), 'max_depth':range(5,16,2), 'min_samples_split':range(200,1001,200),'max_features':range(7,20,2),
             'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}


random_search = RandomizedSearchCV(gb_model,param_grid, cv = 5,scoring = 'neg_mean_squared_error',verbose=2,random_state=42)
random_search.fit(X_train_glove,y_train)

In [None]:
# Best parameter combination found by the search on the GloVe features.
random_search.best_params_

In [None]:
# Estimator refit with the best parameter combination.
random_search.best_estimator_

In [None]:
# RandomizedSearchCV (refit=True by default) has already refit the best
# estimator on the data passed to random_search.fit, i.e. X_train_glove and
# y_train, so it is used as-is; fitting it again would only repeat that work.
model = random_search.best_estimator_
y_pred = model.predict(X_val_glove)
# Compute the validation MSE once and derive RMSE from it.
val_mse = mse(y_valid,y_pred)
print("MSE:", val_mse)
print("RMSE:", np.sqrt(val_mse))

In [None]:
# Submission frame as loaded, before predictions are written into it.
submission

In [None]:
# Fill the submission with the GloVe model's predictions for the test set.
# NOTE(review): assumes the submission rows are in the same order as test_df - confirm.
submission['target'] = model.predict(X_test_glove)

In [None]:
# Submission frame after the predictions have been written.
submission

In [None]:
# Write the predictions to submission.csv without the DataFrame index column.
submission.to_csv('submission.csv',index=False)