In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# !pip install chart_studio
# !pip install textstat

import numpy as np 
import pandas as pd 

# text processing libraries
import re
import string
import nltk
from nltk.corpus import stopwords


# Visualisation libraries
import matplotlib.pyplot as plt
import plotly.graph_objs as go
# import chart_studio.plotly as py
import plotly.figure_factory as ff
from plotly.offline import iplot
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')


# sklearn 
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# File system management
import os

# Pytorch
import torch

#Transformers
from transformers import BertTokenizer

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')


# **Loading the Data**

In [None]:
# Read the competition's train and test splits from the Kaggle input directory
train, test = (
    pd.read_csv("../input/commonlitreadabilityprize/train.csv"),
    pd.read_csv("../input/commonlitreadabilityprize/test.csv"),
)

# Report (rows, columns) for each split
print("Training data shape.. ", train.shape)
print("Testing data shape.. ", test.shape)

# Preview the leading rows: training split first, then the test split
train.head()
test.head()

## Checking for missing values

In [None]:
# Per-column count of missing values: training set first, then the test set
train.isna().sum()
test.isna().sum()

## Utility functions

In [None]:
# text preprocessing helper functions

def clean_text(text):
    '''Normalise raw text before tokenisation.

    The steps are applied in this order: lowercase the text, drop anything
    inside square brackets, drop URLs, drop HTML-style tags, drop ASCII
    punctuation, turn newlines into spaces, and drop every word that
    contains a digit.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so removed spans can
        leave consecutive spaces behind.
    '''
    text = text.lower()
    # Raw strings keep the regex escapes from being read as (invalid)
    # Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Newlines become a space rather than nothing; otherwise the last word of
    # one line is fused with the first word of the next ("end.\nstart" ->
    # "endstart") once the punctuation between them has been stripped.
    text = text.replace('\n', ' ')
    text = re.sub(r'\w*\d\w*', '', text)
    return text


def text_preprocessing(text):
    """
    Clean a piece of text and rebuild it from its word tokens.

    The text is passed through ``clean_text``, split into runs of word
    characters with an NLTK ``RegexpTokenizer``, and the tokens are joined
    back together with single spaces. Stop words are kept.
    """
    word_tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(clean_text(text))
    return ' '.join(word_tokens)

### Text preprocessing using the utility function

In [None]:
# Add a cleaned copy of every excerpt to both the training and the test frame
train['excerpt_clean'] = train['excerpt'].apply(str).map(text_preprocessing)
test['excerpt_clean'] = test['excerpt'].apply(str).map(text_preprocessing)

In [None]:
# Size of each cleaned excerpt: characters (excerpt_len) and
# whitespace-separated words (excerpt_count)
train['excerpt_len'] = train['excerpt_clean'].astype(str).map(len)
train['excerpt_count'] = train['excerpt_clean'].map(lambda cleaned: len(str(cleaned).split()))

In [None]:
# Character count of the raw (uncleaned) excerpt
train['Character Count'] = train['excerpt'].map(str).map(len)

In [None]:
train.head()

# Distribution of Excerpt length

In [None]:
# Histogram of the character length of the cleaned excerpts
train['excerpt_len'].iplot(
    kind='hist', bins=100,
    title='Excerpt text Length Distribution',
    xTitle='text length', yTitle='count',
    color='red', linecolor='black',
)

In [None]:
# Box trace over the character length of each cleaned excerpt
trace0 = go.Box(name='Text', y=train['excerpt_len'], marker={'color': 'red'})

In [None]:
# Assemble the box plot from the single trace and render it offline
layout = go.Layout(title="Length of the text")
data = [trace0]
fig = go.Figure(layout=layout, data=data)
iplot(fig, filename="Length of the text of different polarities")

## Distribution of excerpt count

In [None]:
# Histogram of the number of words per cleaned excerpt.
# The x axis shows a word count (excerpt_count), not a text length, so it is
# labelled accordingly.
train['excerpt_count'].iplot(
    kind='hist',
    bins=50,
    xTitle='word count',
    linecolor='black',
    color='green',
    yTitle='count',
    title='Excerpt text word count')

##   List the top n words in a vocabulary according to occurrence in a text corpus

In [None]:
#source of code : https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d
def get_top_n_words(corpus, n=None):
    """
    Rank the words of a corpus by how often they occur in it.

    English stop words are excluded by the ``CountVectorizer``.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count words in.
    n : int, optional
        Number of entries to return; ``None`` returns the full ranking.

    Returns
    -------
    list of (str, int) tuples
        ``(word, total count)`` pairs, most frequent first.
    """
    vectorizer = CountVectorizer(stop_words='english').fit(corpus)
    # Column sums of the document-term matrix = corpus-wide count per word
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = [(token, totals[0, column]) for token, column in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

# Unigram

In [None]:
# Horizontal bar chart of the 20 most frequent unigrams
unigrams = get_top_n_words(train['excerpt_clean'], 20)
df1 = pd.DataFrame(unigrams, columns=['Text', 'count'])
(
    df1.groupby('Text')['count']
    .sum()
    .sort_values(ascending=True)
    .iplot(kind='bar', orientation='h', title='Top 20 Unigrams in excerpt text',
           yTitle='Count', color='red', linecolor='black')
)

In [None]:
#Distribution of top Bigrams
def get_top_n_gram(corpus,ngram_range,n=None):
    """
    Rank the n-grams of a corpus by how often they occur in it.

    English stop words are excluded by the ``CountVectorizer``.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count n-grams in.
    ngram_range : tuple of (int, int)
        Passed to ``CountVectorizer``, e.g. ``(2, 2)`` for bigrams only.
    n : int, optional
        Number of entries to return; ``None`` returns the full ranking.

    Returns
    -------
    list of (str, int) tuples
        ``(n-gram, total count)`` pairs, most frequent first.
    """
    counter = CountVectorizer(ngram_range=ngram_range, stop_words='english')
    counter.fit(corpus)
    # Summing the document-term matrix over rows gives one total per n-gram
    column_totals = counter.transform(corpus).sum(axis=0)
    freq_pairs = sorted(
        ((gram, column_totals[0, column]) for gram, column in counter.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return freq_pairs[:n]

# Bigrams

In [None]:
bigrams = get_top_n_gram(train['excerpt_clean'],(2,2),20)

In [None]:
for word,freq in bigrams:
    print(word,freq)

In [None]:
# Horizontal bar chart of the 20 most frequent bigrams
df1 = pd.DataFrame(bigrams, columns=['Text', 'count'])
(
    df1.groupby('Text')['count']
    .sum()
    .sort_values(ascending=True)
    .iplot(kind='bar', orientation='h', title='Top 20 Bigrams in excerpt text',
           yTitle='Count', color='blue', linecolor='black')
)

# Trigrams

In [None]:
# Horizontal bar chart of the 20 most frequent trigrams
trigrams = get_top_n_gram(train['excerpt_clean'], (3, 3), 20)
df2 = pd.DataFrame(trigrams, columns=['Text', 'count'])
(
    df2.groupby("Text")['count']
    .sum()
    .sort_values(ascending=True)
    .iplot(kind='bar', orientation='h', title='Top 20 Trigrams Text',
           yTitle='Count', color='orange', linecolor='black')
)

In [None]:
tc = train['excerpt_clean']

In [None]:
from wordcloud import WordCloud

# Word cloud built from all cleaned excerpts joined into one string
wordcloud1 = WordCloud(width=600, height=400, background_color='white').generate(" ".join(tc))

fig, ax1 = plt.subplots(1, 1, figsize=[30, 15])
ax1.imshow(wordcloud1)
ax1.axis('off')
ax1.set_title('excerpt clean text', fontsize=40);


# Distribution of the target variable

In [None]:
# Histogram of the regression target. No histnorm is requested, so the bars
# are raw counts and the y axis is labelled 'count' rather than 'Density'.
train['target'].iplot(kind='hist', xTitle='Target', yTitle='count', linecolor='black', color='blue',
                      title='Distribution of the target variable')

# Distribution of Standard error

In [None]:
# Histogram of the standard-error column. No histnorm is requested, so the
# bars are raw counts and the y axis is labelled 'count' rather than 'Density'.
train['standard_error'].iplot(kind='hist', xTitle='standard_error', yTitle='count', linecolor='black',
                              color='blue', title='Distribution of standard error')

# Building the baseline model

In [None]:
# Character-level TF-IDF over 1- to 6-grams of the cleaned training excerpts
vectorizer = TfidfVectorizer(ngram_range=(1, 6), analyzer='char')
X = vectorizer.fit_transform(train['excerpt_clean'])
X.shape

In [None]:
%%time
model = LinearSVR(random_state=42)
scores = cross_val_score(model, X, train.target, cv=5,scoring='neg_root_mean_squared_error')
scores *=-1
scores.mean()

In [None]:
%%time
model.fit(X,train.target)

In [None]:
# sub = pd.read_csv('../input/commonlitreadabilityprize/test.csv',index_col='id')
# x = vectorizer.transform(sub.excerpt)
# p = model.predict(x)
# sub['target'] = p
# sub[['target']].to_csv('submission.csv')

# Now we will use LightGBM

In [None]:
from time import time
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [None]:
tfidf = TfidfVectorizer(sublinear_tf=True, norm='l2',ngram_range=(1,1))
features = tfidf.fit_transform(train.excerpt_clean).toarray()
features.shape

In [None]:
features_test = tfidf.transform(test.excerpt_clean).toarray()

In [None]:
# 5-fold cross-validated LightGBM regressor on the word-level TF-IDF features.
# Each fold trains with early stopping on its validation split, reports the
# validation RMSE, and contributes 1/n_splits of the final test prediction.
#
# Early stopping is requested through the callback API: the
# `early_stopping_rounds=` / `verbose=` keyword arguments of `fit` were removed
# in LightGBM 4.0. NOTE(review): the callback form assumes LightGBM >= 3.3.
from lightgbm import early_stopping

params = {'metric': 'rmse','random_state': 48,'n_estimators': 20000,'reg_alpha': 0.0010819683712588644,
          'reg_lambda': 0.004760428916800031, 'colsample_bytree': 0.4, 'subsample': 0.8, 'learning_rate': 0.01,
          'max_depth': 100, 'num_leaves': 39, 'min_child_samples': 12, 'cat_smooth': 67}
preds = np.zeros(test.shape[0])
kf = KFold(n_splits=5,random_state=48,shuffle=True)
rmse=[]  # list contains rmse for each fold
for fold, (trn_idx, val_idx) in enumerate(kf.split(features, train['target']), start=1):
    X_tr, X_val = features[trn_idx], features[val_idx]
    y_tr, y_val = train['target'].iloc[trn_idx], train['target'].iloc[val_idx]
    model = LGBMRegressor(**params)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[early_stopping(stopping_rounds=100, verbose=False)],
    )
    # Average the test predictions over the folds
    preds += model.predict(features_test) / kf.n_splits
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
    # `squared` argument was removed from scikit-learn in 1.6.
    fold_rmse = np.sqrt(mean_squared_error(y_val, model.predict(X_val)))
    rmse.append(fold_rmse)
    print(fold, fold_rmse)

In [None]:
import seaborn as sns
# Prediction distribution: kernel density estimate of the averaged test predictions.
# `fill=True` replaces the `shade=True` argument, which newer seaborn releases no longer accept.
plt.figure(figsize=(10,4))
sns.kdeplot(preds, fill=True)
plt.xlabel('predicted target')
plt.title('Distribution of test predictions')
plt.show()

In [None]:
# Build the submission: the test rows indexed by id, with the averaged
# predictions attached as `target`; only that column is written out.
sub = pd.read_csv('../input/commonlitreadabilityprize/test.csv', index_col='id').assign(target=preds)
sub[['target']].to_csv('submission.csv')

In [None]:
# pip install sentence-transformers

In [None]:
# loading model using sentence transformers

# import sentence_transformers
# from sentence_transformers import SentenceTransformer, models

In [None]:
# setting model path for fine-tuned roberta weights

# model_path = '../input/finetuned-model1/checkpoint-568'
# word_embedding_model = models.Transformer(model_path, max_seq_length=275)
# pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
# model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [None]:
# test.head()

In [None]:
# encoding train and test strings

# X_train = model.encode(train.excerpt, device='cuda')
# X_test = model.encode(test.excerpt, device='cuda')

In [None]:
# from sklearn.model_selection import StratifiedKFold
# from datetime import datetime
# from sklearn.metrics import mean_squared_error
# from sklearn.linear_model import BayesianRidge

# preds = []
# train_scores = []

# df_oof=train.copy()
# df_oof['oof'] = 0

# skf = StratifiedKFold(10, shuffle=True, random_state=42)

# splits = list(skf.split(X=X_train, y=train['Character Count']))

# # predicting out of fold scores for each fold and doing predictions for each training set

# for i, (train_idx, val_idx) in enumerate(splits):
#     print(f'\n------------- Training Fold {i + 1} / {10}')
#     print("Current Time =", datetime.now().strftime("%H:%M:%S"))

#     clf = BayesianRidge(n_iter=300, verbose=True)
#     clf.fit(X_train[train_idx],train.target[train_idx])
#     train_score=mean_squared_error(train.target[train_idx], clf.predict(X_train[train_idx]), squared=False)
#     train_scores.append(train_score)
#     print(f"Fold {i} train RMSE: {train_score}")
    
    
#     preds.append(clf.predict(X_test))
#     x=clf.predict(X_train[val_idx])
#     df_oof['oof'].iloc[val_idx]+= x

# print(f'Training score: {np.mean(train_scores)}, Training STD: {np.std(train_scores)}')
# print(f'OOF score across folds: {mean_squared_error(df_oof.target, df_oof.oof, squared=False)}')

In [None]:
# getting mean prediction across all folds
# y_pred = np.mean(preds,0)
# y_pred.shape

In [None]:
# sub = test[["id"]].copy()
# sub["target"] = y_pred
# sub.to_csv('submission.csv', index = False)

In [None]:
# checking submission file

sub.head()