# Importing Libraries

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os 
import re

# for data cleaning
import string

# for calculating Polarity and Subjectivity
from textblob import TextBlob

# import all the necessary libraries
import warnings

#for Tokenization
import nltk

#for Wordscloud
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize

#Ignoring unnecessary warnings
warnings.filterwarnings("ignore")                   

# for stopwords Removal
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# for removing accented and special characters
import plotly.express as px
import unicodedata
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier              #for data manipulation and analysis 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler , LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix , accuracy_score , f1_score , classification_report , roc_curve , auc , roc_auc_score , zero_one_loss
from sklearn.linear_model import LinearRegression

# Importing and Reading

In [None]:
# Input CSV files, listed as: sample submission, test set, training set.
paths = [
    '../input/commonlitreadabilityprize/sample_submission.csv',
    '../input/commonlitreadabilityprize/test.csv',
    '../input/commonlitreadabilityprize/train.csv',
]

# Load every file into its own frame; the unpacking order mirrors `paths`.
submission, test, train = [pd.read_csv(csv_path) for csv_path in paths]

# Preview the first training rows.
display(train.head())

In [None]:
# Count the missing values in each column of the training frame.
display( train.isnull().sum() )

In [None]:
# Remove the 'url_legal' and 'license' columns from the training frame in place.
train.drop(columns=['url_legal', 'license'], inplace=True)

# Preview the frame after the drop.
display(train.head())

In [None]:
# Show the raw excerpt stored under index label 0.
train['excerpt'][0]

# Preprocessing

In [None]:
# English stop words from NLTK; read by preprocess() when filtering tokens.
stop_words = nltk.corpus.stopwords.words("english")

def preprocess(text):
    """Normalise a Series of raw documents into lowercase, stop-word-free text.

    Every run of characters that are not ASCII letters (digits, punctuation,
    any whitespace, accented letters) is treated as a word separator. The
    remaining words are lowercased, words found in the module-level
    ``stop_words`` collection are dropped, and the surviving tokens are
    re-joined with single spaces.

    Parameters
    ----------
    text : pandas.Series
        Raw documents, one string per row. Missing values are treated as
        empty documents.

    Returns
    -------
    pandas.Series
        Cleaned documents, aligned with the index of ``text``.
    """
    # Set membership is O(1); a list would be scanned once per token.
    stop_set = set(stop_words)

    def _drop_stop_words(document):
        # split() with no argument also discards leading/trailing/repeated
        # spaces, so no separate whitespace-collapsing pass is required.
        return ' '.join(token for token in document.split() if token not in stop_set)

    # regex=True is explicit because pandas >= 2.0 defaults to literal
    # replacement, which would silently leave punctuation and digits in place.
    letters_only = text.fillna('').str.replace(r'[^a-zA-Z]+', ' ', regex=True)

    # NOTE: the former "replace numbers with 'numbr'" step ran after all digits
    # had already been removed above, so it could never match and is omitted.

    # apply() works row by row whatever the index labels are, and returns an
    # empty Series (instead of failing) when there are no documents.
    return letters_only.str.lower().apply(_drop_stop_words)
    

In [None]:
# Clean the training excerpts and keep the result next to the raw column.
text = train['excerpt']
preprocessed_text = preprocess(text)
train['preprocessed_excerpt'] = preprocessed_text

# Print the first excerpt before and after cleaning, separated by a blank line.
print(train['excerpt'][0], '', train['preprocessed_excerpt'][0], sep='\n')


# Feature Engineering

**Help taken from https://www.kaggle.com/mohamedbakrey/eda-for-commonlit-rp-ml-predict**

TODO: explore more feature-engineering ideas (e.g. comparing counts of nouns, adjectives, etc.).

In [None]:
# Summary statistics for the numeric columns of the training frame.
train.describe()

In [None]:
# Character count of each cleaned excerpt, stored as a new feature column.
train['length'] = train['preprocessed_excerpt'].map(len)

train.head()

**Text Polarity**

Polarity expresses the sentimental orientation of an opinion. In textual data, sentiment analysis can assign a polarity to each entity, sentence, or document, and that polarity is classified as positive, negative, or neutral.

In [None]:
def get_polarity(text):
    """Return the TextBlob sentiment polarity score of ``text``.

    Parameters
    ----------
    text : str
        Document to score.

    Returns
    -------
    float
        The ``polarity`` field of ``TextBlob(text).sentiment``.
    """
    # Hand the string to TextBlob directly. On Python 3,
    # str(text.encode('utf-8')) produces the bytes repr ("b'...'"), which glues
    # "b'" onto the first word, appends a quote to the last word and turns
    # non-ASCII characters into escape sequences before they are analysed.
    return TextBlob(text).sentiment.polarity

# Score every cleaned excerpt and store the result as a feature column.
train['polarity'] = train['preprocessed_excerpt'].apply(get_polarity)
train.head()

**Text Subjectivity**

In natural language, subjectivity refers to expression of opinions, evaluations, feelings, and speculations and thus incorporates sentiment. Subjective text is further classified with sentiment or polarity.

In [None]:
# Calculate the subjectivity of each excerpt
def get_subjectivity(text):
    """Return the TextBlob sentiment subjectivity score of ``text``.

    Parameters
    ----------
    text : str
        Document to score.

    Returns
    -------
    float
        The ``subjectivity`` field of ``TextBlob(text).sentiment``.
    """
    # Hand the string to TextBlob directly. On Python 3,
    # str(text.encode('utf-8')) produces the bytes repr ("b'...'"), which
    # corrupts the first and last words before they are analysed.
    return TextBlob(text).sentiment.subjectivity

# Score every cleaned excerpt and store the result as a feature column.
train['subjectivity'] = train['preprocessed_excerpt'].apply(get_subjectivity)
train.head()

In [None]:
# Summary statistics of the three engineered features, rendered with a
# 'copper' colour gradient.
train[['length','polarity','subjectivity']].describe().style.background_gradient(cmap = 'copper')

# Training (Feature Extraction)

**Bag of Words VS TFIDF**

**1) Bag of Words**

The bag-of-words model is a simplifying representation used in natural language processing and information retrieval (IR). In this model, a text (such as a sentence or a document) is represented as the bag (multiset) of its words, disregarding grammar and even word order but keeping multiplicity.

In [None]:
# Creating bag of words
from sklearn.feature_extraction.text import CountVectorizer

# Cap the vocabulary at 30000 terms.
cv = CountVectorizer(max_features=30000)

# NOTE(review): .toarray() materialises a dense matrix of
# (number of excerpts) x (up to 30000) counts, which can be memory-heavy.
X = cv.fit_transform(train['preprocessed_excerpt']).toarray()
# Regression target column.
y = train.target

In [None]:
# Feature Scaling
# NOTE(review): the scaler is fitted on every row of X before the train/test
# split performed later in the notebook, so statistics of the hold-out rows
# leak into the training features. Consider fitting on the training split only.
sc = StandardScaler()
X = sc.fit_transform(X)

In [None]:
# Splitting the dataset into the Training set and Test set
# 25% of the rows are held out; random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Shapes of the training and hold-out feature matrices.
print(X_train.shape , X_test.shape)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
import xgboost as xgb

In [None]:
# Regressors compared on the bag-of-words features, keyed by display name.
# Candidates that are currently disabled:
#   'RandomForestRegressor' : RandomForestRegressor(criterion='mse')
#   'GradientBoosting'      : GradientBoostingRegressor()
#   'CBR'                   : cb.CatBoostRegressor()
dic_models = {
    'LightGBM': lgb.LGBMRegressor(),
    'XGradientBoosting': xgb.XGBRegressor(),
}

for name, regressor in dic_models.items():
    print('Training with ' + name + ' model. \n')
    model = regressor.fit(X_train, y_train)

    # Predicting on the hold-out split
    print('Predicting with ' + name + ' model. \n')
    pred = model.predict(X_test)

    # score() on a regressor is the coefficient of determination (R2), not an
    # accuracy, so it is labelled the same way as in the TF-IDF comparison.
    print("R2 score of " + name + " Model is ", model.score(X_test, y_test))
    print("RMSE of " + name + " Model is ", np.sqrt(mean_squared_error(y_test, pred)))
    print("------------------------------------------------------------------")
    print()

In [None]:
# Fit an ordinary linear regression on the bag-of-words training split.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Report R2 on both splits. The estimator is LinearRegression, so the output
# is labelled accordingly (it was previously printed as "LogisticRegression").
print('LinearRegression Train R2 score is : ', lin_reg.score(X_train, y_train))
print('LinearRegression Test R2 score is : ', lin_reg.score(X_test, y_test))

**Note:** `score()` on these regressors returns R² (the coefficient of determination), not accuracy.

**2) TFIDF**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with default settings.
tfidf_vectorizer = TfidfVectorizer()

# TF-IDF feature matrix: learns the vocabulary and IDF weights from the
# cleaned training excerpts and transforms them in one step.
tfidf = tfidf_vectorizer.fit_transform(train['preprocessed_excerpt'] )

In [None]:
# Dense DataFrame view of the TF-IDF matrix, displayed for inspection.
matrix = pd.DataFrame(tfidf.toarray())
matrix

In [None]:
# Fixing random_state makes the split reproducible: without it, every run
# draws a new random split and the train and test sets differ each time.
# NOTE(review): this rebinds X, y_train and y_test, replacing the values
# produced by the earlier bag-of-words split (which used random_state=0).
X = tfidf
y = train.target
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X, y,random_state=42, test_size=0.25)

print("Training split input- ", X_train_tfidf.shape)   # shape of the training features
print("Testing split input- ", X_test_tfidf.shape)

print("\n\nY : Training split input- ", y_train.shape) # shape of the training targets
print("Y : Testing split input- ", y_test.shape)


In [None]:
# Regressors compared on the TF-IDF features, keyed by display name.
models_2 = {
    #'RandomForestRegressor' : RandomForestRegressor(),
    'GradientBoosting': GradientBoostingRegressor(),
    'LightGBM': lgb.LGBMRegressor(),
    'XGradientBoosting': xgb.XGBRegressor(),
}

for name, regressor in models_2.items():
    print('Training with ' + name + ' model. \n')
    model = regressor.fit(X_train_tfidf, y_train)

    # Predict on the hold-out split.
    print('Predicting with ' + name + ' model. \n')
    pred = model.predict(X_test_tfidf)

    # Report R2 (from score()) and RMSE on the hold-out split.
    r2 = model.score(X_test_tfidf, y_test)
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    print("R2 score of " + name + " Model is ", r2)
    print("RMSE of " + name + " Model is ", rmse)
    print("------------------------------------------------------------------")
    print()

In [None]:
# Fit an ordinary linear regression on the TF-IDF training split.
lin_reg = LinearRegression()
lin_reg.fit(X_train_tfidf, y_train)

pred = lin_reg.predict(X_test_tfidf)

# Report R2 on both splits and RMSE on the hold-out split. The estimator is
# LinearRegression, so the output is labelled accordingly (it was previously
# printed as "LogisticRegression" / "LOGREG").
print('LinearRegression Train R2 score is : ', lin_reg.score(X_train_tfidf, y_train))
print('LinearRegression Test R2 score is : ', lin_reg.score(X_test_tfidf, y_test))
print("RMSE of LinearRegression Model is ", np.sqrt(mean_squared_error(y_test, pred)))

**Train with DLP Roberta https://www.kaggle.com/riadalmadani/finetune-roberta-5-fold**

# Data Visualization

In [None]:
# Check the relation between polarity and subjectivity

plt.figure(figsize=(12,6))
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally (the first positional argument is `data`).
sns.scatterplot(x=train['polarity'], y=train['subjectivity'])
plt.title('Polarity vs Subjectivity')
plt.show()

In [None]:
# Word cloud of the cleaned training vocabulary

# Note: this rebinds `cv`, replacing the vectorizer created for the
# bag-of-words features.
cv = CountVectorizer(stop_words='english')
words = cv.fit_transform(train['preprocessed_excerpt'])

# Total count of every vocabulary term across all excerpts (a 1 x n_terms matrix).
sum_words = words.sum(axis=0)

# (term, total count) pairs, most frequent first.
words_freq = sorted(
    ((word, sum_words[0, idx]) for word, idx in cv.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

cloud_canvas = WordCloud(background_color='lightcyan', width=2000, height=2000)
wordcloud = cloud_canvas.generate_from_frequencies(dict(words_freq))

plt.style.use('fivethirtyeight')
plt.figure(figsize=(10, 10))
plt.axis('off')
plt.imshow(wordcloud)
plt.title("Vocabulary from Reviews", fontsize=20)
plt.show()

# Testing and Submission

In [None]:
# Train linear regression on the complete data (no hold-out split).
# NOTE(review): X and y are whatever was bound last; if the cells ran in
# order, X is the TF-IDF matrix assigned in the TF-IDF split cell - confirm.

# Fit linear regression to the data set
lin_reg = LinearRegression()
lin_reg.fit(X ,y)


In [None]:
# Display the feature matrix that the final model was fitted on.
X

In [None]:
# Preview the first rows of the test frame.
test.head()

**Preprocessing test data**

In [None]:
# Clean the test excerpts with the same preprocess() used for training.
text = test['excerpt']
preprocessed_text = preprocess(text)
test['preprocessed_excerpt'] = preprocessed_text

# Print the first test excerpt before and after cleaning, separated by a blank line.
print(test['excerpt'][0], '', test['preprocessed_excerpt'][0], sep='\n')


In [None]:
import collections
import copy

# Snapshot of the vocabulary learned from the training excerpts (kept so any
# code that refers to this name keeps working).
corpus_vocabulary = collections.defaultdict(None, copy.deepcopy(tfidf_vectorizer.vocabulary_))

# Reuse the vectorizer that was fitted on the training corpus. Fitting a new
# TfidfVectorizer on the test excerpts (even with the training vocabulary)
# re-estimates the IDF weights from the test data, so the test features would
# be on a different scale from the ones the model was trained on.
tfidf_transformer_query_sec = tfidf_vectorizer

# transform() only applies the already-learned vocabulary and IDF weights.
query_tfidf_matrix = tfidf_transformer_query_sec.transform(test['preprocessed_excerpt'])

In [None]:
# Predict the target for every test excerpt with the fitted linear regression.
pred = lin_reg.predict(query_tfidf_matrix)

In [None]:
# Inspect the first five predictions.
pred[:5]

In [None]:
# Overwrite the target column of the sample submission with the predictions
# and write the file without the index column.
submission.target = pred
submission.to_csv("submission.csv", index=False)

**Train using DLP as done in https://www.kaggle.com/riadalmadani/finetune-roberta-5-fold and https://www.kaggle.com/omkargangan/commonlit-readability-competition**