![](https://storage.googleapis.com/kaggle-competitions/kaggle/25914/logos/header.png)

This notebook mainly covers basic exploration of the data and a baseline linear model. Since the test set is very small in this case, no analysis was performed on it.

At the end of the notebook please feel free to share your thoughts and ideas along with feedback for the overall kernel

Following points are covered in this notebook:

i) The EDA part

ii) Two classes, easy and difficult, have been created based on the readability score, and EDA has been done on them to keep things simple. Baseline classification models have also been developed for comparison

iii) Finally, a regression model has been created to predict the actual readability scores (to be added)

#### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
color = sns.color_palette()
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
import plotly.tools as tls
import warnings
warnings.filterwarnings('ignore')
import os
os.listdir("../input/commonlitreadabilityprize")
from nltk.corpus import stopwords
import string
eng_stopwords = set(stopwords.words("english"))
pd.options.mode.chained_assignment = None

In [None]:
df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
df.head()

#### Simple understanding of the Data

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
round((df.isnull().sum()/df.shape[0])*100, 2) # % of null values in each column

In [None]:
# Removing unnecessary columns
# Rebind instead of mutating in place and ignore already-missing columns, so
# that re-running this cell does not raise a KeyError.
df = df.drop(columns=['url_legal', 'license'], errors='ignore')

#### EDA

In [None]:
from wordcloud import WordCloud, STOPWORDS
stopwords = set(STOPWORDS)

def show_wordcloud(data, title = None):
    """Render a word cloud for ``data`` with matplotlib.

    Parameters
    ----------
    data : str or iterable
        A single string is used as-is. Any other iterable (e.g. a pandas
        Series of excerpts) has its items converted with ``str`` and joined
        with spaces. Non-iterable objects fall back to ``str(data)``.
    title : str, optional
        Figure title; when given, the subplot layout is adjusted as well.

    Notes
    -----
    Calling ``str()`` directly on a pandas Series yields its *display repr*
    (truncated rows, ``...`` and a ``Name:/dtype:`` footer), so the cloud would
    be built from a handful of rows only. Joining the items uses all the text.
    """
    if isinstance(data, str):
        text = data
    else:
        try:
            text = " ".join(str(item) for item in data)
        except TypeError:
            # Not iterable: keep the previous behaviour.
            text = str(data)

    wordcloud = WordCloud(
        background_color='black',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40,
        scale=3,
        random_state=1,  # fixed seed so the layout is reproducible
    ).generate(text)

    fig = plt.figure(1, figsize=(15, 15))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

show_wordcloud(df['excerpt'])

##### Checking distribution of target variable

In [None]:
import plotly.express as px
fig = px.histogram(df, x="target")
fig.show()

In [None]:
df['excerpt_length']=df['excerpt'].apply(len)

##### Check length of passage with level of ease of reading passages

In [None]:
# Create the easy-readability flag: 0 (difficult) when target <= 0,
# 1 (easy) when target > 0.
df['is_easy_excerpt'] = np.where(df['target'] <= 0, 0, 1)

sns.set(font_scale=1.0)

# `size` was renamed to `height` in seaborn 0.9 and later removed, so the old
# keyword fails on current seaborn releases.
g = sns.FacetGrid(df, col='is_easy_excerpt', height=5)
g.map(plt.hist, 'excerpt_length')

In [None]:
# Number of excerpts per readability flag (flag values cast to strings).
cnt_srs = df['is_easy_excerpt'].astype(str).value_counts().head()

# Reverse the count-sorted order for both axes so labels and bars stay aligned.
flag_labels = cnt_srs.index[::-1]
flag_counts = cnt_srs.values[::-1]

# Bars are coloured by their own height on a reversed colour scale.
bar_marker = dict(color=flag_counts, colorscale='agsunset', reversescale=True)
trace = go.Bar(x=flag_labels, y=flag_counts, orientation='v', marker=bar_marker)

layout = dict(title='Easy v Difficult Passages distribution')
data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="Readability")

0 corresponds to difficult excerpts (target <= 0) and 1 to easy excerpts (target > 0)

#### Creating Meta Features

In [None]:
# Hand-crafted per-excerpt text statistics; each one is stored as a new column
# of `df`. Words are whitespace-separated tokens (plain `str.split()`).

## Number of words in the text ##
df["num_words"] = df["excerpt"].apply(lambda x: len(str(x).split()))

## Number of unique words in the text (case-sensitive, punctuation kept) ##
df["num_unique_words"] = df["excerpt"].apply(lambda x: len(set(str(x).split())))

## Number of characters in the text ##
df["num_chars"] = df["excerpt"].apply(lambda x: len(str(x)))

## Number of stopwords in the text (lower-cased tokens found in the NLTK English list) ##
df["num_stopwords"] = df["excerpt"].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))

## Number of punctuation characters in the text (members of string.punctuation) ##
df["num_punctuations"] =df['excerpt'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )

## Number of upper case words in the text ##
df["num_words_upper"] = df["excerpt"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))

## Number of title case words in the text ##
df["num_words_title"] = df["excerpt"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))

## Average length of the words in the text ##
df["mean_word_len"] = df["excerpt"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

#### No. of words by ease of reading

In [None]:
# Distribution of word counts for difficult (0) vs easy (1) excerpts.
fig, ax = plt.subplots(figsize=(12, 8))
sns.violinplot(x='is_easy_excerpt', y='num_words', data=df, ax=ax)
ax.set_xlabel('Passage Easy to Read', fontsize=12)
ax.set_ylabel('Number of words in text', fontsize=12)
ax.set_title("Number of words by ease in reading", fontsize=15)
plt.show()

#### No. of punctuations by ease of reading

In [None]:
# Distribution of punctuation counts for difficult (0) vs easy (1) excerpts.
plt.figure(figsize=(12,8))
sns.violinplot(x='is_easy_excerpt', y='num_punctuations', data=df)
plt.xlabel('Passage Easy to Read', fontsize=12)
# Axis label typo fixed ("puntuations" -> "punctuations").
plt.ylabel('Number of punctuations in text', fontsize=12)
plt.title("Number of punctuations by ease in reading", fontsize=15)
plt.show()

#### Exploration w.r.t. Standard Error

In [None]:
import plotly.graph_objects as go

# One histogram of `standard_error` per readability class.
fig = go.Figure()
fig.add_trace(go.Histogram(x=df[df['is_easy_excerpt'] == 1]['standard_error'], name='Easy Excerpt'))
fig.add_trace(go.Histogram(x=df[df['is_easy_excerpt'] == 0]['standard_error'], name='Tough Excerpt'))

# Overlay both histograms. The previous 'stack' mode piled the bars on top of
# each other, which contradicts the overlay + reduced-opacity intent below.
fig.update_layout(barmode='overlay', title_text='Standard Error Comparison between Easy and Difficult Excerpt',
    xaxis_title_text='Standard Error Distribution',
    yaxis_title_text='Count')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()

#### Modeling Part for Binary Classification

In [None]:
df.head(2)

#### Creating Tf-Idf vectors and EDA on the excerpt text

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Integer id per class (order of first appearance) plus lookup tables both ways.
df['excerpt_id'] = df['is_easy_excerpt'].factorize()[0]
category_id_df = df[['is_easy_excerpt', 'excerpt_id']].drop_duplicates().sort_values('excerpt_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['excerpt_id', 'is_easy_excerpt']].values)

# Unigram + bigram tf-idf with sublinear tf scaling and L2-normalised rows.
tfidf = TfidfVectorizer(sublinear_tf=True,
                        norm='l2',
                        encoding='latin-1',
                        ngram_range=(1, 2),
                        stop_words='english')

features = tfidf.fit_transform(df.excerpt)
labels = df.is_easy_excerpt
# A bare expression in the middle of a cell is not displayed; print it instead.
print(features.shape)

from sklearn.feature_selection import chi2
import numpy as np

N = 2  # number of top unigrams / bigrams reported per class

# `get_feature_names` was removed in scikit-learn 1.2; prefer the replacement
# and fall back for old installations. The vocabulary is loop-invariant, so it
# is built once.
if hasattr(tfidf, 'get_feature_names_out'):
    vocabulary = np.asarray(tfidf.get_feature_names_out())
else:
    vocabulary = np.asarray(tfidf.get_feature_names())

for Product, category_id in sorted(category_to_id.items()):
    # `labels` holds the raw flag values, so compare against the flag itself
    # (`Product`), not its factorized id: the two differ whenever the first
    # row of the frame belongs to class 1.
    features_chi2 = chi2(features, labels == Product)
    indices = np.argsort(features_chi2[0])
    feature_names = vocabulary[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(Product))
    print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:])))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import cross_val_score


models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, 
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()

In [None]:
# Rebinds `tfidf`: from here on the name refers to a new, unfitted vectorizer
# over 3- to 6-word n-grams instead of the (1, 2)-gram one defined earlier.
# The other settings (sublinear tf, L2 norm, English stop words) are unchanged.
tfidf = TfidfVectorizer(sublinear_tf=True, 
                        norm='l2', 
                        encoding='latin-1', 
                        ngram_range=(3, 6), 
                        stop_words='english')

In [None]:
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
test.head()

#### Support Vector Regression Model (RBF kernel)

In [None]:
# Fit the tf-idf vocabulary on the training excerpts and reuse it for test.
features_train = tfidf.fit_transform(df.excerpt)
features_test = tfidf.transform(test.excerpt)

# NOTE: `labels` is rebound here from the binary flag to the continuous target.
labels = df['target']

from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

model = SVR(C=10, kernel='rbf', gamma='auto')

# Hold out 25% of the training rows for validation.
X_train, X_test, y_train, y_test = train_test_split(features_train, labels, test_size=0.25, random_state=0)
model.fit(X_train, y_train)
y_repeat = model.predict(X_train)        # predictions on the rows the model was fitted on
y_val = model.predict(X_test)            # predictions on the held-out rows
y_preds = model.predict(features_test)   # predictions for the competition test set

from sklearn.metrics import mean_squared_error

# The first figure is the *training* error (it was previously mislabelled as
# validation). Arguments follow the documented (y_true, y_pred) order.
print('The RMSE for training data is:', np.sqrt(mean_squared_error(y_train, y_repeat)))
print('The RMSE for validation data is:', np.sqrt(mean_squared_error(y_test, y_val)))

# Compare the two values: a training RMSE far below the validation RMSE
# indicates overfitting. Either way the model needs a lot of improvement.

# test['target'] = y_preds
# test[['id', 'target']].to_csv('submission.csv', index=False)

The development part is inspired from this notebook: https://www.kaggle.com/duttadebadri/clrp-roberta-linear-svc/data

In [None]:
from sklearn.svm import SVR
import random
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold
from tqdm import tqdm

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim.optimizer import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import (CosineAnnealingWarmRestarts, CosineAnnealingLR, 
                                      ReduceLROnPlateau)

from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification,get_constant_schedule_with_warmup)

# Competition files: training excerpts, test excerpts and the submission template.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# Continuous regression target as a plain array.
target = train_data['target'].to_numpy()

# Bucket the target into floor(1 + log2(n_rows)) equal-width bins; the bin
# index is what the stratified k-fold split uses later on.
num_bins = int(np.floor(1 + np.log2(len(train_data))))
train_data['bins'] = pd.cut(train_data['target'], bins=num_bins, labels=False)
bins = train_data['bins'].to_numpy()

def rmse_score(y_true,y_pred):
    """Return the root of scikit-learn's mean squared error for the two inputs."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

config = {
    'batch_size':32,
    'max_len':512,
    'seed':23,
}

def seed_everything(seed=23):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Parameters
    ----------
    seed : int, default 23
        Seed applied to ``random``, ``numpy.random`` and ``torch`` (CPU and
        every CUDA device).
    """
    random.seed(seed)
    # The variable Python reads is PYTHONHASHSEED ('PYTHONASSEED' was a typo).
    # It only affects interpreters started after this point (e.g. worker
    # processes), not the hashing of the already-running process.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # cover multi-GPU setups as well
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick algorithms per run, which
    # works against the deterministic setting above; turn it off.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=config['seed'])

class CLRPDataset(Dataset):
    """Map-style dataset that tokenizes one excerpt per index.

    This is a data container, not a network layer, so it derives from
    ``torch.utils.data.Dataset`` rather than ``nn.Module`` (whose initializer
    was never called by the previous version).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``excerpt`` column.
    tokenizer : callable
        Called as ``tokenizer(text, return_tensors='pt', max_length=...,
        padding='max_length', truncation=True)``.
    max_len : int, default 128
        Passed to the tokenizer as ``max_length``.
    """
    def __init__(self,df,tokenizer,max_len=128):
        super().__init__()
        self.excerpt = df['excerpt'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self,idx):
        """Return the tokenizer output for the excerpt at position ``idx``."""
        encode = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)
        return encode

    def __len__(self):
        """Number of excerpts in the dataset."""
        return len(self.excerpt)
    
def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Embed every excerpt of ``df`` with the pretrained model stored at ``path``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``excerpt`` column (consumed by ``CLRPDataset``).
    path : str
        Directory or identifier accepted by ``AutoModel.from_pretrained`` and
        ``AutoTokenizer.from_pretrained``.
    plot_losses : bool, default True
        Unused; kept so existing calls keep working.
    verbose : bool, default True
        When true, print the selected device and show a progress bar.

    Returns
    -------
    numpy.ndarray
        One row per excerpt, in input order: position 0 along the second axis
        of the model's first output (presumably the first-token hidden state;
        confirm for the checkpoint in use).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if verbose:
        print(f"{device} is used")

    MODEL_PATH = path
    model = AutoModel.from_pretrained(MODEL_PATH)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model.to(device)
    model.eval()

    ds = CLRPDataset(df,tokenizer,config['max_len'])
    dl = DataLoader(ds,
                  batch_size = config["batch_size"],
                  shuffle=False,      # keep rows aligned with `df`
                  num_workers = 4,
                  pin_memory=True,
                  drop_last=False
                 )

    embeddings = list()
    with torch.no_grad():
        # Wrap the loader itself (not `enumerate(...)`) so tqdm can read its
        # length and show a total; the loop index was never used.
        for inputs in tqdm(dl, disable=not verbose):
            # Flatten everything after the batch axis, then move to the device.
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            outputs = outputs[0][:,0].detach().cpu().numpy()
            embeddings.extend(outputs)
    return np.array(embeddings)

# The five fine-tuned checkpoints whose embeddings feed the SVR ensemble.
embedding_model_dirs = (
    '../input/modelf1',
    '../input/modelf2',
    '../input/modelf3',
    '../input/modelf4',
    '../input/modelf5',
)

# For each checkpoint, embed the training set first and then the test set.
embedding_pairs = [
    (get_embeddings(train_data, model_dir), get_embeddings(test_data, model_dir))
    for model_dir in embedding_model_dirs
]

# Unpack into the individual names used by the cells that follow.
(
    (train_embeddings1, test_embeddings1),
    (train_embeddings2, test_embeddings2),
    (train_embeddings3, test_embeddings3),
    (train_embeddings4, test_embeddings4),
    (train_embeddings5, test_embeddings5),
) = embedding_pairs

def get_preds_svm(X,y,X_test,bins=bins,nfolds=5,C=10,kernel='rbf'):
    """Cross-validate an SVR on ``X`` and return fold-averaged test predictions.

    Parameters
    ----------
    X : numpy.ndarray
        Training features, indexed by row.
    y : numpy.ndarray
        Training targets aligned with ``X``.
    X_test : numpy.ndarray
        Features to predict for.
    bins : array-like
        Labels used to stratify the folds. The default is the module-level
        ``bins`` as it was when this function was defined.
    nfolds : int, default 5
        Number of stratified folds.
    C : float, default 10
        SVR regularisation parameter.
    kernel : str, default 'rbf'
        SVR kernel.

    Returns
    -------
    numpy.ndarray
        Mean over the folds of each fold model's predictions for ``X_test``.

    Notes
    -----
    ``C`` and ``kernel`` used to be accepted but silently ignored (the model
    was built as ``SVR()``, i.e. with scikit-learn's defaults). They are now
    passed to the estimator, so scores differ from runs of the old version.
    """
    kfold = StratifiedKFold(n_splits=nfolds)
    scores = list()
    preds = np.zeros((X_test.shape[0]))
    for k, (train_idx,valid_idx) in enumerate(kfold.split(X,bins)):
        model = SVR(C=C, kernel=kernel)
        X_train,y_train = X[train_idx], y[train_idx]
        X_valid,y_valid = X[valid_idx], y[valid_idx]

        model.fit(X_train,y_train)
        prediction = model.predict(X_valid)
        # rmse_score is declared as (y_true, y_pred); pass them in that order.
        score = rmse_score(y_valid,prediction)
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)
        # Accumulate; divided by nfolds on return.
        preds += model.predict(X_test)

    print("mean rmse",np.mean(scores))
    return np.array(preds)/nfolds

# Train/test embedding pairs, one per checkpoint, in checkpoint order.
embedding_sets = [
    (train_embeddings1, test_embeddings1),
    (train_embeddings2, test_embeddings2),
    (train_embeddings3, test_embeddings3),
    (train_embeddings4, test_embeddings4),
    (train_embeddings5, test_embeddings5),
]

# One fold-averaged SVR prediction vector per checkpoint.
svm_preds1, svm_preds2, svm_preds3, svm_preds4, svm_preds5 = [
    get_preds_svm(train_emb, target, test_emb)
    for train_emb, test_emb in embedding_sets
]

# Simple unweighted average across the five checkpoints.
svm_preds = (svm_preds1 + svm_preds2 + svm_preds3 + svm_preds4 + svm_preds5)/5

# Write the ensemble into the submission template and save it.
sample['target'] = svm_preds
sample.to_csv('submission.csv', index=False)