<h1 id="heading">

<a class="anchor-link" href="https://www.kaggle.com/deb009/commonlit-readability-prize-eda/notebook#heading">¶</a>
</h1>

In [None]:
# Client for reading secrets attached to this Kaggle notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# NOTE(review): "your-secret-label" looks like an unedited template placeholder and
# `secret_value_0` is not read by any other visible cell — confirm this cell is needed.
secret_value_0 = user_secrets.get_secret("your-secret-label")

In [None]:
!pip install textfeatures

In [None]:
!pip install textstat

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import datasets
from sklearn import model_selection
from pandas_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import missingno as msno
import re
import string
from pprint import pprint
import textstat
import textfeatures
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_absolute_error
from math import sqrt

In [None]:
# Directory holding the competition files; change this one constant to run elsewhere.
INPUT_DIR = '../input/commonlitreadabilityprize'

# Raw training data, raw test data and the sample submission file.
train = pd.read_csv(f'{INPUT_DIR}/train.csv')
test = pd.read_csv(f'{INPUT_DIR}/test.csv')
sample_submision = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv')


# Data Exploration

In [None]:
train.head()

In [None]:
test.head()

In [None]:
sample_submision.head()

In [None]:
train.dtypes

In [None]:
#train = train.drop(["url_legal","license"], axis=1)
#test = test.drop(["url_legal","license"], axis=1)

# Missing values

<font size="3">I am using the [missingno](https://pypi.org/project/missingno/) library to visualize missing values.</font>


In [None]:
# Visualize missing values per column of the training frame with missingno.
missing_bar_ax = msno.bar(
    train,
    figsize=(10, 5),
    fontsize=12,
    sort="ascending",
)
plt.show()

# Visualization

In [None]:
# Histogram of the target column, followed by its summary statistics.
fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(train['target'], ax=ax)
ax.set_title('Target distribution')
plt.show()
print(train['target'].describe())

In [None]:
# Horizontal box plot of the target, with the x-range fixed to [-5, 2].
fig, ax = plt.subplots(figsize=(12, 5))
ax.set_xlim(-5, 2)
ax.set_ylabel('target')
sns.boxplot(x=train['target'], ax=ax)
plt.show()


<font size="3">We can infer from the above graphs that :</font>
1. Most of the values are less than 0.
2. There are no outliers in the column.

In [None]:
# Histogram of the standard_error column, followed by its summary statistics.
fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(train['standard_error'], ax=ax)
# The title previously read 'Target distribution' (copied from the target plot).
ax.set_title('Standard error distribution')
plt.show()
print(train['standard_error'].describe())

In [None]:
# Horizontal box plot of standard_error, with the x-range fixed to [-0.05, 0.7].
fig, ax = plt.subplots(figsize=(12, 5))
ax.set_xlim(-0.05, 0.7)
ax.set_ylabel('standard_error')
sns.boxplot(x=train['standard_error'], ax=ax)
plt.show()

<font size="3">It looks like the standard_error column has outliers near 0 and above 0.5.</font>


In [None]:
# Hex-binned joint distribution of target (x) against standard_error (y).
hex_grid = sns.jointplot(
    data=train,
    x='target',
    y='standard_error',
    kind='hex',
    height=8,
)
plt.suptitle("Target vs Standard error")
# Leave room above the marginal plot so the figure title does not overlap it.
plt.subplots_adjust(top=0.94)
plt.show()

<font size="3">We will try to check the values for excerpt when the target is lowest and highest.</font>

Excerpt Values for Lowest Target Values

In [None]:
# Order the training rows by target (ascending) and keep the five lowest.
sort_by_target = train.sort_values(['target'])
sort_by_target_lowest = sort_by_target[['excerpt', 'target']].head(5)

# Show the first 400 characters of each excerpt, then its target value.
for excerpt_text, target_value in zip(sort_by_target_lowest['excerpt'],
                                      sort_by_target_lowest['target']):
    print(excerpt_text[:400])
    print(target_value)

Excerpt Values for Highest Target Values

In [None]:
# The frame is sorted ascending, so the last five rows carry the highest targets.
sort_by_target_highest = sort_by_target[['excerpt', 'target']].tail(5)

# Show the first 400 characters of each excerpt, then its target value.
for excerpt_text, target_value in zip(sort_by_target_highest['excerpt'],
                                      sort_by_target_highest['target']):
    print(excerpt_text[:400])
    print(target_value)

<font size="3">So, from the above two cells we can see that the target is inversely related to the complexity of the sentences: the more complex the text, the lower the target.</font>

# Wordcloud

<font size="3">Word cloud built from the 5 excerpts with the lowest target values</font>

In [None]:
# Concatenate the five lowest-target excerpts into a single document.
text = " ".join(sort_by_target_lowest.excerpt)

# Stop words shipped with the wordcloud package.
stopwords = set(STOPWORDS)

# Build the word cloud on a white background.
wordcloud = WordCloud(background_color="white", stopwords=stopwords).generate(text)

# Render it with matplotlib, hiding the axes.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()

<font size="3">Word cloud built from the 5 excerpts with the highest target values</font>

In [None]:
# Concatenate the five highest-target excerpts into a single document.
text = " ".join(sort_by_target_highest.excerpt)

# Stop words shipped with the wordcloud package.
stopwords = set(STOPWORDS)

# Build the word cloud on a white background.
wordcloud = WordCloud(background_color="white", stopwords=stopwords).generate(text)

# Render it with matplotlib, hiding the axes.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()

# Pre-Processing and Feature Engineering

In [None]:
# Work on copies so the frames loaded from disk stay untouched by feature engineering.
df_train, df_test = train.copy(), test.copy()

We will add a new feature based on the [Dale–Chall readability formula](https://en.wikipedia.org/wiki/Dale%E2%80%93Chall_readability_formula).
The Dale–Chall readability formula is a readability test that provides a numeric gauge of the comprehension difficulty that readers come upon when reading a text. It uses a list of 3000 words that groups of fourth-grade American students could reliably understand, considering any word not on that list to be difficult.

You will be able to find more about the formula in the hyperlink.

We will add some other features also.

In [None]:
def feature_engineering(df):
    """Add text-statistics and readability features derived from ``df['excerpt']``.

    The new columns are written onto ``df`` itself (in place) and the same
    frame is returned, so both ``feature_engineering(df)`` and
    ``df = feature_engineering(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``excerpt`` column holding the raw text.

    Returns
    -------
    pandas.DataFrame
        ``df`` with these columns added, in this order: character_count,
        digit_count, word_count, unique_word_count, mean_word_length,
        syllable_count, sentence_count, flesch_reading_ease,
        flesch_kincaid_grade, automated_readability_index, coleman_liau_index,
        linsear_write_formula, difficult_words, avg_sentence_length,
        reading_time, dc_readability_score.
    """
    # Coerce every excerpt to str once, so the hand-written counters and the
    # textstat metrics all see the same input (previously only some did).
    text = df['excerpt'].map(str)
    # Whitespace tokenisation, shared by the token-based counters below.
    tokens = text.map(str.split)

    df['character_count'] = text.map(len)
    # Number of whitespace-separated tokens made up solely of digits.
    df['digit_count'] = tokens.map(lambda words: sum(word.isdigit() for word in words))
    df['word_count'] = text.map(textstat.lexicon_count)
    df['unique_word_count'] = tokens.map(lambda words: len(set(words)))
    # An excerpt without tokens gets 0.0 instead of the NaN (and RuntimeWarning)
    # that np.mean([]) would produce.
    df['mean_word_length'] = tokens.map(
        lambda words: float(np.mean([len(word) for word in words])) if words else 0.0
    )

    # Remaining features are one textstat call each: column name -> function.
    textstat_metrics = {
        'syllable_count': textstat.syllable_count,
        'sentence_count': textstat.sentence_count,
        'flesch_reading_ease': textstat.flesch_reading_ease,
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade,
        'automated_readability_index': textstat.automated_readability_index,
        'coleman_liau_index': textstat.coleman_liau_index,
        'linsear_write_formula': textstat.linsear_write_formula,
        'difficult_words': textstat.difficult_words,
        'avg_sentence_length': textstat.avg_sentence_length,
        'reading_time': textstat.reading_time,
        'dc_readability_score': textstat.dale_chall_readability_score,
    }
    for column_name, metric in textstat_metrics.items():
        df[column_name] = text.map(metric)

    return df
    
    

In [None]:
# Compute the engineered features for the train frame first, then the test frame.
df_train, df_test = (feature_engineering(frame) for frame in (df_train, df_test))

In [None]:
df_train.head()

In [None]:
# Names of the engineered feature columns added by feature_engineering().
columns = [
    'character_count',
    'digit_count',
    'word_count',
    'unique_word_count',
    'mean_word_length',
    'syllable_count',
    'sentence_count',
    'flesch_reading_ease',
    'flesch_kincaid_grade',
    'automated_readability_index',
    'coleman_liau_index',
    'linsear_write_formula',
    'difficult_words',
    'avg_sentence_length',
    'reading_time',
    'dc_readability_score',
]

# Engineered-feature view of the training frame.
df_temp = df_train[columns]

We will try to figure out the collinearity of the columns which we have created now.



In [None]:
# Pairwise correlations of the engineered features and the target, computed once
# and reused for both the mask and the plot.
corr = df_train[columns + ['target']].corr()

# Boolean mask hiding the upper triangle (diagonal included). It is built from the
# matrix shape rather than from the correlation values, so a coefficient that is
# exactly 0 in the upper triangle is still hidden.
matrix = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(15, 15), dpi=100)
sns.heatmap(corr, annot=True, mask=matrix, ax=ax)
ax.set_title('New Features and Target Correlations', size=20, pad=20)
# End with show() so the cell displays the figure and not the Text object repr.
plt.show()

In [None]:
def plot_feature(feature):
    """Draw a two-panel figure for one column of the global ``df_train``.

    Left panel: regression plot with ``target`` on the x-axis and ``feature``
    on the y-axis, with a red fit line.
    Right panel: filled kernel density estimate of ``feature``.

    Parameters
    ----------
    feature : str
        Name of the ``df_train`` column to plot.
    """
    fig, (scatter_ax, density_ax) = plt.subplots(ncols=2, figsize=(32, 6))

    sns.regplot(x=df_train['target'], y=df_train[feature], line_kws={'color': 'red'}, ax=scatter_ax)
    sns.kdeplot(df_train[feature], fill=True, ax=density_ax)

    # Left panel: labelled axes and title.
    scatter_ax.set_xlabel('target', size=18)
    scatter_ax.set_ylabel(feature, size=18)
    scatter_ax.set_title(f'target vs {feature}', size=20, pad=20)

    # Right panel: the title names the feature, so the axis labels are blanked.
    density_ax.set_xlabel('')
    density_ax.set_ylabel('')
    density_ax.legend(prop={'size': 15})
    density_ax.set_title(f'{feature} Distribution', size=20, pad=20)

    # Same tick label size on both axes of both panels.
    for panel in (scatter_ax, density_ax):
        panel.tick_params(axis='x', labelsize=15)
        panel.tick_params(axis='y', labelsize=15)

    plt.show()


# One figure per engineered feature.
for feature in columns:
    plot_feature(feature)

Only flesch_reading_grade and dc_readability_score have a correlation of more than 50 percent with the target. We will try to plot each column against the target and look at their distributions.

# Pre-Processing

The `clean()` function of textfeatures can be used to clean the documents.

In [None]:
# Clean the "excerpt" column of each frame. The return values are not assigned;
# later cells read df_train['clean_excerpt'] / df_test['clean_excerpt'], so these
# calls are relied on to add that column to the frame they are given.
# NOTE(review): the exact cleaning steps are defined by textfeatures — confirm in its docs.
textfeatures.clean(df_train,"excerpt","clean_excerpt")
textfeatures.clean(df_test,"excerpt","clean_excerpt")

In [None]:
#df_train['clean_excerpt']=df_train['clean_excerpt'].apply(lambda x: clean(x))
#df_test['clean_excerpt']=df_test['clean_excerpt'].apply(lambda x: clean(x))

Normalization

In [None]:
# Make sure every cleaned excerpt is a str before the token-level steps that follow.
for frame in (df_train, df_test):
    frame['clean_excerpt'] = frame['clean_excerpt'].astype(str)

In [None]:
# Stemming
from nltk.stem.porter import PorterStemmer

def get_stemmed_text(corpus):
    """Stem every whitespace-separated token of every document in ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Documents to process.

    Returns
    -------
    list of str
        One string per document, with its stemmed tokens joined by single spaces.
    """
    porter = PorterStemmer()
    stemmed_documents = []
    for document in corpus:
        stemmed_tokens = map(porter.stem, document.split())
        stemmed_documents.append(' '.join(stemmed_tokens))
    return stemmed_documents

# Replace each cleaned excerpt with its stemmed form, train frame first.
for frame in (df_train, df_test):
    frame['clean_excerpt'] = get_stemmed_text(frame['clean_excerpt'])

In [None]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
def get_lemmatized_text(corpus):
    """Lemmatize every whitespace-separated token of every document in ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Documents to process.

    Returns
    -------
    list of str
        One string per document, with its lemmatized tokens joined by single spaces.
    """
    wordnet = WordNetLemmatizer()
    lemmatized_documents = []
    for document in corpus:
        lemmas = map(wordnet.lemmatize, document.split())
        lemmatized_documents.append(' '.join(lemmas))
    return lemmatized_documents

# Replace each cleaned excerpt with its lemmatized form, train frame first.
for frame in (df_train, df_test):
    frame['clean_excerpt'] = get_lemmatized_text(frame['clean_excerpt'])

Tokenization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF bag-of-words representation of the cleaned excerpts.
vectorizer = TfidfVectorizer(use_idf=True)

# Learn the vocabulary and IDF weights from the training text only, then encode
# both sets with that same fitted vectorizer.
vectorizer.fit(df_train['clean_excerpt'])
train_v = vectorizer.transform(df_train['clean_excerpt'])
test_v = vectorizer.transform(df_test['clean_excerpt'])

In [None]:

# Feature scaling is performed once, in the next code cell.
# This cell held an identical copy of that loop, so every engineered feature was
# standardized twice; the redundant pass has been removed.

Standard Scaler

In [None]:
# Standardize each engineered feature using statistics fitted on the training
# column, then apply the same transform to the matching test column.
scaler = StandardScaler()

for col in columns:
    train_column = df_train[[col]].to_numpy()
    test_column = df_test[[col]].to_numpy()
    df_train[col] = scaler.fit_transform(train_column)
    df_test[col] = scaler.transform(test_column)

In [None]:
# Engineered (scaled) feature columns for the train and test sets.
train_X1, test_X1 = df_train[columns], df_test[columns]

In [None]:
# Regression target, taken from the raw training frame.
y1 = train['target']

# Final design matrices: TF-IDF columns followed by the engineered feature columns.
train_X_Title = hstack((train_v, csr_matrix(train_X1.to_numpy())))
test_X_Title = hstack((test_v, csr_matrix(test_X1.to_numpy())))

In [None]:
# LinearSVR model

# Hold out 20% of the training rows for validation.
X_train, X_test, y_train, y_test = train_test_split(train_X_Title, y1, test_size=0.20, random_state=42)

# Seed the solver so the fitted model, and the scores below, are reproducible.
clf1 = LinearSVR(C=0.2, random_state=42)
clf1.fit(X_train, y_train)

y_pred1 = clf1.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order.
mae1 = mean_absolute_error(y_test, y_pred1)
# Root mean squared error. The previous code printed sqrt(MAE) under the label
# 'MAE', which is neither the MAE nor the RMSE.
rmse1 = sqrt(np.mean((np.asarray(y_test) - y_pred1) ** 2))

print('MAE:', mae1)
print('RMSE:', rmse1)

In [None]:
# Predict the target for the test rows.
pred = clf1.predict(test_X_Title)

# Build the output in a new frame rather than dropping columns from `test` in
# place: the in-place drops made this cell raise KeyError when run a second time.
submission = (
    test
    .drop(columns=['url_legal', 'license', 'excerpt'])
    .assign(target=pred)
    .reset_index(drop=True)
)

# index=False keeps the pandas row index from being written as an extra, unnamed
# first column of the CSV.
# NOTE(review): confirm whether the competition expects this file to be named
# 'submission.csv' rather than 'output.csv'.
submission.to_csv('output.csv', index=False)

# Preview what was written (last expression, so the notebook displays it).
submission.head()