In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Visualization
from tqdm import tqdm
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition training split, then preview four randomly chosen rows.
train_csv_path = '../input/commonlitreadabilityprize/train.csv'
df_train = pd.read_csv(train_csv_path)
df_train.sample(n=4)

# **EDA**

In [None]:
# Drop the metadata columns that are not used anywhere in this analysis.
# Re-assigning (rather than inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it once the columns are already gone no longer
# raises a KeyError.
df_train = df_train.drop(
    columns=['url_legal', 'license', 'standard_error'],
    errors='ignore',
)
df_train.sample(n=4)

In [None]:
# Print a concise summary (columns, dtypes, non-null counts) of the training frame.
df_train.info()

In [None]:
# Descriptive statistics of the training frame after the column drop above.
df_train.describe()

## Binning the data into 4 bins to better visualize

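The target is split at its quartiles with `pd.qcut`, so each of the 4 bins holds roughly a quarter of the excerpts.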

In [None]:
# Labels for the four quartile bins of the target, ordered from lowest to highest values.
cut_labels = ['<25', '25<y<50', '50<y<75', '75<y<100']

# Quantile edges: cutting at the quartiles assigns each row to one of the four labels.
quartile_edges = [0, 0.25, 0.5, 0.75, 1]
df_train['bins'] = pd.qcut(df_train['target'], q=quartile_edges, labels=cut_labels)

# Preview the bin assigned to the first five rows.
df_train['bins'].head(5)

In [None]:
# Distinct bin labels together with the number of rows that fall in each one.
np.unique(df_train['bins'], return_counts=True)

## Simple Histogram

In [None]:
# Histogram of the target with 10 bins; the trailing ';' hides the returned arrays.
target_values = df_train['target']
plt.hist(target_values, bins=10);

## Getting Average word length and Average Sentence length

In [None]:
def return_avg_word_length(sentence):
    """Return the mean number of characters per whitespace-separated word.

    Parameters
    ----------
    sentence : str
        Text to analyse. Words are whatever ``str.split()`` yields, so
        punctuation attached to a word counts towards its length.

    Returns
    -------
    float
        Average word length, or ``0.0`` when the text contains no words
        (empty or whitespace-only input) instead of raising
        ``ZeroDivisionError``.
    """
    words = sentence.split()  # split once and reuse for both the sum and the count
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

In [None]:
# Quick sanity check: 'my' (2 letters) and 'name' (4 letters) should average to 3.0.
return_avg_word_length('my name')

In [None]:
# Per-bin statistics over the excerpts, stored in the same order as cut_labels:
#   avg_sentence_length -> average number of words per excerpt
#   avg_word_length     -> average word length (characters per word) per excerpt
avg_sentence_length, avg_word_length = [], []
for label in cut_labels:
    mask = df_train.bins == label
    num_sentences = np.sum(mask)  # number of excerpts in this bin
    num_words, avg_wl = 0, 0
    for sentence in tqdm(df_train.excerpt[mask]):
        # len(sentence) would count characters; split on whitespace to count words.
        num_words += len(sentence.split())
        avg_wl += return_avg_word_length(sentence)
    avg_sentence_length.append(num_words / num_sentences)
    avg_word_length.append(avg_wl / num_sentences)

In [None]:
# Print the per-bin statistics, one list per line (sentence length first, then word length).
for per_bin_values in (avg_sentence_length, avg_word_length):
    print(per_bin_values)

## Plotting The statistics

In [None]:
# Bar chart of avg_sentence_length per bin; each bar is annotated with its rounded value.
bar_ax = sns.barplot(x=cut_labels, y=avg_sentence_length)
for bar, value in zip(bar_ax.patches, avg_sentence_length):
    label_x = bar.get_x() + bar.get_width() / 2      # horizontal centre of the bar
    label_y = bar.get_y() + bar.get_height() + 10    # 10 units above the top of the bar
    bar_ax.text(label_x, label_y, round(value), ha='center')

In [None]:
# Bar chart of avg_word_length per bin; each bar is annotated with its value to 2 decimals.
bar_ax = sns.barplot(x=cut_labels, y=avg_word_length)
for bar, value in zip(bar_ax.patches, avg_word_length):
    label_x = bar.get_x() + bar.get_width() / 2       # horizontal centre of the bar
    label_y = bar.get_y() + bar.get_height() + 0.1    # 0.1 units above the top of the bar
    bar_ax.text(label_x, label_y, round(value, 2), ha='center')

# Constructing the model

## Trying Ridge Regression

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error

# 5-fold cross-validation stratified on the quartile bins of the target.
# shuffle=True with a fixed random_state keeps the folds reproducible.
skf = StratifiedKFold(n_splits=5, random_state=2, shuffle=True)

val_errors = []  # validation RMSE of every fold, aggregated after the loop

for i, (train_idx, eval_idx) in enumerate(skf.split(df_train, df_train.bins), 1):
    # splitting train and validation sets
    x_train, x_val = df_train.excerpt.iloc[train_idx], df_train.excerpt.iloc[eval_idx]
    y_train, y_val = df_train.target.iloc[train_idx], df_train.target.iloc[eval_idx]

    # feature extraction: the vocabulary is fitted on the training fold only and
    # merely applied to the validation fold, so nothing leaks from validation.
    vectorizer = TfidfVectorizer()
    train_features = vectorizer.fit_transform(x_train).toarray()
    val_features = vectorizer.transform(x_val).toarray()

    ridge_regressor = Ridge()
    ridge_regressor.fit(train_features, y_train)

    # RMSE; mean_squared_error's documented argument order is (y_true, y_pred).
    train_error = np.sqrt(mean_squared_error(y_train, ridge_regressor.predict(train_features)))
    val_error = np.sqrt(mean_squared_error(y_val, ridge_regressor.predict(val_features)))
    val_errors.append(val_error)

    print(f'Fold {i}\nTrain error : {train_error}, Validation error : {val_error}')

# Single summary number for comparing models across all folds.
print(f'Mean validation error : {np.mean(val_errors)} (std : {np.std(val_errors)})')


## Trying Lasso Regression

In [None]:
# from sklearn.model_selection import StratifiedKFold
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import Ridge, Lasso
# from sklearn.metrics import mean_squared_error

# skf = StratifiedKFold(n_splits=5, random_state=2, shuffle=True)

# for i, (train_idx, eval_idx) in enumerate(skf.split(df_train, df_train.bins),1):
#     # splitting train and validation sets
#     x_train, x_val = df_train.excerpt.iloc[train_idx], df_train.excerpt.iloc[eval_idx]
#     y_train, y_val = df_train.target.iloc[train_idx], df_train.target.iloc[eval_idx]
    
#     # feature extraction
#     vectorizer = TfidfVectorizer()
#     train_features = vectorizer.fit_transform(x_train).toarray()
#     val_features = vectorizer.transform(x_val).toarray()
    
#     lasso_regressor = Lasso()
    
#     lasso_regressor.fit(train_features,y_train)
#     train_error = np.sqrt(mean_squared_error(lasso_regressor.predict(train_features), y_train))
#     val_error = np.sqrt(mean_squared_error(lasso_regressor.predict(val_features), y_val))
    
#     print(f'Fold {i}\nTrain error : {train_error}, Validation error : {val_error}')


# Discussion of trials

Ridge regression worked better, as expected: the number of TF-IDF features is far larger than the number of training examples, so regularization is essential. For the same reason (too little data and a high risk of overfitting), RNNs would likely not work well here either.

- Lasso Regression (**L1 Regularization**) performs heavy feature selection by driving the coefficients of many features to exactly zero.
- Ridge Regression (**L2 Regularization**) provides the much-needed regularization for this many features without the heavy feature selection.


# After training

After using **Stratified KFold** to train and validate the model, we retrain it on the whole training set before predicting on the test set.

In [None]:
# Load the test split without the metadata columns, then preview three random rows.
df_test = (
    pd.read_csv('../input/commonlitreadabilityprize/test.csv')
    .drop(columns=['url_legal', 'license'])
)
df_test.sample(n=3)

In [None]:
# Descriptive statistics of the columns that remain in the test frame.
df_test.describe()

In [None]:
    # Final model: fit on the complete training set, then predict the test excerpts.
    x_train, x_test = df_train.excerpt, df_test.excerpt
    y_train = df_train.target

    # The TF-IDF vocabulary is learned from the training excerpts only and then
    # applied unchanged to the test excerpts.
    vectorizer = TfidfVectorizer()
    train_features = vectorizer.fit_transform(x_train).toarray()
    test_features = vectorizer.transform(x_test).toarray()

    ridge_regressor = Ridge()
    ridge_regressor.fit(train_features, y_train)

    # Training RMSE; mean_squared_error's documented argument order is (y_true, y_pred).
    train_error = np.sqrt(mean_squared_error(y_train, ridge_regressor.predict(train_features)))
    print(f'train error: {train_error}')

    # Predictions for the submission, one per row of df_test.
    test_pred = ridge_regressor.predict(test_features)

In [None]:
# Pair every test id with its predicted target and write the result without the index column.
submission_columns = {'id': df_test['id'], 'target': test_pred}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv('submission.csv', index=False)