In [None]:
# Some imports :)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
from tqdm.notebook import tqdm

import nltk
from nltk.corpus import stopwords

import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.express as px
from plotly.subplots import make_subplots
from plotly.offline import iplot
from wordcloud import WordCloud
from plotly.offline import iplot

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge, LinearRegression

from sklearn.metrics import mean_squared_error

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion

# Raise pandas' display limits so wide/long frames are not truncated in cell output.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Plot styling. The style sheet is applied first and the palette second;
# keep that order so the palette is not overwritten by the style sheet.
plt.style.use('classic')
winter_palette = sns.color_palette('winter_r')
sns.set_palette(winter_palette)

In [None]:
# Read the three competition CSVs (train, test, sample submission) in one pass.
training_file, test_file, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/commonlitreadabilityprize/train.csv",
        "../input/commonlitreadabilityprize/test.csv",
        "../input/commonlitreadabilityprize/sample_submission.csv",
    )
)

In [None]:
# Peek at the first five training rows.
training_file.head(n=5)

In [None]:
# Shuffle the rows, then hold out everything after the first `n_train` rows
# as the validation set.
data = training_file[['excerpt', 'target']]
# Seeded so that re-running the notebook reproduces the same split.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Read from the shuffled `data` frame. Reading from `training_file` here would
# silently discard the shuffle and split the file in its stored order.
excerpt, targets = data['excerpt'].values, data['target'].values

n_train = 2750  # rows used for fitting; the remainder is used for validation
t_X, v_X = excerpt[:n_train], excerpt[n_train:]
t_Y, v_Y = targets[:n_train], targets[n_train:]

print(t_X.shape, v_X.shape)
print(t_Y.shape, v_Y.shape)

In [None]:
# Baseline: binary TF-IDF unigram features fed into a Ridge regressor.
# `normalize` is deliberately not passed: it was deprecated in scikit-learn 1.0
# and removed in 1.2, where passing it raises a TypeError. The value previously
# passed (False) was the default, so the model is unchanged on older releases.
ridge = Ridge(fit_intercept=True)
pipeline_ridge = make_pipeline(
    TfidfVectorizer(binary=True, ngram_range=(1, 1)),
    ridge,
)

# Fit on the training split
pipeline_ridge.fit(t_X, t_Y)

# Score on the held-out validation split
preds = pipeline_ridge.predict(v_X)
mse_loss = mean_squared_error(v_Y, preds)

print(f"MSE Loss using Ridge and TfIdfVectorizer: {mse_loss}")

In [None]:
# Baseline: binary TF-IDF unigram features fed into ordinary least squares.
# `normalize` is deliberately not passed: it was deprecated in scikit-learn 1.0
# and removed in 1.2, where passing it raises a TypeError. The value previously
# passed (False) was the default, so the model is unchanged on older releases.
linear = LinearRegression(fit_intercept=True)
pipeline_linear = make_pipeline(
    TfidfVectorizer(binary=True, ngram_range=(1, 1)),
    linear,
)

# Fit on the training split
pipeline_linear.fit(t_X, t_Y)

# Score on the held-out validation split
preds = pipeline_linear.predict(v_X)
mse_loss = mean_squared_error(v_Y, preds)

print(f"MSE Loss using Linear Regression and TfIdfVectorizer: {mse_loss}")

## Adding Data Augmentation

In [None]:
# Partition the training rows by the sign of the target (zero counts as positive)
# and display the shape of each part.
target_column = training_file["target"]
negative = training_file.loc[target_column.lt(0)]
positive = training_file.loc[target_column.ge(0)]
negative.shape, positive.shape

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split of the raw training frame: every column except `target` is a
# feature, `target` is the label; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    training_file.drop(columns='target'),
    training_file['target'],
    test_size=0.3,
    random_state=37,
)


In [None]:
# Install nlpaug from the local copy under ../input (a directory path, not a package name).
# The trailing "#> /dev/null" redirect is commented out, so pip's output is shown in the cell.
!pip install ../input/nlpaug0011/nlpaug-master #> /dev/null

## Data augmentation using nlpaug

In [None]:
import nlpaug
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf

# Sample sentence for a quick look at what the augmenter produces.
text = 'The quick brown fox jumps over the lazy dog .'

# Word-level augmenter that substitutes words using the local GloVe 300d vectors.
aug_w2v = naw.WordEmbsAug(
    model_type='glove',
    model_path='../input/glove6b300dtxt/glove.6B.300d.txt',
    action="substitute",
)
print("Original:", text, sep="\n")

# presumably the fraction of words to replace per call — confirm against the nlpaug docs
aug_w2v.aug_p = 0.1

print("Augmented Text:")
for attempt in range(5):
    # Each call can yield a different rewrite of the same sentence.
    augmented_text = aug_w2v.augment(text)
    print(augmented_text)

## Augmenting 1500 positive samples

In [None]:
from sklearn.utils import shuffle

def augment_text(df, samples=1500, pr=0.2, target=None):
    """Append augmented copies of the non-negative-target excerpts to ``df``.

    Rows with ``target >= 0`` are sampled with replacement, each sampled
    excerpt is rewritten by the module-level ``aug_w2v`` augmenter, and the
    rewritten rows are appended to ``df`` before the result is shuffled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least ``excerpt`` and ``target`` columns.
    samples : int
        Number of augmented rows to generate.
    pr : float
        Value assigned to ``aug_w2v.aug_p``. The assignment is made on the
        shared augmenter and therefore stays in effect after the call.
    target : float or None
        Label given to every augmented row. ``None`` (the default) copies the
        label of the excerpt each augmented row was derived from. Pass
        ``target=0.3`` to reproduce the former constant labelling.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the augmented rows, re-indexed and then shuffled.
        Columns other than ``excerpt`` and ``target`` are NaN for the
        augmented rows.

    Raises
    ------
    ValueError
        If ``df`` contains no row with ``target >= 0`` to sample from.
    """
    aug_w2v.aug_p = pr

    # Only rows with a non-negative target are candidates for augmentation.
    df_n = df[df.target >= 0].reset_index(drop=True)
    if df_n.empty:
        raise ValueError("augment_text: no rows with target >= 0 to augment")

    new_text = []
    new_target = []
    for i in tqdm(np.random.randint(0, len(df_n), samples)):
        row = df_n.iloc[i]
        text = row['excerpt']
        augmented_text = aug_w2v.augment(text)
        # Newer nlpaug releases return a list of strings rather than a single
        # string; unwrap it so the `excerpt` column always holds plain text.
        if isinstance(augmented_text, (list, tuple)):
            augmented_text = augmented_text[0] if augmented_text else text
        new_text.append(augmented_text)
        # An augmented excerpt keeps the label of its source excerpt; a single
        # constant label for every synthetic row would corrupt the regression
        # target.
        new_target.append(row['target'] if target is None else target)

    new = pd.DataFrame({'excerpt': new_text, 'target': new_target})
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    df = shuffle(pd.concat([df, new]).reset_index(drop=True))
    return df
   
# Build the augmented training frame with 1500 extra rows, then show its shape.
train_aug_15 = augment_text(training_file, samples=1500, pr=0.2)
train_aug_15.shape



In [None]:
# (rows, columns) of the augmented training frame.
train_aug_15.shape

In [None]:
# Shuffle the augmented rows, then hold out everything after the first
# `n_train` rows as the validation set.
data = train_aug_15[['excerpt', 'target']]
# Seeded so that re-running the notebook reproduces the same split.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Read from the shuffled `data` frame; reading from `train_aug_15` would leave
# the shuffle above as dead code.
excerpt, targets = data['excerpt'].values, data['target'].values

# NOTE(review): augmented rows are mixed into both parts of this split, so a
# rewritten copy of a validation excerpt may end up in the training part.
# Consider splitting before augmenting — confirm whether this is intended.
n_train = 2750  # rows used for fitting; the remainder is used for validation
t_X, v_X = excerpt[:n_train], excerpt[n_train:]
t_Y, v_Y = targets[:n_train], targets[n_train:]

print(t_X.shape, v_X.shape)
print(t_Y.shape, v_Y.shape)

In [None]:
# Make an Sklearn pipeline for an XGBoost regressor on TF-IDF features.
import xgboost as xgb
xgboost = xgb.XGBRegressor()

# The pipeline keeps the name `pipeline_ridge` because the submission cell
# reads it under that name, but the estimator inside is XGBoost, not Ridge.
pipeline_ridge = make_pipeline(
    TfidfVectorizer(binary=True, ngram_range=(1, 1)),
    xgboost,
)

# Fit on the training split
pipeline_ridge.fit(t_X, t_Y)

# Score on the held-out validation split
preds = pipeline_ridge.predict(v_X)
mse_loss = mean_squared_error(v_Y, preds)

# Report the metric under the model that was actually fitted.
print(f"MSE Loss using XGBoost and TfIdfVectorizer: {mse_loss}")

In [None]:
# Binary TF-IDF unigram features fed into ordinary least squares, refit on the
# current t_X / t_Y split.
# `normalize` is deliberately not passed: it was deprecated in scikit-learn 1.0
# and removed in 1.2, where passing it raises a TypeError. The value previously
# passed (False) was the default, so the model is unchanged on older releases.
linear = LinearRegression(fit_intercept=True)
pipeline_linear = make_pipeline(
    TfidfVectorizer(binary=True, ngram_range=(1, 1)),
    linear,
)

# Fit on the training split
pipeline_linear.fit(t_X, t_Y)

# Score on the held-out validation split
preds = pipeline_linear.predict(v_X)
mse_loss = mean_squared_error(v_Y, preds)

print(f"MSE Loss using Linear Regression and TfIdfVectorizer: {mse_loss}")

In [None]:
# Pull the ids and raw excerpts out of the test file
test = test_file[['id', 'excerpt']]
test_text = test['excerpt'].values
test_ids = test['id'].tolist()

# Score the test excerpts with both fitted pipelines
test_preds_linear = pipeline_linear.predict(test_text)
test_preds_ridge = pipeline_ridge.predict(test_text)

# Average the two sets of predictions and write them out without the index
submission = pd.DataFrame({
    'id': test_ids,
    'target': (test_preds_ridge + test_preds_linear) / 2,
})
submission.to_csv("submission.csv", index=False)
print("file Submitted")