If you find this notebook helpful, please <b>UPVOTE</b>

In [None]:
import pandas as pd
import re
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import MinMaxScaler
from gensim.models.keyedvectors import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

In [None]:
# Read the training csv into `df`; the path is relative to the notebook's working directory.
train_csv_path = '../input/commonlitreadabilityprize/train.csv'
df = pd.read_csv(train_csv_path)

The missing values are in the url_legal and license columns, which we will drop anyway, so we can fill them with anything.

In [None]:
# Replace every NaN in the frame with a placeholder string.
df = df.fillna(value='Missing')

Now we remove every character that is not a letter, drop the stopwords, and then lemmatize the remaining words.

In [None]:
%%time
# Fetch the NLTK resources used below (tokenizer models, stopword list, WordNet).
# 'punkt_tab' is the tokenizer data that `nltk.word_tokenize` loads on recent
# NLTK releases, while older releases load 'punkt'; requesting both keeps the
# cell working on either.
for resource in ('wordnet', 'punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Built once and reused by every call: the stopword set used to be rebuilt for
# each token and the lemmatizer for each excerpt, which dominated the runtime.
_STOPWORDS = frozenset(stopwords.words("english"))
_LEMMATIZER = nltk.WordNetLemmatizer()
_NON_LETTERS = re.compile("[^a-zA-Z]")


def cleaner(excerpt):
    """Normalize one excerpt to a space-joined string of lemmatized tokens.

    Steps, in order: replace every character outside a-z/A-Z with a space,
    lowercase, tokenize with NLTK, drop English stopwords, lemmatize what is
    left with the WordNet lemmatizer.

    Parameters
    ----------
    excerpt : str
        Raw text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ("" if none survive).
    """
    tokens = nltk.word_tokenize(_NON_LETTERS.sub(" ", excerpt).lower())
    return " ".join(
        _LEMMATIZER.lemmatize(token) for token in tokens if token not in _STOPWORDS
    )

# Overwrite the raw text with its cleaned form, one excerpt at a time.
df['excerpt'] = df['excerpt'].map(cleaner)

Now I am going to create one extra feature by using another freely available dataset. The dataset gives us the usage frequency of English words. 

So if we sum up the usage frequency of all the words, we end up with a score for the paragraph. This score will be higher if the words in the paragraph are common in daily life and lower if they are rarely used. 

When words are not used often, they surely can be hard for people to understand. 

There can be a better way to use this word frequency but I am going forward with this basic approach for now.

In [None]:
%%time
wdf = pd.read_csv('../input/english-word-frequency/unigram_freq.csv')

# Build the stopword set once; rebuilding it inside a per-row lambda re-read
# the whole stopword list for every word in the table.
english_stopwords = set(stopwords.words("english"))
wdf['ncol'] = ~wdf['word'].isin(english_stopwords)  # True for non-stopwords (missing words count as non-stopwords)
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `wdf` (avoids SettingWithCopyWarning).
nwdf = wdf[wdf['ncol']].copy()

lem1 = nltk.WordNetLemmatizer()
nwdf['lword'] = nwdf['word'].apply(lambda x: lem1.lemmatize(str(x)))
nwdf = nwdf.sort_values('count')

# Despite the name, no scaling is applied here: this is a plain copy of `count`.
nwdf['scaled_count'] = nwdf['count']
# NOTE(review): the lookup is keyed on the raw `word`, so `lword` (and the sort
# above) currently have no effect on it -- confirm whether `lword` was intended.
word_freq = dict(zip(nwdf['word'], nwdf['scaled_count']))

def get_score(excerpt):
    """Add up the `word_freq` counts of the tokens in `excerpt`.

    The text is split on single spaces; tokens that are not keys of
    `word_freq` contribute nothing, so a text with no known token scores 0.
    """
    return sum(word_freq[token] for token in excerpt.split(' ') if token in word_freq)

We keep this score as excerpt_score. We also compute ex_len, the length of each cleaned paragraph, to take into account how long the paragraphs are; it is only used to choose where to truncate the excerpts and is dropped afterwards.

In [None]:
# Character length of each cleaned excerpt.
df['ex_len'] = df['excerpt'].map(len)

In [None]:
# Show the shortest and the median cleaned-excerpt length (in characters).
excerpt_lengths = df['ex_len']
print(excerpt_lengths.min(), excerpt_lengths.median())
# Keep at most the first 586 characters of every excerpt.
# NOTE(review): 586 is presumably the minimum length printed above -- confirm.
df['excerpt'] = df['excerpt'].map(lambda text: text[:586])

In [None]:
# Word-frequency score of each (truncated) excerpt.
df['excerpt_score'] = df['excerpt'].map(get_score)

In [None]:
# The length column is not used as a feature; remove it.
df = df.drop(columns=['ex_len'])

Scaling should help us get better results.

In [None]:
mms2 = MinMaxScaler()

# The scaler wants a 2-D (n_samples, 1) array, so reshape the score column first.
raw_scores = df['excerpt_score'].to_numpy().reshape(-1, 1)
df.excerpt_score = mms2.fit_transform(raw_scores)

We will be dropping some columns.

In [None]:
# Pull out the regression target, then remove it and the unused columns.
y = df['target']
df = df.drop(columns=['id', 'url_legal', 'license', 'target', 'standard_error'])

In [None]:
# Keep the scaled score to two decimal places.
df['excerpt_score'] = df['excerpt_score'].round(2)

In [None]:
# Display the frame as the cell output.
df

We will use pre-trained Word2Vec.

In [None]:
# Running on Colab? Uncomment the three shell lines below to download and
# unpack the Word2Vec file first.

# !sudo apt install wget
# !wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
# !gzip -d GoogleNews-vectors-negative300.bin.gz

path = '../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'
EMBEDDING_DIM = 300

# Load the vectors from the binary word2vec-format file.
word2vec_model = KeyedVectors.load_word2vec_format(path, binary=True)

print(word2vec_model.vectors.shape)

In [None]:
def avg_feature_vector(sentence, model, num_features):
    """Average the embeddings of the words in `sentence` that `model` knows.

    Parameters
    ----------
    sentence : str
        Text; it is split on whitespace.
    model : mapping-like
        Indexed as ``model[word]``; must raise ``KeyError`` for unknown words.
    num_features : int
        Length of the embedding vectors.

    Returns
    -------
    numpy.ndarray
        Vector of length `num_features`: the mean over the known words only.
        If no word is known (including an empty sentence) the all-zero vector
        is returned instead of dividing by zero, which would produce NaNs.
    """
    feature_vec = np.zeros((num_features,), dtype="float32")
    n_found = 0
    for word in sentence.split():
        try:
            feature_vec = np.add(feature_vec, model[word])
        except KeyError:
            # Out-of-vocabulary word: contributes to neither the sum nor the count.
            continue
        n_found += 1
    if n_found > 0:
        feature_vec = np.divide(feature_vec, n_found)
    return feature_vec

# One 300-dimensional averaged embedding per training excerpt.
n_train_rows = len(df.index)
word2vec_train = np.zeros((n_train_rows, 300), dtype="float32")

for row in range(n_train_rows):
    word2vec_train[row] = avg_feature_vector(df["excerpt"][row], word2vec_model, 300)

print(word2vec_train.shape)
print(y.shape)


So we will use the word vectors to encode all paragraphs and will also keep the excerpt_score column we created.

In [None]:
names_df = pd.DataFrame(data=word2vec_train)
# pd.DataFrame labels these columns with integers; next to the string-named
# columns already in `df` that gives mixed-type column labels, which recent
# scikit-learn releases refuse to accept. Use string labels ("0", "1", ...).
names_df.columns = names_df.columns.astype(str)
df = pd.concat([df, names_df], axis=1)

In [None]:
# (rows, columns) after appending the embedding columns.
df.shape

Now it is time to let go of the text column.

In [None]:
# The raw text is now represented by its embedding; remove the text column.
df = df.drop(columns=['excerpt'])

In [None]:
# Display the frame as the cell output.
df

In [None]:
# `X` is another name for the same frame object (no copy is made).
X = df

In [None]:

# Hold out 33% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse

# Fit on the training split; fit() returns the estimator itself.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
# Mean squared error on the held-out split, shown as the cell output.
mse(y_test, y_pred)

In [None]:
# mse = make_scorer(mean_squared_error,greater_is_better=False)

# model = RandomForestRegressor()

# params = {
#               "max_features": [1, 3, 10],
#               "min_samples_split": [2, 3, 10],
#               "min_samples_leaf": [1, 3, 10],
#               "n_estimators" :[100, 300, 500, 1000, 1500]}


# model = GridSearchCV(model,param_grid = params, cv=3, scoring=mse, n_jobs= -1, verbose = 1)

# model.fit(X_train,y_train)

# model = model.best_estimator_

In [None]:
# y_pred = model.predict(X_test)

# from sklearn.metrics import mean_squared_error as mse
# mse(y_test, y_pred)

### Test Data - Creating Submission

We must do the same with the test data, that is, create the excerpt_score column and use Word2Vec on the text data.

In [None]:
tdf = pd.read_csv('../input/commonlitreadabilityprize/test.csv').fillna('Missing')

# tdf['ex_len'] = tdf.excerpt.apply(lambda x: len(x))
# Clean the text, cap it at 586 characters, then compute the frequency score.
tdf['excerpt'] = tdf['excerpt'].map(cleaner).map(lambda text: text[:586])
tdf['excerpt_score'] = tdf['excerpt'].map(get_score)

tdf = tdf.drop(columns=['id', 'url_legal', 'license'])

In [None]:
# Display the test frame as the cell output.
tdf

In [None]:
# One 300-dimensional averaged embedding per test excerpt.
n_test_rows = len(tdf.index)
word2vec_test = np.zeros((n_test_rows, 300), dtype="float32")

for row in range(n_test_rows):
    word2vec_test[row] = avg_feature_vector(tdf["excerpt"][row], word2vec_model, 300)

print(word2vec_test.shape)

In [None]:
# The text column has been embedded; remove it from the test frame.
tdf = tdf.drop(columns=['excerpt'])

In [None]:
# Scale with the scaler already fitted on the training scores (transform only, no refit).
test_scores = mms2.transform(tdf['excerpt_score'].to_numpy().reshape(-1, 1))
# The training excerpt_score column is rounded to two decimals after scaling;
# round here as well so the model sees the feature at the same precision.
tdf['excerpt_score'] = np.round(test_scores, 2).ravel()
# tdf.ex_len = mms3.transform(np.reshape(list(tdf.ex_len), (-1,1)))

In [None]:
names_df = pd.DataFrame(data=word2vec_test)
# pd.DataFrame labels these columns with integers; next to the string-named
# 'excerpt_score' column that gives mixed-type column labels, which recent
# scikit-learn releases refuse to accept. Use string labels ("0", "1", ...).
names_df.columns = names_df.columns.astype(str)
tdf = pd.concat([tdf, names_df], axis=1)

tdf.shape

In [None]:
# Predict targets for the test frame with whatever estimator `model` currently refers to.
ypred = model.predict(tdf)

In [None]:
# Display the test predictions as the cell output.
ypred

In [None]:
# `id` was dropped from `tdf`, so read it again from the test csv.
test_ids = pd.read_csv('../input/commonlitreadabilityprize/test.csv')['id']
submission = pd.DataFrame({'id': test_ids, 'target': list(ypred)})

submission.to_csv('submission.csv', index=False)
