In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the competition's train and test CSVs (train first), then preview the training rows.
train_raw, test_raw = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/commonlitreadabilityprize/train.csv',
        '../input/commonlitreadabilityprize/test.csv',
    )
)
train_raw.head()

Reading ease may be related to the following factors:
* Average Words per Sentence (AWS): the number of words divided by the number of sentences in an excerpt
* Average Syllables per Word (ASW): the number of syllables divided by the number of words in an excerpt
* Word difficulty: less frequently used words are assumed to be harder to read
* Syllable difficulty: a word with more than 2 syllables is regarded as a difficult word


In [None]:
# split the excerpt into sentences
import re

# Regex building blocks used by split_into_sentences. Raw strings keep "\s" a
# regex escape instead of an invalid Python string escape.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"


def split_into_sentences(text):
    """Split ``text`` into a list of stripped sentence strings.

    Periods that do not end a sentence (titles, initials, acronyms, company
    suffixes, website domains, "Ph.D.") are temporarily replaced by ``<prd>``.
    Every remaining ``.``, ``?`` or ``!`` is followed by a ``<stop>`` marker,
    and the text is split on those markers. The substitutions are
    order-dependent, so they are applied in a fixed sequence.

    Parameters
    ----------
    text : str
        The passage to split.

    Returns
    -------
    list[str]
        Sentences in order of appearance. Text after the last terminator is
        kept as a final sentence when it contains a letter or digit; a
        trailing fragment of only whitespace or punctuation is dropped.
    """
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # Protect periods that are part of abbreviations rather than sentence ends.
    text = re.sub(prefixes, "\\1<prd>", text)
    text = re.sub(websites, "<prd>\\1", text)
    if "Ph.D" in text: text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)

    # Move a terminator that sits inside a closing quote to just after it, so
    # the quote stays attached to its own sentence.
    if "”" in text: text = text.replace(".”", "”.")
    if "\"" in text: text = text.replace(".\"", "\".")
    if "!" in text: text = text.replace("!\"", "\"!")
    if "?" in text: text = text.replace("?\"", "\"?")

    # Mark real sentence ends, then restore the protected periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    sentences = [s.strip() for s in text.split("<stop>")]
    # The last piece is whatever follows the final terminator. Dropping it
    # unconditionally would lose the closing sentence of a text that does not
    # end in punctuation, so only discard it when it holds no real content.
    if not any(ch.isalnum() for ch in sentences[-1]):
        sentences = sentences[:-1]
    return sentences

In [None]:
# syllables in a word
def syllables_count(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Each vowel (``aeiouy``) that starts a run of vowels counts as one
    syllable. A trailing ``e`` is treated as silent (minus one), a trailing
    ``le`` adds one back, and any non-empty word counts at least one syllable.

    Parameters
    ----------
    word : str
        The word to analyse; case is ignored.

    Returns
    -------
    int
        Estimated syllable count; ``0`` for an empty string.
    """
    # referred from stackoverflow.com/questions/14541303/count-the-number-of-syllables-in-a-word
    if not word:
        # Nothing to count; also avoids indexing into an empty string.
        return 0
    vowels = 'aeiouy'
    word = word.lower()
    # Pair every character with its predecessor; "#" is a non-vowel sentinel
    # placed before the first character so a leading vowel starts a group.
    count = sum(
        1
        for previous, current in zip("#" + word, word)
        if current in vowels and previous not in vowels
    )
    if word.endswith('e'):
        count -= 1
    if word.endswith('le'):
        count += 1
    if count == 0:
        count += 1
    return count

In [None]:
# no. of sentences in an excerpt
def noOfSentences(text):
    """Return how many sentences ``split_into_sentences`` finds in ``text``."""
    sentences = split_into_sentences(text)
    return len(sentences)

In [None]:
# no. of words in an excerpt
def noOfWords(text):
    """Count the words in ``text``.

    A word is a maximal run of ASCII letters and/or underscores, so digits and
    punctuation (including apostrophes) act as separators.
    """
    return sum(1 for _ in re.finditer("[a-zA-Z_]+", text))

In [None]:
# Test rows have no label: give them a blank 'target' so both frames share the same columns.
test_raw = test_raw.assign(target='')

In [None]:
test_raw[['id','excerpt','target']]

In [None]:
train_raw[['id','excerpt','target']]

In [None]:
# union test and train data for later data transformation (test rows first).
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of the two source frames would repeat.
train_raw = pd.concat(
    [test_raw[['id','excerpt','target']], train_raw[['id','excerpt','target']]],
    ignore_index=True,
)

In [None]:
# word count of every excerpt
train_raw['No_Of_Words'] = train_raw['excerpt'].apply(noOfWords)

In [None]:
# sentence count of every excerpt
train_raw['No_Of_Sentences'] = train_raw['excerpt'].apply(noOfSentences)

In [None]:
# Total syllables per excerpt. syllables_count() is a per-word heuristic (its
# silent-e / "le" / minimum-of-one rules look at the end of a single word), so
# it is applied to each word and summed instead of being fed the whole excerpt.
# Words are the same "[a-zA-Z_]+" runs that noOfWords counts, which keeps the
# syllables-per-word ratio consistent.
train_raw['No_Of_Syllables'] = train_raw.excerpt.apply(
    lambda x: sum(syllables_count(word) for word in re.findall("[a-zA-Z_]+", x))
)

In [None]:
# calculate the syllables difficulty: the share of words with more than 2 syllables.
# Words are extracted with the same "[a-zA-Z_]+" pattern noOfWords uses, so that
# attached punctuation ("became.") cannot defeat the silent-e rule in syllables_count.
syllables_morethan_2 = [
    [syllables_count(x) > 2 for x in re.findall("[a-zA-Z_]+", excerpt)]
    for excerpt in train_raw.excerpt
]
# An excerpt without any word gets 0.0 instead of raising ZeroDivisionError.
train_raw['syllables_difficulty'] = [sum(x) / len(x) if x else 0.0 for x in syllables_morethan_2]

In [None]:
train_raw.head(10)

In [None]:
# Map every word of every excerpt to its integer id from a Keras Tokenizer
# fitted on the same excerpts (vocabulary capped at 50000 ids).
from keras.preprocessing.text import Tokenizer

excerpts = train_raw['excerpt']

tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(excerpts)

except_tokenized = tokenizer.texts_to_sequences(excerpts)

In [None]:
# calculate the word difficulty as the mean tokenizer id of the excerpt's words
# (presumably a higher id means a rarer word -- confirm against the Keras Tokenizer docs).
# An excerpt that produced no tokens gets 0.0 instead of raising ZeroDivisionError.
train_raw['word_difficulty'] = [sum(x) / len(x) if x else 0.0 for x in except_tokenized]

In [None]:
train_raw.tail()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vectorizer (default settings) on every excerpt of the combined
# frame and keep the resulting document-term matrix for later use.
tfidf = TfidfVectorizer()
tfidf_vec = tfidf.fit_transform(train_raw.excerpt)
# Earlier attempt at attaching the term columns directly, kept for reference:
# train_raw[tfidf.get_feature_names()] = tfidf_vec.toarray()

In [None]:
# # rename target to target in order to avoid conflicting with target word in excerpt
# train_raw.rename(columns={'target':'target'}, inplace=True)
# train_raw.head()

In [None]:
# np.shape(tfidf_vec.toarray())

In [None]:

# len(tfidf.get_feature_names())

In [None]:
# Replace the row labels with a fresh 0..n-1 range (old labels are discarded).
# The next step joins this frame column-wise with another frame, which aligns rows by label.
train_raw = train_raw.reset_index(drop=True)

In [None]:
# Expand the TF-IDF matrix into one dense column per vocabulary term and attach
# those columns to the feature frame.
# get_feature_names() has been removed from recent scikit-learn releases in
# favour of get_feature_names_out(); use the new method when it exists and
# fall back to the old one on older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
word_vec = pd.DataFrame(tfidf_vec.toarray(), columns=feature_names)
# "id" and "target" can occur as ordinary words in the excerpts; rename those
# term columns so they cannot collide with the metadata columns of train_raw.
word_vec = word_vec.rename(columns={'id':'id_word','target':'target_word'})
train_raw = pd.concat([train_raw, word_vec], axis=1)

In [None]:
# The raw text is no longer needed once the features are extracted.
train = train_raw.drop(columns='excerpt')

In [None]:
# calculate Average Words per Sentence (AWS)
# An excerpt in which no sentence was detected is treated as a single sentence;
# dividing by a count of 0 would produce inf, which the later scaling step cannot handle.
train['AWS'] = train.No_Of_Words / train.No_Of_Sentences.clip(lower=1)

In [None]:
# Average Syllables per Word (ASW) = syllable count / word count, per excerpt
train['ASW'] = train['No_Of_Syllables'].div(train['No_Of_Words'])

In [None]:
train.tail()

In [None]:
# The raw counts were only needed to derive AWS and ASW; remove them.
train = train.drop(columns=['No_Of_Words','No_Of_Syllables','No_Of_Sentences'])

In [None]:
train.head(10)

In [None]:
# Standardize the four hand-crafted features with StandardScaler (each column
# is shifted to zero mean and scaled to unit variance; values are not bounded).
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_cols = ['AWS','ASW','word_difficulty','syllables_difficulty']
train[scaled_cols] = scaler.fit_transform(train[scaled_cols])

In [None]:
train.head(10)

In [None]:
# test = train.iloc[0:7, :]
# train_model = train.iloc[8:, :]


In [None]:
# Separate the rows to predict (blank target placeholder) from the labelled rows.
is_test = train.target == ''
# Independent copies, so later column assignments on these frames neither
# touch `train` nor raise SettingWithCopyWarning.
test = train[is_test].copy()
train_model = train[~is_test].copy()
# The combined column mixes '' placeholders with numbers; once the placeholders
# are gone, store the labels as floats so the estimators get a numeric target.
train_model['target'] = train_model['target'].astype(float)

In [None]:
# Hold out 20% of the labelled rows as a validation set (fixed seed for repeatability).
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(
    train_model.drop(columns=['id','target']),
    train_model['target'],
    test_size=0.2,
    random_state=0,
)


In [None]:
ytrain.head()

In [None]:
# Fit a gradient-boosting regressor and report its MSE on the validation split.

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

gb_params = dict(
    n_estimators=133,
    min_samples_leaf=15,
    subsample=0.83,
    max_depth=4,
    learning_rate=0.299,
    max_features='sqrt',
    min_samples_split=160,
    random_state=0,
)
gb_model = GradientBoostingRegressor(**gb_params)
gb_model.fit(Xtrain, ytrain)
y_pred1 = gb_model.predict(Xtest)
# MSE is symmetric in its two arguments; pass them in the documented (y_true, y_pred) order.
mean_squared_error(ytest, y_pred1)

In [None]:
# Fit a linear support-vector regressor and report its MSE on the validation split.
from sklearn.svm import LinearSVR
# Seeded like the other stochastic steps of this notebook so the score is repeatable.
model_svr = LinearSVR(random_state=0)
model_svr.fit(Xtrain, ytrain)
y_pred1 = model_svr.predict(Xtest)
mean_squared_error(y_pred1, ytest)

In [None]:
from sklearn.linear_model import ElasticNet, LinearRegression, Ridge, Lasso, RidgeCV
from sklearn.svm import LinearSVR, SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score


# Candidate regressors to compare, keyed by the label used in the plots.
# Estimators with a random component (MLP, LinearSVR, random forest) are seeded
# with random_state=0, matching the seed used elsewhere in this notebook, so
# the comparison gives the same numbers on every run.
models = {'Elastic Net': ElasticNet(),
          'Lasso': Lasso(),
          'LinearRegression': LinearRegression(),
          'MLPRegressor': MLPRegressor(random_state=0),
          'Ridge': Ridge(),
          'LinearSVR': LinearSVR(random_state=0),
          'RandomForest': RandomForestRegressor(random_state=0),
#           'XGBoost': XGBRegressor(n_estimators=10,eta=0.1, tree_method='hist'), 
          'SVR': SVR(kernel = 'rbf')}

In [None]:
# Fit every candidate on the training split and score it on the validation
# split; results are collected in the iteration order of `models`.
scores = []
for m in models.values():
    print(m)
    preds = m.fit(Xtrain, ytrain).predict(Xtest)
    scores.append((mean_squared_error(ytest, preds), r2_score(ytest, preds)))

mean_squared_errors = [mse for mse, _ in scores]
r2_scores = [r2 for _, r2 in scores]

In [None]:
# Validation MSE of every candidate model, as a line chart and as raw numbers.
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(12,5))
ax.plot(list(models.keys()), mean_squared_errors, color='r', marker='o')
ax.set_ylabel('Mean Squared Error')
ax.set_title('Mean Squared Error by Model')
plt.show()
print(mean_squared_errors)

In [None]:
# Validation R2 score of every candidate model, as a line chart and as raw numbers.
fig, ax = plt.subplots(figsize=(12,5))
ax.plot(list(models.keys()), r2_scores, color='b', marker='*')
ax.set_ylabel('Coeff. of determination (R2 Score)')
ax.set_title('R2 score by Model')
plt.show()

print(r2_scores)

In [None]:
# Best model: Ridge regression with alpha=0.6, scored by MSE on the validation split.

model = Ridge(alpha=0.6).fit(Xtrain, ytrain)
y_pred1 = model.predict(Xtest)
# MSE is symmetric in its two arguments; pass them in the documented (y_true, y_pred) order.
mean_squared_error(ytest, y_pred1)

In [None]:
# predict the test rows with `model`, using the same feature columns as in training
test_features = test.drop(columns=['id','target'])
test_pred = model.predict(test_features)
test_pred

In [None]:
# Work on an independent copy: a plain `submission = test` only creates a second
# name for the same frame, so filling in predictions would also overwrite `test`
# (and, because `test` is a filtered selection of `train`, can raise SettingWithCopyWarning).
submission = test.copy()

In [None]:
# store the predictions, rounded to 6 decimal places
submission['target'] = test_pred.round(6)

In [None]:
# keep only the two columns that go into the submission file
submission = submission.loc[:, ['id','target']]

In [None]:
submission

In [None]:
# Write the predictions to submission.csv in the working directory.
# index=False keeps the DataFrame's row index out of the file.
submission.to_csv('submission.csv', index=False)