## CommonLit Readability with KerasRegressor

In [None]:
# import libraries
import gzip
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [None]:
# Kaggle exposes the attached datasets read-only under "../input/".
# Walk that directory tree and print the full path of every file in it.

import os
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
#!kaggle competitions download -c commonlitreadabilityprize

In [None]:
# get the train data
# Read the competition training CSV into a DataFrame and preview its first rows.
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
train.head()

In this notebook we use only the `excerpt` (X) and `target` (y) columns, so the other columns are not needed at this stage.

In [None]:
# get the test data
# Read the competition test CSV into a DataFrame and preview its first rows.
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
test.head()

**Preprocessing**

In [None]:
# Select the model input (excerpt text) and the regression label (target)
# from the training frame, plus the excerpt text of the test frame.
X, y = train['excerpt'], train['target']
test_text = test["excerpt"]

In [None]:
# Lower-case the training and the test excerpts.
X, test_text = (texts.str.lower() for texts in (X, test_text))

In [None]:
# apply Porter Stemmer
# NOTE(review): ps.stem is called once per excerpt, i.e. it receives the whole
# text as its argument rather than individual words. NLTK documents stem() as
# operating on a single word, so this presumably leaves most of the excerpt
# unstemmed -- confirm whether per-token stemming was intended (and whether
# stemmed tokens would still be found in the embedding vocabulary).
from nltk.stem import PorterStemmer
ps = PorterStemmer()
X = X.apply(ps.stem)
test_text = test_text.apply(ps.stem)

In [None]:
# lemmatize the text
from nltk.stem import WordNetLemmatizer
import nltk
# Uncomment to fetch the WordNet corpus if it is not installed yet.
#nltk.download('wordnet')

# NOTE(review): wnl.lemmatize is called once per excerpt, i.e. with the whole
# text as the "word" to look up. NLTK documents lemmatize() as taking a single
# word, so this presumably returns each excerpt unchanged -- confirm whether
# per-token lemmatization was intended.
wnl = WordNetLemmatizer()
X = X.apply(wnl.lemmatize)
test_text = test_text.apply(wnl.lemmatize)

In [None]:
import nltk
#nltk.download('stopwords')

from nltk.corpus import stopwords

# English stop words, stored as a set for fast membership tests.
# (The former `", ".join(...)` line built a string that was never used or
# displayed, so it has been dropped.)
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text, stopword_set=None):
    """Return ``text`` with all stop words removed.

    The text is converted with ``str()``, split on whitespace, filtered, and
    re-joined with single spaces, so runs of whitespace are collapsed.

    Parameters
    ----------
    text : object
        The text to clean; any value is accepted and passed through ``str()``.
    stopword_set : collection of str, optional
        Words to drop. Defaults to the module-level ``STOPWORDS`` set.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if stopword_set is None:
        # Resolved at call time so the notebook-level set is used by default.
        stopword_set = STOPWORDS
    return " ".join(word for word in str(text).split() if word not in stopword_set)

# Strip the stop words from every training and test excerpt.
X = X.apply(remove_stopwords)
test_text = test_text.apply(remove_stopwords)

In [None]:
# Collect the training excerpts into a plain Python list (index discarded).
X_new = [text for _, text in X.items()]

In [None]:
# Collect the training targets into a plain Python list (index discarded).
y_new = [score for _, score in y.items()]

In [None]:
# Collect the test excerpts into a plain Python list (index discarded).
real_test = [text for _, text in test_text.items()]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the excerpts. random_state pins the shuffle so that the
# split is the same on every run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y_new, test_size=0.20, random_state=42
)

In [None]:
EMBEDDING_FILE = '../input/glove6b100dtxt/glove.6B.100d.txt'

# Build a {word: vector} lookup from the embedding file. Each line holds a
# word followed by its space-separated coefficients.
embeddings = {}
# The context manager closes the file once it has been read; the encoding is
# given explicitly so parsing does not depend on the platform default.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    for line in embedding_file:
        word, *coefficients = line.rstrip('\n').split(' ')
        embeddings[word] = np.asarray(coefficients, dtype='float32')

In [None]:
# convert features into vectors 
# I took this function from this notebook 
# https://www.kaggle.com/prajittr/commonlit-readability-glove-xgb-nn-rf-ensemble

def get_feature_vectors(sentence, embedding_index=None, dim=100):
    """Return the mean embedding vector of the known words in ``sentence``.

    The sentence is split on whitespace; every word found in the embedding
    lookup contributes its vector, and the sum is divided by the number of
    words that were found. Words missing from the lookup are ignored.

    Parameters
    ----------
    sentence : str
        Whitespace-separated text.
    embedding_index : mapping of str to array, optional
        Word-to-vector lookup. Defaults to the module-level ``embeddings``.
    dim : int, optional
        Length of the embedding vectors (default 100).

    Returns
    -------
    numpy.ndarray
        Vector of shape ``(dim,)``. An all-zero vector is returned when the
        sentence is empty or none of its words are in the lookup (previously
        the latter case divided by zero and produced NaN values).
    """
    if embedding_index is None:
        # Resolved at call time so the notebook-level lookup is used by default.
        embedding_index = embeddings

    feature_vec = np.zeros((dim,), dtype="float32")
    found = 0
    for word in sentence.split():
        vector = embedding_index.get(word)
        if vector is None:
            # Out-of-vocabulary word: skip it explicitly instead of relying on
            # a bare except around the addition.
            continue
        feature_vec = np.add(feature_vec, vector)
        found += 1

    if found:
        feature_vec = np.divide(feature_vec, found)
    return feature_vec

In [None]:
# Turn every excerpt of each split into one feature vector and stack them
# into a 2-D array per split.
train_vectors = np.array(list(map(get_feature_vectors, X_train)))
test_vectors = np.array(list(map(get_feature_vectors, X_test)))
real_test_vectors = np.array(list(map(get_feature_vectors, real_test)))

# Targets as NumPy arrays to match the feature matrices.
y_train, y_test = np.array(y_train), np.array(y_test)

# Shape check: features and targets of a split must have the same row count.
print(train_vectors.shape, y_train.shape)
print(test_vectors.shape, y_test.shape)
print(real_test_vectors.shape)

In [None]:
# Feature matrix and target vector for the complete training set (no hold-out).
vectors_full = np.array(list(map(get_feature_vectors, X_new)))
y_full = np.array(y_new)

**Model**

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam


In [None]:
def create_model(input_dim=100, hidden_units=(512, 256, 128), activation='linear',
                 dropout_rate=0.25):
    """Build and compile the fully connected regression network.

    Called without arguments this reproduces the original architecture:
    three Dense layers (512, 256, 128 units), dropout of 0.25, and a single
    linear output unit, compiled with Adam and mean squared error.

    Parameters
    ----------
    input_dim : int, optional
        Length of each input feature vector (default 100).
    hidden_units : sequence of int, optional
        Width of each hidden Dense layer, in order.
    activation : str, optional
        Activation of the hidden layers. NOTE(review): with the default
        ``'linear'`` the hidden layers add no non-linearity, so the stack can
        only represent a linear function of its input; pass e.g. ``'relu'``
        to make the extra layers effective.
    dropout_rate : float, optional
        Dropout rate applied after the last hidden layer.

    Returns
    -------
    keras.Sequential
        The compiled model.
    """
    model = keras.Sequential()
    model.add(keras.Input(shape=(input_dim,)))
    for units in hidden_units:
        model.add(layers.Dense(units=units, kernel_initializer='normal',
                               activation=activation))
    model.add(layers.Dropout(dropout_rate))
    # the linear output layer: one unbounded value per sample
    model.add(layers.Dense(units=1, kernel_initializer='normal', activation='linear'))

    model.compile(optimizer='adam', loss='mean_squared_error')

    return model

In [None]:
# Build one instance of the network and print its layer/parameter summary.
model = create_model()
model.summary()

In [None]:
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold

# The hyper-parameter search below is kept for reference but is disabled:
# every statement is commented out, so only the imports above are executed.

# Create a KerasRegressor
#model_KR = KerasRegressor(build_fn = create_model)

# define the parameters to try out
#params = {'batch_size':[16, 32, 128], 'epochs':[10, 20, 50]}

# define RandomizedSearchCV
#random_searcher = RandomizedSearchCV(model_KR, param_distributions = params, cv = KFold(5))

# fit the search on the full training set
#random_searcher.fit(vectors_full, y_full)

# take a look at the results
#print(random_searcher.best_params_)
#print(random_searcher.best_score_)

# NOTE(review): `kfolds` is not defined at this point in the notebook (it is
# first assigned in a later cell), so this line would fail if re-enabled here.
#print('The mean accuracy:', kfolds.mean())

In [None]:
# Wrap the network as a scikit-learn regressor with the chosen batch size and epochs
model_KR = KerasRegressor(build_fn = create_model, batch_size = 16, epochs = 50)

# Score the regressor with 10-fold cross-validation on the full training set
kfolds = cross_val_score(model_KR, vectors_full, y_full, cv = 10)

# This is a regression task, so the fold scores are not accuracies: they are
# whatever the regressor's score() method returns.
# NOTE(review): for the keras.wrappers.scikit_learn wrapper this is presumably
# the negated loss (negative MSE here) -- confirm for the installed version.
print('CV score per fold:', kfolds)
print('Mean CV score (higher is better):', kfolds.mean())

In [None]:
# use callbacks
# Imported from tensorflow.keras so the callbacks come from the same Keras
# implementation as the model built in create_model.
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

# The model is compiled with a loss only (no metrics), so the only validation
# quantity logged during fit is `val_loss`. Monitoring `val_accuracy` would
# leave these callbacks without a value to act on.
# ModelCheckpoint also needs a real file path; an empty string is not one.
checkpoint = ModelCheckpoint("best_model.h5", monitor="val_loss", verbose=1, save_best_only=True)
# NOTE(review): reduce_lr and early_stop share the same patience, so training
# may stop at the same epoch the learning rate would first be reduced --
# consider a smaller patience for reduce_lr.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, min_lr=1e-6, verbose=1)
early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, mode='auto', restore_best_weights=True)

In [None]:
# Train on the full training set with 40% of the samples held out for
# validation; the callbacks are passed through to the underlying fit call.
fit_options = {
    'validation_split': 0.4,
    'batch_size': 16,
    'epochs': 50,
    'callbacks': [early_stop, checkpoint, reduce_lr],
}
history = model_KR.fit(vectors_full, y_full, **fit_options)

#callbacks=[reduce_lr, early_stop]

In [None]:
# Tabulate the per-epoch training log and plot training vs. validation loss,
# leaving out the first row (index 0).
history_df = pd.DataFrame(history.history)
loss_curves = history_df.loc[1:, ['loss', 'val_loss']]
loss_curves.plot()

In [None]:
# Preview the first rows of the per-epoch training log.
history_df.head()

In [None]:
# predict 
# Run the fitted regressor on the feature vectors of the competition test set.
pred_test = model_KR.predict(real_test_vectors)

In [None]:
# Convert the predictions to a plain list (one entry per test excerpt) and show it.
pred_test_list = list(pred_test)
pred_test_list

In [None]:
# Assemble the submission table (test ids next to their predicted targets),
# write it to the Kaggle working directory and preview the first rows.
submission_columns = {'id' : test['id'], 'target' : pred_test_list}
submission = pd.DataFrame(submission_columns)
submission.to_csv('/kaggle/working/submission.csv', index=False)
submission.head(7)