In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Loading train and test data

In [None]:
# Read the competition's training CSV and test CSV into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/commonlitreadabilityprize/train.csv",
        "../input/commonlitreadabilityprize/test.csv",
    )
)

## Populating GloVe embeddings

In [None]:
EMBEDDING_FILE = '../input/glove6b100dtxt/glove.6B.100d.txt'

# Map each token to its embedding vector (float32). Every line of the file is
# parsed as a token followed by space-separated vector components.
embeddings = {}
# Use a context manager so the file handle is closed even if parsing fails, and
# pin the encoding so decoding does not depend on the platform default
# (assumes this copy of the GloVe file is UTF-8 - TODO confirm).
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    for line in embedding_file:
        line = line.rstrip()
        if not line:
            # Skip blank lines instead of registering an empty token.
            continue
        # Split once per line: first field is the token, the rest is the vector.
        word, *components = line.split(" ")
        embeddings[word] = np.asarray(components, dtype='float32')

## Define function to obtain feature vector for each datapoint

In [None]:
def get_feature_vectors(sentence, embedding_lookup=None, dim=100):
    """Return the mean embedding of the whitespace-separated words in `sentence`.

    Words that have no entry in the lookup are ignored, so the mean is taken
    over the known words only.

    Parameters
    ----------
    sentence : str
        Text to embed; it is tokenised with ``str.split()``.
    embedding_lookup : mapping of str -> array-like, optional
        Token-to-vector table. Defaults to the notebook-level ``embeddings``
        dictionary, which is what existing one-argument calls use.
    dim : int, optional
        Length of each embedding vector. The default of 100 matches the
        ``glove.6B.100d`` file loaded into ``embeddings``.

    Returns
    -------
    numpy.ndarray
        float32 vector of shape ``(dim,)``. It is all zeros when the sentence
        is empty or none of its words is in the lookup (previously that case
        divided by zero and produced NaNs).
    """
    if embedding_lookup is None:
        embedding_lookup = embeddings
    feature_vec = np.zeros((dim,), dtype="float32")
    known_words = 0
    for word in sentence.split():
        vector = embedding_lookup.get(word)
        if vector is None:
            # Out-of-vocabulary word: leave it out of both the sum and the count.
            continue
        feature_vec += vector
        known_words += 1
    if known_words > 0:
        feature_vec /= known_words
    return feature_vec

## Separating and pre-processing data 

In [None]:
# Pull the text column out of both frames and the regression target out of the training frame.
excerpts, target = train_data["excerpt"], train_data["target"]
test_excerpts = test_data["excerpt"]

In [None]:
# Lower-case every excerpt in the training and test series.
excerpts, test_excerpts = (texts.str.lower() for texts in (excerpts, test_excerpts))

In [None]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
# NOTE(review): `ps.stem` receives each complete excerpt string here, not the
# individual words inside it. A stemmer is normally called on one word at a
# time, so this presumably leaves almost all of each excerpt unstemmed -
# confirm the intent. Stemming word by word would change which tokens match
# the embedding vocabulary, so the existing behaviour is kept as is.
excerpts, test_excerpts = (texts.apply(ps.stem) for texts in (excerpts, test_excerpts))

In [None]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize each whitespace-separated word of `text` and re-join with single spaces.

    `WordNetLemmatizer.lemmatize` works on a single word. Passing it a whole
    excerpt (as this cell used to) finds no lemma for the multi-word string and
    returns the text unchanged, so the words are lemmatized one at a time here.
    Runs of whitespace collapse to one space, which is harmless because the
    later steps tokenise with `str.split()` as well.
    """
    return " ".join(wnl.lemmatize(word) for word in str(text).split())


excerpts = excerpts.apply(lemmatize_text)
test_excerpts = test_excerpts.apply(lemmatize_text)

In [None]:
from nltk.corpus import stopwords
# English stop words, held in a set for fast membership tests.
# (The former `", ".join(...)` line built a string and threw it away, so it was removed.)
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` with every whitespace-separated token that is in STOPWORDS dropped.

    The input is converted with `str()` first and the surviving tokens are
    joined back together with single spaces.
    """
    kept_tokens = []
    for token in str(text).split():
        if token not in STOPWORDS:
            kept_tokens.append(token)
    return " ".join(kept_tokens)

# Strip stop words from every excerpt in both series.
excerpts = excerpts.apply(remove_stopwords)
test_excerpts = test_excerpts.apply(remove_stopwords)

In [None]:
# Plain Python list of the pre-processed training excerpts, in Series order.
excerpts_full = excerpts.tolist()

In [None]:
# Plain Python list of the training targets, in the same order as the excerpts.
y_full = target.tolist()

In [None]:
# Plain Python list of the pre-processed test excerpts, in Series order.
excerpts_test = test_excerpts.tolist()

## Splitting train and validation sets

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training data for validation. The shuffle seed is fixed
# so the split - and every validation score computed from it - is the same on
# each "Restart & Run All".
excerpts_train, excerpts_val, y_train, y_val = train_test_split(
    excerpts_full, y_full, test_size=0.20, random_state=42
)

## Obtain the feature vectors for each dataset as NumPy array

In [None]:
# Turn every excerpt of each split into its feature vector and collect the
# vectors of a split into one NumPy array.
train_vectors, val_vectors, test_vectors, full_vectors = (
    np.array([get_feature_vectors(text) for text in split])
    for split in (excerpts_train, excerpts_val, excerpts_test, excerpts_full)
)
# Convert the target lists to NumPy arrays as well.
y_train, y_val, y_full = np.array(y_train), np.array(y_val), np.array(y_full)

## Neural network model 

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
# Fully connected regressor: 100 inputs -> 64 -> 32 -> 8 ReLU units -> 1 output.
model = keras.Sequential()
model.add(layers.Dense(units=64, activation='relu', input_shape=[100]))
for hidden_units in (32, 8):
    model.add(layers.Dense(units=hidden_units, activation='relu'))
# the linear output layer
model.add(layers.Dense(units=1))

In [None]:
# Define callbacks
# Stop once val_loss has failed to improve by at least 0.001 for 5 consecutive
# epochs, and restore the best weights seen during training.
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    min_delta=0.001,
    restore_best_weights=True,
)

# Cut the learning rate to a tenth after 2 epochs without val_loss improvement.
# The patience must be shorter than the early-stopping patience (5) and the
# 50-epoch budget; the previous value of 100 meant this callback never fired.
rlrop = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2)

In [None]:
# Optimise mean absolute error with Adam. `learning_rate` is the supported
# argument name; the old `lr` alias is deprecated and rejected by newer Keras.
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='mae',
)

In [None]:
# Fit on the training split; the validation split is evaluated during training
# and drives the two callbacks.
history = model.fit(
    x=train_vectors,
    y=y_train,
    batch_size=64,
    epochs=50,
    validation_data=(val_vectors, y_val),
    callbacks=[early_stopping, rlrop],
)

In [None]:
# Plot the training and validation loss recorded in the fit history.
history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].plot()

In [None]:
from sklearn.metrics import mean_absolute_error
# Score the neural network on the validation split.
val_predictions_nn = model.predict(val_vectors)
nn_val_mae = mean_absolute_error(y_val,val_predictions_nn)
# The label used to read "Random Forest", which mislabelled this neural network score.
print("Validation MAE for Neural Network Model: {}".format(nn_val_mae))

## Random forest model

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
# Random forest regressor with a fixed seed, fitted on the training split.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X=train_vectors, y=y_train)

In [None]:
# Score the random forest on the validation split.
val_predictions_rf = rf_model.predict(X=val_vectors)
rf_val_mae = mean_absolute_error(y_true=y_val, y_pred=val_predictions_rf)
print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))

## XGB Regressor

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# Gradient-boosted regressor: 500 estimators with a learning rate of 0.05.
xgb_model = XGBRegressor(learning_rate=0.05, n_estimators=500)

# Train it on the training split
xgb_model.fit(X=train_vectors, y=y_train)

In [None]:
# Get predictions for the validation set
val_predictions_xgb = xgb_model.predict(val_vectors)

# Calculate MAE. Arguments follow scikit-learn's (y_true, y_pred) order, as in
# the other validation cells; MAE is symmetric, so the value is unchanged.
xgb_val_mae = mean_absolute_error(y_val, val_predictions_xgb)
print("Validation MAE for XGB Model: {}".format(xgb_val_mae))
# Show the first prediction as a quick sanity check.
print(val_predictions_xgb[0])

## Check validation score with a combination of the three models

In [None]:
# Average the three models' validation predictions element-wise. The Keras
# output is indexed as an (n, 1) column, so take column 0 to line it up with
# the 1-D random-forest and XGBoost outputs.
ensemble_avg = (val_predictions_nn[:, 0] + val_predictions_rf + val_predictions_xgb) / 3
# Keep the result as a list of Python floats, as before.
ensemble_preds = [float(pred) for pred in ensemble_avg]

In [None]:
# Calculate MAE. Arguments follow scikit-learn's (y_true, y_pred) order, as in
# the per-model cells; MAE is symmetric, so the value is unchanged.
ensemble_val_mae = mean_absolute_error(y_val, ensemble_preds)
print("Validation MAE for Ensemble Model: {}".format(ensemble_val_mae))

## Re-train models on full data

In [None]:
# No validation data is passed when refitting on the full training set, so
# `val_loss` does not exist and callbacks that monitor it would never act.
# Monitor the training loss instead.
# NOTE(review): this continues training the already-fitted `model` rather than
# starting from fresh weights - confirm that is intended.
full_fit_callbacks = [
    callbacks.EarlyStopping(
        monitor='loss',
        patience=5,
        min_delta=0.001,
        restore_best_weights=True,
    ),
    ReduceLROnPlateau(monitor='loss', factor=0.1, patience=2),
]
model.fit(
    full_vectors, y_full,
    batch_size=64,
    epochs=50,
    callbacks=full_fit_callbacks,
)

In [None]:
# Refit the random forest on every labelled excerpt.
rf_model.fit(X=full_vectors, y=y_full)

In [None]:
# Refit the XGBoost regressor on every labelled excerpt.
xgb_model.fit(X=full_vectors, y=y_full)

## Obtain test predictions

In [None]:
# Single-model test predictions as a flat array: rf_model.predict(test_vectors)
# for RF, or swap in model.predict(test_vectors) for the NN.
test_preds = rf_model.predict(test_vectors).flatten()

In [None]:
# Predict the test set with each of the three models.
nn_preds = model.predict(test_vectors)
rf_preds = rf_model.predict(test_vectors)
xgb_preds = xgb_model.predict(test_vectors)
# Element-wise mean of the three predictions. The Keras output is indexed as an
# (n, 1) column, so take column 0 to line it up with the 1-D RF/XGB outputs.
ensemble_test_avg = (nn_preds[:, 0] + rf_preds + xgb_preds) / 3
# Keep the result as a list of Python floats, as before.
ensemble_test_preds = [float(pred) for pred in ensemble_test_avg]

## Prepare submission format

In [None]:
sample_submission = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')
# Assign with [] rather than attribute syntax: if the column were ever missing,
# `df.target = ...` would only set a Python attribute and the predictions would
# silently be left out of the CSV.
# NOTE(review): assumes the rows of sample_submission.csv are in the same order
# as test.csv - confirm.
sample_submission["target"] = ensemble_test_preds #test_preds
sample_submission.to_csv('submission.csv',index=False)