In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Loading required packages

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks

## Set seed to get the same results each time

In [None]:
# Seed NumPy's global random number generator so NumPy-driven randomness repeats across runs.
# NOTE(review): only NumPy is seeded here; TensorFlow/Keras keeps its own RNG state —
# confirm whether it should be seeded as well for the neural-network cell.
np.random.seed(0)

## Load the training data

In [None]:
# Read the competition training CSV into a DataFrame (path is relative to the working directory).
full_data = pd.read_csv("../input/commonlitreadabilityprize/train.csv")

## Separate the text from the target (readability score)

In [None]:
# Pull the passage column and the score column out of the training frame as two Series.
excerpts, target = full_data["excerpt"], full_data["target"]

## Text processing 1: Converting to lower case

In [None]:
# Lower-case every excerpt so later token matching is case-insensitive.
excerpts = excerpts.str.lower()

## Text processing 2: Performing stemming 

In [None]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
# Series.apply passes each *whole excerpt string* to ps.stem in a single call.
# NOTE(review): PorterStemmer.stem appears to be designed for a single word, so this
# probably does not stem the individual tokens of the passage — confirm. If it is changed
# to per-token stemming, the test-set preprocessing cell (which also calls ps.stem on
# whole excerpts) must be changed identically to keep train and test features aligned.
excerpts = excerpts.apply(ps.stem)

## Text processing 3: Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
# Series.apply passes each *whole excerpt string* to wnl.lemmatize in a single call.
# NOTE(review): WordNetLemmatizer.lemmatize appears to expect one word, so on a full
# passage this is presumably a no-op — confirm. Any per-token fix must be mirrored in the
# test-set preprocessing cell, which calls wnl.lemmatize the same way.
excerpts = excerpts.apply(wnl.lemmatize)

## Text processing 4: Removing stopwords

In [None]:
from nltk.corpus import stopwords

# Build the stopword lookup once; a set keeps the per-token membership test cheap.
STOPWORDS = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` with every token that appears in STOPWORDS dropped.

    The input is coerced with ``str()`` and split on whitespace; the surviving
    tokens are re-joined with single spaces, so original spacing is not preserved.
    Matching is exact (case-sensitive) against STOPWORDS.
    """
    return " ".join(word for word in str(text).split() if word not in STOPWORDS)


# Apply the filter to every excerpt.
excerpts = excerpts.apply(remove_stopwords)

## Create a train-test split

In [None]:
# Hold out 30% of the excerpts (and their targets) as a validation split.
# NOTE(review): no random_state is passed — presumably repeatability relies on the
# earlier np.random.seed(0) call; confirm, or pass random_state explicitly.
excerpts_train, excerpts_val, y_train, y_val = train_test_split(excerpts, target, test_size=0.30)

## Fit count vectorizer based on the training vocabulary

In [None]:
# Learn the bag-of-words vocabulary from the training excerpts only, so the
# validation split does not influence the vocabulary.
vectorizer = CountVectorizer()
vectorizer.fit(excerpts_train)

## Transform training and validation sets to count vectors

In [None]:
# Encode both splits with the fitted count vectorizer.
# NOTE: X_train and X_val are reassigned by the TF-IDF cell that follows, so these
# count vectors are only used for the diagnostic print below.
X_train = vectorizer.transform(excerpts_train)
X_val = vectorizer.transform(excerpts_val)
# Prints .size of the fourth training row — presumably the number of stored (non-zero)
# entries of that sparse row; confirm against the SciPy sparse matrix docs.
print(X_train[3].size)

In [None]:
# Replace the count features with TF-IDF features, capped at 5000 vocabulary terms.
# The vectorizer is fitted on the training split and only applied to the validation split.
vec = TfidfVectorizer(max_features = 5000)
# .toarray() densifies the result; the PCA cell below consumes these dense arrays.
X_train = vec.fit_transform(excerpts_train).toarray()
X_val = vec.transform(excerpts_val).toarray()

In [None]:
from sklearn.decomposition import PCA
# Reduce the TF-IDF features to 1000 principal components.
# The projection is fitted on the training split only and then applied to both splits;
# from here on X_train and X_val have n_components columns.
principal=PCA(n_components=1000)
principal.fit(X_train)
X_train = principal.transform(X_train)
X_val = principal.transform(X_val)

## Train random forest, linear regression, neural network and CatBoost models

In [None]:
# Random forest baseline on the PCA-reduced features; random_state fixes the forest's own randomness.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train,y_train)

In [None]:
# Ordinary least-squares baseline on the same PCA-reduced features.
reg = LinearRegression()
reg.fit(X_train,y_train)

In [None]:
# Fully connected regression network: four ReLU hidden layers and one linear output unit.
# The input width is read from X_train instead of being hard-coded. The features were
# reduced by PCA (n_components=1000) before this cell, so the previous input_shape=[5000]
# no longer matched the data and fit() would fail on the shape mismatch.
dl_model = keras.Sequential([
    # the hidden ReLU layers
    layers.Dense(units=1024, activation='relu', input_shape=[X_train.shape[1]]),
    layers.Dense(units=512, activation='relu'),
    layers.Dense(units=128, activation='relu'),
    layers.Dense(units=32, activation='relu'),
    # the linear output layer 
    layers.Dense(units=1),
])

# Define an early stopping callback: stop after 5 epochs without an improvement of at
# least 0.001 in the monitored quantity, then roll back to the best weights seen.
early_stopping = callbacks.EarlyStopping(
    patience=5,
    min_delta=0.001,
    restore_best_weights=True,
)

# Optimise mean absolute error, the same metric reported for the other models.
dl_model.compile(
    optimizer='adam',
    loss='mae',
)

# Train for at most 100 epochs; the validation split drives early stopping.
history = dl_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=100,
    callbacks=[early_stopping]
)

In [None]:
import catboost
from catboost import CatBoostRegressor
from catboost import Pool, cv
from sklearn.model_selection import GridSearchCV

## Prepare CatBoost Pool objects for the training and validation sets
pool_train=Pool(X_train,y_train)
pool_val=Pool(X_val,y_val)
### Cross-validation helper used to choose the number of boosting iterations
def modelfit(params,poolX,useTrainCV=True,cv_folds=5,early_stopping_rounds=40):
    """Run CatBoost cross-validation on a Pool and return its result.

    Args:
        params: dict of CatBoost training parameters, passed to ``cv`` unchanged.
        poolX: the CatBoost ``Pool`` (features and labels) to cross-validate on.
        useTrainCV: when false, cross-validation is skipped entirely.
        cv_folds: number of folds, forwarded as ``nfold``.
        early_stopping_rounds: forwarded to ``cv`` as its early-stopping setting.

    Returns:
        Whatever ``cv`` returns (per the CatBoost docs, a per-iteration metrics
        table by default), or ``None`` when ``useTrainCV`` is false. Previously
        the false case raised UnboundLocalError because the result variable was
        never assigned.
    """
    if not useTrainCV:
        return None
    # plot=True asks CatBoost to draw its interactive training chart in the notebook.
    return cv(params=params, pool=poolX,nfold=cv_folds,early_stopping_rounds=early_stopping_rounds,plot=True)

## Parameters passed to the cross-validation run (only the loss function is set)
params={
    'loss_function':'RMSE'
}

### n_est holds the value returned by modelfit, i.e. the cross-validation result
### computed on the training pool; the iteration count is read from it by inspection.
n_est=modelfit(params,pool_train)

In [None]:
from sklearn import metrics
### Fit the model with iteration=664
# NOTE(review): 664 is hard-coded — presumably read off the cross-validation run above; confirm.
cboost1=CatBoostRegressor(iterations=664,loss_function='RMSE',random_seed=123)
cboost1.fit(X_train,y_train)
#Predict training set:
train_predictions = cboost1.predict(X_train)
#Print model report:
# mean_squared_error returns the MSE; take the square root so the value printed
# actually matches the "Root Mean Square Error" label.
train_rmse = np.sqrt(metrics.mean_squared_error(y_train, train_predictions))
print("\nModel Report Train")
print("Root Mean Square Error : %.4g" % train_rmse)
print("R^2 Score (Train): %f" % metrics.r2_score(y_train, train_predictions))

## Evaluate performance on validation set

In [None]:
# Score the random forest on the held-out validation split (mean absolute error).
readable_preds = rf_model.predict(X_val)
rf_val_mae = mean_absolute_error(y_val,readable_preds)
print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))

In [None]:
# Score the linear regression on the held-out validation split (mean absolute error).
readable_preds = reg.predict(X_val)
reg_val_mae = mean_absolute_error(y_val,readable_preds)
print("Validation MAE for Linear regression Model: {}".format(reg_val_mae))

In [None]:
# Score the neural network on the held-out validation split (mean absolute error).
# NOTE(review): Keras predict presumably returns a 2-D (n, 1) array here while y_val is
# 1-D — confirm the metric treats the two shapes as intended.
readable_preds = dl_model.predict(X_val)
dl_val_mae = mean_absolute_error(y_val,readable_preds)
print("Validation MAE for Deep learning Model: {}".format(dl_val_mae))

In [None]:
# Score the CatBoost model on the held-out validation split (mean absolute error).
readable_preds = cboost1.predict(X_val)
cb_val_mae = mean_absolute_error(y_val,readable_preds)
print("Validation MAE for Catboost Model: {}".format(cb_val_mae))

## Read the test dataset

In [None]:
# Read the competition test CSV into a DataFrame (path is relative to the working directory).
test_data = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

## Extract the text separately

In [None]:
# Keep only the passage text; the test frame is still needed later for its "id" column.
test_excerpts = test_data["excerpt"]

## Performing pre-processing

In [None]:
# Repeat the training-time text pipeline, in the same order: lower-case, stem,
# lemmatize, drop stopwords. Reuses ps, wnl and remove_stopwords defined in earlier cells.
# NOTE(review): as in the training cells, ps.stem and wnl.lemmatize receive each whole
# excerpt string rather than individual tokens — confirm that is intended, and keep this
# cell in sync with the training preprocessing if either is changed.
test_excerpts = test_excerpts.str.lower()
test_excerpts = test_excerpts.apply(ps.stem)
test_excerpts = test_excerpts.apply(wnl.lemmatize)
test_excerpts = test_excerpts.apply(lambda text: remove_stopwords(text))

## Transform the test data using the TF-IDF vectorizer and PCA

In [None]:
#X_test = vectorizer.transform(test_excerpts)

In [None]:
# Apply the already-fitted TF-IDF vectorizer and PCA projection to the test excerpts;
# nothing is re-fitted on test data, so features line up with what the models were trained on.
X_test = vec.transform(test_excerpts).toarray()
X_test = principal.transform(X_test)

## Obtain the predictions for the test set

In [None]:
#test_preds = rf_model.predict(X_test)

In [None]:
#test_preds = reg.predict(X_test)

In [None]:
#test_preds = dl_model.predict(X_test)

In [None]:
# The CatBoost model is the one used for the submission; the other models'
# prediction lines are commented out in the cells above.
test_preds = cboost1.predict(X_test)

## Convert to submission format

In [None]:
# Build the submission frame: one row per test excerpt with its id and predicted target.
# .copy() gives an independent frame so adding a column does not touch test_data.
x_sub = test_data[["id"]].copy()
x_sub["target"] = test_preds
# Write without the index so the CSV contains only the id and target columns.
x_sub.to_csv('submission.csv', index = False)
x_sub