# importing libraries

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import mean_squared_error
from math import sqrt

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Read Data

In [None]:
# Load the training CSV into a DataFrame and preview the first rows.
df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
df.head()

# Text Preprocessing

In [None]:
import re
import string
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


def preprocessing_text(text):
    """Normalize a raw excerpt into a space-separated string of cleaned tokens.

    Steps, in order: lowercase, drop runs of digits, drop ASCII punctuation,
    trim surrounding whitespace, tokenize, remove English stop words, then
    stem and lemmatize each remaining token.

    Args:
        text: The raw string to clean.

    Returns:
        The processed tokens joined by single spaces (an empty string when
        no token survives the filtering).
    """
    text = text.lower()
    # `\d+` matches runs of digits; the unescaped `d+` used previously
    # deleted every letter "d" from the text instead.
    text = re.sub(r'\d+', '', text)
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # remove leading/trailing whitespace
    text = text.strip()

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    # Each surviving token is replaced by its stemmed, then lemmatized form.
    # The earlier loops appended the stems (and then the lemmas, with no
    # separator) to the end of the text, which duplicated the content and
    # fused the lemmas into one long unusable token.
    tokens = [
        lemmatizer.lemmatize(stemmer.stem(word))
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return ' '.join(tokens)

## Apply preprocessing on dataset

In [None]:
# Clean every excerpt and store the result in a new column. Assigning the
# whole column at once avoids the chained indexing `df[col][i] = ...`, which
# raises SettingWithCopyWarning and does not modify `df` at all when pandas
# copy-on-write is enabled.
df['process_text'] = df['excerpt'].apply(preprocessing_text)

In [None]:
# Preview the DataFrame, now including the `process_text` column.
df.head()

## Separate dependent and independent variables

In [None]:
# Inputs are the cleaned excerpts; the value to predict is the `target` column.
X = df['process_text']
y = df['target']

# Hold out 5% of the rows for evaluation. With shuffle=False the split keeps
# the original row order, so the held-out rows are the last ones in the file.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    shuffle=False,
)

# Apply TF-IDF

In [None]:
# Create the TF-IDF vectorizer with its default settings.
tfidf_vectorizer = TfidfVectorizer() 

# Learn the vocabulary and IDF weights from the training text only,
# and transform the training text in the same step.
tfidf_train = tfidf_vectorizer.fit_transform(X_train) 

# Transform the held-out text with the vocabulary fitted above
# (no re-fitting, so no information from the held-out rows is used).
tfidf_test = tfidf_vectorizer.transform(X_test)

In [None]:
# Compare the shapes of the TF-IDF matrices with the text series they came from.
tfidf_train.shape, X_train.shape, tfidf_test.shape, X_test.shape

In [None]:
# Inspect the container type returned by the vectorizer.
type(tfidf_train)

In [None]:
# Convert the sparse TF-IDF matrices to dense pandas DataFrames.
# NOTE: `.toarray()` materializes every zero entry, so memory use grows with
# rows x vocabulary size.
tfidf_train_df = pd.DataFrame(tfidf_train.toarray())
tfidf_test_df = pd.DataFrame(tfidf_test.toarray())

# Neural Network

In [None]:
# Keras Tuner helps choose the number of layers and the number of neurons
# in each layer; a random search is used below.
!pip install -U keras-tuner

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from kerastuner.tuners import RandomSearch

## Define a model-building function

In [None]:
def build_model(hp):
    """Build and compile a dense regression network for Keras Tuner.

    The tuner samples the number of hidden layers, the width of each hidden
    layer, and the Adam learning rate through `hp`.

    Args:
        hp: The hyperparameter object supplied by the tuner.

    Returns:
        A compiled `keras.Sequential` model with ReLU hidden layers followed
        by a single linear output unit, trained with mean squared error.
    """
    model = keras.Sequential()
    # Between 2 and 30 hidden layers; each layer's width is tuned separately
    # from 20 up to 1000 units in steps of 32.
    for i in range(hp.Int('num_layers', 2, 30)):
        model.add(layers.Dense(units=hp.Int('units_' + str(i),
                                            min_value=20,
                                            max_value=1000,
                                            step=32),
                               activation='relu'))
    # The output layer and the compile step run once, after all hidden layers
    # are in place. They were previously inside the loop, which inserted a
    # 1-unit linear bottleneck after every hidden layer.
    model.add(layers.Dense(1, activation='linear'))
    model.compile(
        # The learning rate is restricted to one of three candidate values.
        optimizer=keras.optimizers.Adam(
            hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
        loss=keras.losses.MeanSquaredError(),
        metrics=['mse'])
    return model

## You can increase `executions_per_trial` and `max_trials` to try to reduce the RMSE

In [None]:
# Random search over the hyperparameters declared in `build_model`,
# ranking trials by validation MSE.
tuner = RandomSearch(
    build_model,
    objective='val_mse',
    max_trials=3,  # number of hyperparameter combinations to try
    executions_per_trial=3, # number of models built and fit per combination
    directory='weights',  # trial results are written under weights/commonLit
    project_name='commonLit'
)

In [None]:
# Print the hyperparameters the tuner will search over.
tuner.search_space_summary()

### `tuner.search()` takes the same arguments as `model.fit()`

In [None]:
# Run the hyperparameter search, training each candidate for 15 epochs.
# NOTE(review): the validation data here is the same held-out split that is
# later used to report RMSE, so that RMSE is measured on data the model
# selection has already seen.
tuner.search(tfidf_train_df, y_train,
             epochs=15,
             validation_data=(tfidf_test_df, y_test))

In [None]:
# Print a summary of the completed trials.
tuner.results_summary()

In [None]:
# Retrieve the two best models found by the search (best first).
models = tuner.get_best_models(num_models=2)

In [None]:
# Predict on the held-out rows with the best model.
y_pred = models[0].predict(tfidf_test_df)

In [None]:
# Root mean squared error of the best model's predictions on the held-out rows.
from sklearn.metrics import mean_squared_error
from math import sqrt
rmse = sqrt(mean_squared_error(y_test, y_pred))
print('Root mean Square is :',rmse)

## Best Model

In [None]:
# Print the layer structure of the best model.
models[0].summary()

In [None]:
from keras.models import load_model

models[0].save('CommonLit NN.h5')  # writes the best model to the file 'CommonLit NN.h5'

# To restore the saved model later, load it from the same file name:
# model = load_model('CommonLit NN.h5')