In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, name) for name in filenames]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training set and leave out the columns the model does not use.
unused_train_columns = ['url_legal', 'license', 'standard_error']
train_data = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv').drop(columns=unused_train_columns)
train_data.head(3)

In [None]:
# Load the test set (the rows predicted for the submission) and leave out the unused columns.
unused_test_columns = ['url_legal', 'license']
test_data = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv').drop(columns=unused_test_columns)
test_data.head(3)

In [None]:
# Select the model input X (the excerpt text) and the label Y (the target column)
# from the training frame, plus the test-set excerpts that will be predicted later.
# The model learns to map X to Y.
X_train = train_data['excerpt']
Y_train = train_data['target']
final_test = test_data['excerpt']

In [None]:
# Turn the pandas Series into plain NumPy arrays for the vectoriser and the network.
X_full, Y_full = np.array(X_train), np.array(Y_train)
final_test = np.array(final_test)

In [None]:
# Turn the raw excerpts into fixed-width TF-IDF feature vectors.
# inputShape caps the vocabulary size (max_features) and is also used as the
# width of the network's input layer.
from sklearn.feature_extraction.text import TfidfVectorizer

inputShape = 100
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    max_features=inputShape,
    strip_accents='unicode',
)

# Fit on the training excerpts only, then apply the same fitted vectoriser to the test excerpts.
x_full = vectorizer.fit_transform(X_full).toarray()  # dense training vectors
final_test_vector = vectorizer.transform(final_test).toarray()  # dense test vectors

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasRegressor
#use callbacks to terminate training if result is not improved
from keras.callbacks import Callback, ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

In [None]:
# This cell defines the model: optimizer settings, the layers and their neurons,
# activations and dropout, the loss function, and the training callbacks.

# Adam optimizer settings for the regression network.
adam_settings = {
    "learning_rate": 0.003,
    "beta_1": 0.9,
    "beta_2": 0.999,
    "epsilon": 1e-06,
    "amsgrad": False,
    "name": "Adam",
}
customAdam = keras.optimizers.Adam(**adam_settings)

def create_model():
    """Build and compile the dense regression network.

    The network takes ``inputShape`` features per sample and stacks four
    hidden Dense layers (1024, 512, 256, 128 units) with light dropout,
    ending in a single linear unit that outputs the predicted value.
    It is compiled with mean squared error as the loss and no extra metrics.

    Returns:
        A compiled ``keras.Sequential`` model.
    """
    model = keras.Sequential([
        layers.Dense(units=1024, kernel_initializer='normal', activation='relu', input_shape=[inputShape]),
        layers.Dropout(0.1),
        # NOTE(review): this hidden layer uses a linear activation, unlike the
        # other hidden layers -- confirm that this is intentional.
        layers.Dense(units=512, kernel_initializer='normal', activation='linear'),
        layers.Dropout(0.105),
        layers.Dense(units=256, kernel_initializer='normal', activation='relu'),
        layers.Dense(units=128, kernel_initializer='normal', activation='relu'),
        layers.Dropout(0.11),
        # the linear output layer
        layers.Dense(units=1, kernel_initializer='normal', activation='linear'),
    ])

    # Give every model its own optimizer, cloned from the settings of the
    # module-level customAdam. An optimizer instance carries per-model state
    # (step counter, moment estimates, current learning rate), so sharing one
    # instance across calls would leak state between models when this factory
    # is invoked more than once (e.g. when the fit cell is re-run).
    optimizer = type(customAdam).from_config(customAdam.get_config())
    model.compile(optimizer=optimizer, loss='mean_squared_error')

    return model

# Training callbacks.
# The model is compiled with a loss only (no metrics), so the only validation
# quantity available during training is "val_loss"; there is no "val_accuracy"
# for a callback to watch. All three callbacks therefore monitor val_loss
# (mode='auto' treats a loss as "lower is better").

# Save the best model seen so far next to the submission file. The path must be
# a real file name; an empty string is not a usable checkpoint location.
checkpoint = ModelCheckpoint("/kaggle/working/best_model.h5", monitor="val_loss", verbose=1, save_best_only=True)
# Cut the learning rate by 10x when val_loss stops improving.
# NOTE(review): patience here equals the early-stopping patience, so a reduced
# learning rate may rarely get a chance to take effect -- consider a smaller value.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, min_lr=1e-6, verbose=1)
# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, mode='auto', restore_best_weights=True)

In [None]:
# Wrap the model factory in a scikit-learn style regressor, handing it the training callbacks.
training_callbacks = [early_stop, checkpoint, reduce_lr]
model_KR = KerasRegressor(
    build_fn=create_model,
    callbacks=training_callbacks,
)

In [None]:
# Fit the regressor on the TF-IDF features (x_full) against the targets (Y_full).
# 20% of the data is held out for validation during training, the training data is
# shuffled for each epoch, every gradient update uses 16 samples, and training
# runs for at most 30 epochs.
fit_options = dict(
    validation_split=0.2,
    shuffle=True,
    batch_size=16,
    epochs=30,
)
history = model_KR.fit(x_full, Y_full, **fit_options)

In [None]:
# Run the trained regressor on the vectorised test excerpts and keep the
# predictions as a plain Python list (one entry per test row).
pred_test = model_KR.predict(final_test_vector)
pred_test_list = list(pred_test)

In [None]:
# Assemble the submission table (test ids alongside their predicted targets),
# write it to the working directory, and preview the first rows.
submission_columns = {'id': test_data['id'], 'target': pred_test_list}
submission = pd.DataFrame(submission_columns)
submission.to_csv('/kaggle/working/submission.csv', index=False)
submission.head(7)