In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training CSV and discard the columns this notebook never uses.
# The drop is chained (rather than done in place) so re-running the cell is safe.
train_data = (
    pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')
    .drop(columns=['url_legal', 'license', 'standard_error'])
)
train_data.head(3)

In [None]:
# Read the test CSV and discard the columns this notebook never uses.
# The drop is chained (rather than done in place) so re-running the cell is safe.
test_data = (
    pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')
    .drop(columns=['url_legal', 'license'])
)
test_data.head(3)

In [None]:
# Model inputs are the raw excerpts; the regression label is the 'target' column.
X_train = train_data['excerpt']
Y_train = train_data['target']
# Excerpts of the test set, for which predictions are submitted.
final_test = test_data['excerpt']

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split with a fixed seed; each of the four parts is converted
# to a NumPy array as it is unpacked.
x_train, x_val, y_train, y_val = (
    np.array(part)
    for part in train_test_split(X_train, Y_train, test_size=0.20, random_state=42)
)

# Array versions of the test excerpts and of the complete (unsplit) training data.
final_test = np.array(final_test)
X_full = np.array(X_train)
Y_full = np.array(Y_train)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


def _new_vectorizer():
    """Return an unfitted TfidfVectorizer configured with this notebook's settings."""
    return TfidfVectorizer(lowercase=True, stop_words='english', max_features=100, strip_accents='unicode')


# Hold-out features: the vocabulary and IDF weights are learned from the
# training split ONLY and then applied to the validation split. Calling
# fit_transform on the validation texts (as before) re-learns the vocabulary,
# so the validation columns would no longer mean the same thing as the
# training columns.
split_vectorizer = _new_vectorizer()
x_train_vector = split_vectorizer.fit_transform(x_train).toarray()
x_val_vector = split_vectorizer.transform(x_val).toarray()

# Full-data features: a separate vectorizer fitted on every training excerpt.
# It is used for the cross-validation / final fit and to encode the test set,
# so `vectorizer`, `x_full` and `final_test_vector` are the same as before.
vectorizer = _new_vectorizer()
x_full = vectorizer.fit_transform(X_full).toarray()
final_test_vector = vectorizer.transform(final_test).toarray()

print(x_train_vector.shape, y_train.shape)
print(x_val_vector.shape, y_val.shape)
print(final_test_vector.shape)

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold

In [None]:
def create_model(input_dim=100):
    """Build and compile the dense regression network.

    Args:
        input_dim: Number of input features per sample. Defaults to 100,
            which matches the `max_features=100` setting of the TF-IDF
            vectorizer used in this notebook; pass a different value if the
            feature width changes.

    Returns:
        A compiled `keras.Sequential` model with three ReLU hidden layers
        (512, 256, 128 units), 25% dropout before the output, and a single
        linear output unit. It is compiled with the Adam optimizer and
        mean-squared-error loss; no additional metrics are tracked.
    """
    model = keras.Sequential([
        layers.Dense(units=512, kernel_initializer='normal', activation='relu', input_shape=[input_dim]),
        layers.Dense(units=256, kernel_initializer='normal', activation='relu'),
        layers.Dense(units=128, kernel_initializer='normal', activation='relu'),
        layers.Dropout(0.25),
        # Linear output layer: one unbounded value per sample (regression).
        layers.Dense(units=1, kernel_initializer='normal', activation='linear'),
    ])

    model.compile(optimizer = 'adam', loss='mean_squared_error')

    return model

In [None]:
# Wrap the Keras model builder in a KerasRegressor so scikit-learn utilities
# can fit and score it.
model_KR = KerasRegressor(build_fn = create_model, batch_size = 16, epochs = 50)

# 10-fold cross-validation. This is a regression problem, so there is no
# "accuracy": the scorer is requested explicitly as negated mean squared error
# (scikit-learn's convention is "higher is better", hence the negative sign).
kfolds = cross_val_score(model_KR, x_full, Y_full, cv = 10, scoring = 'neg_mean_squared_error')

# Report the fold average both in the scorer's own units and as RMSE.
print('Mean negated MSE across folds:', kfolds.mean())
print('Mean RMSE across folds:', np.sqrt(-kfolds).mean())

In [None]:
# Callbacks for the final training run.
# Imported from tensorflow.keras so they come from the same package as the
# model-building imports used elsewhere in this notebook.
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

# The model is compiled with a mean-squared-error loss and no extra metrics,
# so the only validation quantity logged during fit is 'val_loss'. Monitoring
# 'val_accuracy' (as before) refers to a value that is never produced, which
# leaves the learning-rate schedule and early stopping inactive.

# Save the model to disk whenever the validation loss improves. A non-empty
# file path is required for the checkpoint to be written.
checkpoint = ModelCheckpoint("best_model.h5", monitor="val_loss", verbose=1, save_best_only=True)
# Cut the learning rate by 10x after 5 epochs without validation-loss improvement.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, min_lr=1e-6, verbose=1)
# Stop after 5 epochs without improvement and roll back to the best weights.
# NOTE(review): both patiences are 5, so early stopping may trigger in the same
# epoch as the first learning-rate reduction — consider a larger patience here.
early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, mode='auto', restore_best_weights=True)

In [None]:
# Train on the full feature matrix, holding out 40% of it for validation so
# the callbacks have a validation loss to monitor.
fit_options = dict(
    validation_split=0.4,
    batch_size=16,
    epochs=50,
    callbacks=[early_stop, checkpoint, reduce_lr],
)
history = model_KR.fit(x_full, Y_full, **fit_options)

In [None]:
# One row per epoch, one column per logged quantity.
history_df = pd.DataFrame(history.history)
# Plot training vs. validation loss, skipping the first epoch (row 0).
loss_curves = history_df.loc[1:, ['loss', 'val_loss']]
loss_curves.plot()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Preview the first five epochs of the training log.
history_df.head(n=5)

In [None]:
# predict 
pred_test = model_KR.predict(final_test_vector)
pred_test_list = [i for i in pred_test]

In [None]:
# create submission file
submission = pd.DataFrame({'id' : test_data['id'], 'target' : pred_test_list})
submission.to_csv('submission.csv', index=False)
submission