In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

In [None]:
import spacy
import numpy as np
import pandas as pd

In [None]:
# Locations of the pre-computed train/test tables.
TRAIN_CSV = '/kaggle/input/readability-formatted/new_train.csv'
TEST_CSV = '/kaggle/input/readability-formatted/new_test.csv'

# Load both tables with pandas' default CSV parsing options.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

# Data Visualisation and Exploratory Data Analysis:

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt 

# Use seaborn's white background with grid lines for every figure below.
sns.set_style(style='whitegrid')

In [None]:
# Columns whose pairwise correlations are shown (five metrics plus the target).
heatmap_columns = [
    'word count',
    'sent count',
    'difficult word count',
    'average number of syllables',
    'average length of a sentence',
    'target',
]

# Square canvas; annot=True writes each coefficient inside its cell.
plt.figure(figsize=(10, 10))
correlation_matrix = df_train[heatmap_columns].corr()
sns.heatmap(correlation_matrix, annot=True)

In [None]:
# Kernel density estimate of the 'word count' column over the training table.
fig, ax = plt.subplots(figsize=(10, 2))
word_counts = df_train['word count']
sns.kdeplot(word_counts, ax=ax)

In [None]:
# Compare the 'word count' distribution for rows with target < 0 against rows
# with target >= 0. Each curve is labelled and a legend is drawn so the two
# overlaid densities can be told apart.
below_zero = df_train['target'] < 0
at_or_above_zero = df_train['target'] >= 0

fig, ax = plt.subplots(figsize=(10, 2))
sns.kdeplot(df_train.loc[below_zero, 'word count'], ax=ax, label='target < 0')
sns.kdeplot(df_train.loc[at_or_above_zero, 'word count'], ax=ax, label='target >= 0')
ax.legend()

In [None]:
# Kernel density estimate of the 'sent count' column over the training table.
fig, ax = plt.subplots(figsize=(10, 2))
sentence_counts = df_train['sent count']
sns.kdeplot(sentence_counts, ax=ax)

In [None]:
# Compare the 'sent count' distribution for rows with target < 0 against rows
# with target >= 0. Each curve is labelled and a legend is drawn so the two
# overlaid densities can be told apart.
below_zero = df_train['target'] < 0
at_or_above_zero = df_train['target'] >= 0

fig, ax = plt.subplots(figsize=(10, 2))
sns.kdeplot(df_train.loc[below_zero, 'sent count'], ax=ax, label='target < 0')
sns.kdeplot(df_train.loc[at_or_above_zero, 'sent count'], ax=ax, label='target >= 0')
ax.legend()

In [None]:
# Kernel density estimate of the 'difficult word count' column over the training table.
fig, ax = plt.subplots(figsize=(10, 2))
difficult_word_counts = df_train['difficult word count']
sns.kdeplot(difficult_word_counts, ax=ax)

In [None]:
# Compare the 'difficult word count' distribution for rows with target < 0
# against rows with target >= 0. Each curve is labelled and a legend is drawn
# so the two overlaid densities can be told apart.
below_zero = df_train['target'] < 0
at_or_above_zero = df_train['target'] >= 0

fig, ax = plt.subplots(figsize=(10, 2))
sns.kdeplot(df_train.loc[below_zero, 'difficult word count'], ax=ax, label='target < 0')
sns.kdeplot(df_train.loc[at_or_above_zero, 'difficult word count'], ax=ax, label='target >= 0')
ax.legend()

In [None]:
# Kernel density estimate of the 'average length of a sentence' column over the training table.
fig, ax = plt.subplots(figsize=(10, 2))
avg_sentence_lengths = df_train['average length of a sentence']
sns.kdeplot(avg_sentence_lengths, ax=ax)

In [None]:
# Compare the 'average length of a sentence' distribution for rows with
# target < 0 against rows with target >= 0. Each curve is labelled and a legend
# is drawn so the two overlaid densities can be told apart.
below_zero = df_train['target'] < 0
at_or_above_zero = df_train['target'] >= 0

fig, ax = plt.subplots(figsize=(10, 2))
sns.kdeplot(df_train.loc[below_zero, 'average length of a sentence'], ax=ax, label='target < 0')
sns.kdeplot(df_train.loc[at_or_above_zero, 'average length of a sentence'], ax=ax, label='target >= 0')
ax.legend()

In [None]:
# Kernel density estimate of the 'average number of syllables' column over the training table.
fig, ax = plt.subplots(figsize=(10, 2))
avg_syllable_counts = df_train['average number of syllables']
sns.kdeplot(avg_syllable_counts, ax=ax)

In [None]:
# Compare the 'average number of syllables' distribution for rows with
# target < 0 against rows with target >= 0. Each curve is labelled and a legend
# is drawn so the two overlaid densities can be told apart.
below_zero = df_train['target'] < 0
at_or_above_zero = df_train['target'] >= 0

fig, ax = plt.subplots(figsize=(10, 2))
sns.kdeplot(df_train.loc[below_zero, 'average number of syllables'], ax=ax, label='target < 0')
sns.kdeplot(df_train.loc[at_or_above_zero, 'average number of syllables'], ax=ax, label='target >= 0')
ax.legend()

# Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Fully connected regression network, declared as a single ordered layer list.
# All hidden layers use ReLU; the later stages are each followed by 10% dropout.
# No input shape is given here, so Keras infers it from the first batch it sees.
model = Sequential([
    # Widening stem
    Dense(5, activation='relu'),
    Dense(10, activation='relu'),

    Dense(16, activation='relu'),
    Dense(30, activation='relu'),

    Dense(16, activation='relu'),
    Dense(30, activation='relu'),
    Dropout(0.1),

    Dense(15, activation='relu'),
    Dense(30, activation='relu'),
    Dropout(0.1),

    # Narrowing tail
    Dense(15, activation='relu'),
    Dropout(0.1),

    Dense(8, activation='relu'),
    Dropout(0.1),

    Dense(4, activation='relu'),
    Dropout(0.1),

    Dense(2, activation='relu'),
    # Single output unit with no activation (one value per sample).
    Dense(1),
])

In [None]:
# Train with the Adam optimizer against a mean-squared-error objective.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
)

In [None]:
# Select the model inputs by column name, using the same names and the same
# order that the prediction cell uses for df_test. Positional slicing
# (columns[3:-1]) would silently feed misordered features to the network if the
# CSV column order ever differed from the order used at prediction time.
# NOTE(review): assumes the former positional slice covered exactly these five
# columns and that the last column of df_train is 'target' — confirm.
feature_columns = [
    'word count',
    'sent count',
    'difficult word count',
    'average number of syllables',
    'average length of a sentence',
]

X = df_train[feature_columns].values  # 2-D array: one row per training sample
y = df_train['target'].values         # 1-D array of regression targets

<h4> training phase: </h4>

In [None]:
# Stop training once the validation loss has gone 25 epochs without improving,
# and roll the model back to the weights of its best validation epoch. Without
# restore_best_weights the model would keep the weights from the final epoch,
# i.e. 25 epochs past its best validation loss, and those would be used for
# the predictions that follow.
stop = EarlyStopping(patience=25, monitor='val_loss', restore_best_weights=True)
callback_list = [stop]

# Train for at most 500 epochs, holding out 33% of X/y as validation data.
model.fit(X, y, epochs=500, validation_split=0.33, callbacks=callback_list)

In [None]:
# Feature columns fed to the network at prediction time, in this exact order.
test_feature_columns = [
    'word count',
    'sent count',
    'difficult word count',
    'average number of syllables',
    'average length of a sentence',
]
test_features = df_test[test_feature_columns].values

# Run the trained network over every test row.
preds = model.predict(test_features)

In [None]:
# Wrap the raw model output in a one-column frame named 'target'.
preds = pd.DataFrame(
    data=preds,
    columns=['target'],
)

In [None]:
# Put the test ids next to the predicted targets.
# ignore_index=True must NOT be used here: with axis=1 it discards the column
# labels and renumbers them 0, 1, so the written CSV header would be "0,1"
# instead of "id,target".
# Rows are aligned on the index; this relies on df_test and preds sharing the
# same row index — NOTE(review): confirm both use the default 0..n-1 index.
submission = pd.concat([df_test['id'], preds], axis=1)

In [None]:
# Preview the first five rows of the submission table.
submission.head(5)

In [None]:
# Write the submission to the working directory, leaving out the row index.
submission.to_csv(
    'submission.csv',
    index=False,
)

This is a basic model that uses numerical features extracted from each excerpt to predict the readability index. It can be improved further by using pretrained models and the actual vectorised text. My next plan is to use a branched network that applies a convolutional network to the text excerpt and this model to the metrics, then concatenates the two branches to predict the target.