In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVR

In [None]:
# Read the competition training set and store each excerpt's
# whitespace-delimited word count in a new 'len' column.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
train_data['len'] = train_data['excerpt'].map(lambda text: len(text.split()))

# Parameters

In [None]:
# Tokenizer / embedding settings shared by the cells below.
num_words = 15000      # vocabulary cap handed to the Tokenizer and the Embedding layer
oov_token = '<oov>'    # token the Tokenizer substitutes for out-of-vocabulary words

# Sequence length: the word count (whitespace split) of the longest excerpt,
# or 0 when there are no excerpts. A fixed cap such as 150 is an alternative.
max_len = max((len(text.split()) for text in train_data['excerpt'].values), default=0)

padding = 'pre'        # pad_sequences: pad at the front of each sequence
truncating = 'pre'     # pad_sequences: cut from the front of each sequence
embedding_dim = 100    # width of each row in the embedding matrix

# Preprocessing

In [None]:
def prep_text(text_df):
    """Normalise raw excerpt text before tokenisation.

    Newlines, commas and full stops are each replaced by a single space, and
    the suffix ``'s`` is collapsed to ``s``.

    Parameters
    ----------
    text_df : pandas.Series
        Series of strings to clean.

    Returns
    -------
    numpy.ndarray
        The cleaned strings, in the same order as the input.
    """
    # Replace (rather than delete) newlines so that the last word of one line
    # is not glued to the first word of the next.
    cleaned = text_df.str.replace("\n", " ", regex=False)
    # Series.replace only matches whole cell values; the .str accessor is
    # required to substitute characters inside each string.
    for punctuation in (",", "."):
        cleaned = cleaned.str.replace(punctuation, " ", regex=False)
    return cleaned.str.replace("'s", "s", regex=False).values
# Clean the excerpts in place, then build the word index from the cleaned text.
train_data["excerpt"] = prep_text(train_data["excerpt"])
tokenizer = Tokenizer(oov_token=oov_token, num_words=num_words)
tokenizer.fit_on_texts(train_data["excerpt"].to_numpy())

# Pre-trained Embeddings

In [None]:
# Read the embedding file into a {word: vector} lookup.
embedding_dic = {}
glove_file = '../input/glove6b100dtxt/glove.6B.100d.txt'
# An explicit encoding avoids depending on the platform default (assumes the
# file is UTF-8 -- TODO confirm). The `with` block closes the file on exit, so
# no separate close() call is needed.
with open(glove_file, 'r', encoding='utf-8') as text_file:
    for line in text_file:
        parts = line.split()
        if not parts:
            # Skip blank lines instead of failing on parts[0].
            continue
        embedding_dic[parts[0]] = np.asarray(parts[1:], dtype=np.float32)

# Embedding matrix: row i holds the vector of the word whose tokenizer index
# is i. Rows stay all-zero for words that have no pre-trained vector.
word_embeddings = np.zeros((num_words, embedding_dim))
for word, index in tokenizer.word_index.items():
    if index >= num_words:
        # No row exists for this index; skipping (rather than breaking at
        # num_words - 1) does not rely on the iteration order of word_index.
        continue
    emb_vector = embedding_dic.get(word)
    if emb_vector is not None:
        word_embeddings[index] = emb_vector

# Data Prep

In [None]:
# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
training_data, testing_data = train_test_split(train_data, random_state=1234, test_size=0.2)
train_sentences, train_label = training_data['excerpt'].values, training_data['target'].values
test_sentences, test_label = testing_data['excerpt'].values, testing_data['target'].values

# Tokenize: turn each excerpt into a list of integer word ids.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

# Pad / truncate every sequence to max_len using the shared settings.
_pad_kwargs = dict(maxlen=max_len, padding=padding, truncating=truncating)
train_padded = pad_sequences(train_sequences, **_pad_kwargs)
test_padded = pad_sequences(test_sequences, **_pad_kwargs)

# Define Model

In [None]:

def all_model(model_type):
    """Build an uncompiled Keras regression model on top of the embedding matrix.

    Reads the module-level ``max_len``, ``num_words``, ``embedding_dim`` and
    ``word_embeddings``.

    Parameters
    ----------
    model_type : str
        ``'emb'``: embedding -> global average pooling -> Dense/Dropout stages
        of 256, 512 and 1024 units.
        ``'rnn'``: embedding -> bidirectional LSTM(128) -> Dense/Dropout stages
        of 128 and 256 units.

    Returns
    -------
    tf.keras.models.Model
        Model whose final layer is ``Dense(1)`` with no activation.

    Raises
    ------
    ValueError
        If ``model_type`` is neither ``'emb'`` nor ``'rnn'``.
    """
    if model_type not in ('emb', 'rnn'):
        # Previously an unknown type fell through both branches and failed
        # later on an unbound `inputs` variable.
        raise ValueError("model_type must be 'emb' or 'rnn', got {!r}".format(model_type))

    # `shape` must be a tuple; `(max_len)` without the trailing comma is an int.
    inputs = tf.keras.layers.Input(shape=(max_len,))
    # Both variants share the same trainable embedding initialised from word_embeddings.
    x = tf.keras.layers.Embedding(input_dim=num_words, output_dim=embedding_dim, weights=[word_embeddings],
                                  trainable=True, input_length=max_len)(inputs)

    # Layer order is kept exactly as before: the feature-extraction cell
    # selects a layer by its position in model.layers.
    if model_type == 'emb':
        x = tf.keras.layers.GlobalAveragePooling1D()(x)
        dense_stages = ((256, 0.2), (512, 0.2), (1024, 0.1))
    else:
        x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, dropout=0.2, return_sequences=False))(x)
        dense_stages = ((128, 0.2), (256, 0.2))

    for units, drop_rate in dense_stages:
        x = tf.keras.layers.Dense(units, activation='relu')(x)
        x = tf.keras.layers.Dropout(drop_rate)(x)

    out = tf.keras.layers.Dense(1)(x)
    return tf.keras.models.Model(inputs=inputs, outputs=out)
# Build the averaged-embedding variant and compile it for regression:
# MSE is the training loss and RMSE is tracked as a metric.
model = all_model('emb')
optimizer = tf.keras.optimizers.Adam()
# `metrics` is passed as a list; a bare metric object is not accepted by every Keras version.
model.compile(optimizer=optimizer, loss='mse', metrics=[tf.keras.metrics.RootMeanSquaredError()])
#model.summary()

# Train Model

In [None]:
# Fit for 5 epochs in mini-batches of 64, evaluating on the held-out split each epoch.
history = model.fit(
    x=train_padded,
    y=train_label,
    batch_size=64,
    epochs=5,
    validation_data=(test_padded, test_label),
)
#history = model.fit(train_padded, train_label, batch_size=64, epochs=5)

In [None]:
# Learning curves: RMSE per epoch on the training and held-out splits.
# The legend and axis labels make the two lines distinguishable.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(history.history['root_mean_squared_error'], label='train')
ax.plot(history.history['val_root_mean_squared_error'], label='validation')
ax.set(title='RMSE per epoch', xlabel='epoch', ylabel='RMSE')
ax.legend();

# Feature Extractor Model

In [None]:
# Define the feature extractor: a sub-model that stops at an intermediate layer.
# Every layer of the trained network is marked non-trainable first.
for layer in model.layers:
    layer.trainable = False
# Position of the layer whose activations are used as features. For the 'emb'
# architecture built in this notebook this appears to be the 512-unit Dense
# layer -- TODO confirm with model.summary().
FEATURE_LAYER_INDEX = 5
out_put = model.layers[FEATURE_LAYER_INDEX].output
# model.inputs is already a list of input tensors, so it is passed as-is
# rather than nested inside another list; the single output is passed directly.
feature_extractor = tf.keras.models.Model(inputs=model.inputs, outputs=out_put)

# Create features for the training split and attach the labels.
features = feature_extractor.predict(train_padded)
df_features = pd.DataFrame(data=features)
df_features['target'] = train_label
#df_features.to_csv('features_from_emb.csv', index=False)

# Create the matching validation frame from the held-out split.
val_features = feature_extractor.predict(test_padded)
df_features_val = pd.DataFrame(data=val_features)
df_features_val['target'] = test_label

#Feature Selection
#Remove highly correlated features
def get_correlated_features(data, thld):
    """Return the feature columns that are redundant with an earlier column.

    Only the strict upper triangle of the correlation matrix is inspected, so
    for every pair of columns whose absolute correlation exceeds ``thld`` only
    the later column (in column order) is reported and the earlier one is kept.
    Scanning the full symmetric matrix would report both members of each pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame that also contains a ``'target'`` column; the target is
        excluded from the correlation analysis.
    thld : float
        Absolute-correlation threshold above which a column is reported.

    Returns
    -------
    list
        Column labels to drop, in column order.

    Raises
    ------
    KeyError
        If ``data`` has no ``'target'`` column.
    """
    feature_frame = data.drop(columns=['target'])
    abs_corr = feature_frame.corr().abs()
    # Mask the diagonal and lower triangle; masked cells become NaN and never
    # compare greater than the threshold.
    upper_mask = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    upper = abs_corr.where(upper_mask)
    flagged = (upper.to_numpy() > thld).any(axis=0)
    return [column for column, is_redundant in zip(upper.columns, flagged) if is_redundant]

# Drop the columns flagged as highly correlated (|r| > 0.8) from the training features.
highly_correlated_features = get_correlated_features(df_features, 0.8)
df_features = df_features.drop(columns=highly_correlated_features)

# Selection using feature importance: keep the columns a random forest
# assigns a non-zero importance. The seed makes the selected set repeatable
# across runs.
reg = RandomForestRegressor(random_state=1234)
X = df_features.drop(columns=['target'])
y = df_features['target']
reg.fit(X, y)
arr = reg.feature_importances_
FEATURE_COLUMNS = X.columns[arr > 0].tolist()
# 'target' is appended last; later cells rely on it being the final entry.
FEATURE_COLUMNS.append('target')

In [None]:
# Restrict both splits to the selected columns, then separate predictors from the label.
# Training
train = df_features[FEATURE_COLUMNS]
train_x, train_y = train.drop(columns=['target']), train['target']
# Validation
val_data = df_features_val[FEATURE_COLUMNS]
val_x, val_y = val_data.drop(columns=['target']), val_data['target']

**Model**

In [None]:
# Ridge regression on the extracted features.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is not accepted by recent scikit-learn releases.
ridge_model = Ridge(alpha=3.5, random_state=1234)
ridge_model.fit(train_x, train_y)
print("Training:", np.sqrt(mean_squared_error(train_y, ridge_model.predict(train_x))))
print("Validation:", np.sqrt(mean_squared_error(val_y, ridge_model.predict(val_x))))

In [None]:
# Random forest on the extracted features.
# max_features is capped at the number of available columns so the fit does
# not fail when feature selection leaves fewer than 15 features.
rf_model = RandomForestRegressor(n_estimators=40, max_samples=.4, max_features=min(15, train_x.shape[1]),
                                 max_depth=5, random_state=1234)
rf_model.fit(train_x, train_y)
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is not accepted by recent scikit-learn releases.
print("Training:", np.sqrt(mean_squared_error(train_y, rf_model.predict(train_x))))
print("Validation:", np.sqrt(mean_squared_error(val_y, rf_model.predict(val_x))))

In [None]:
# Gradient-boosted trees (XGBoost) on the extracted features.
xgb_model = xgboost.XGBRegressor(n_estimators=120, eta=0.05, max_depth=3, subsample=0.3, colsample_bytree=0.3, random_state=1234)
xgb_model.fit(train_x, train_y)
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is not accepted by recent scikit-learn releases.
print("Training:", np.sqrt(mean_squared_error(train_y, xgb_model.predict(train_x))))
print("Validation:", np.sqrt(mean_squared_error(val_y, xgb_model.predict(val_x))))

In [None]:
# Support-vector regression (default hyper-parameters) on the extracted features.
svr_model = SVR()
svr_model.fit(train_x, train_y)
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is not accepted by recent scikit-learn releases.
print("Training:", np.sqrt(mean_squared_error(train_y, svr_model.predict(train_x))))
print("Validation:", np.sqrt(mean_squared_error(val_y, svr_model.predict(val_x))))

In [None]:
# TF-IDF baseline: sparse bag-of-words features fed to a Ridge regressor.
# The class is imported under an alias so that re-running this cell never
# tries to call the already-fitted instance.
from sklearn.feature_extraction.text import TfidfVectorizer as _TfidfVectorizerClass

tfidf_vectorizer = _TfidfVectorizerClass(stop_words='english', max_features=4000)
tfidf_vectorizer.fit(train_sentences)
# Later cells refer to the fitted vectorizer as `TfidfVectorizer`; keep that
# name bound to the instance so they continue to work.
TfidfVectorizer = tfidf_vectorizer
train_tfidf = tfidf_vectorizer.transform(train_sentences)
val_tfidf = tfidf_vectorizer.transform(test_sentences)

# Ridge
tfidf_model = Ridge(alpha=3.5, random_state=1234)
tfidf_model.fit(train_tfidf, train_label)
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is not accepted by recent scikit-learn releases.
print("Training:", np.sqrt(mean_squared_error(train_label, tfidf_model.predict(train_tfidf))))
print("Validation:", np.sqrt(mean_squared_error(test_label, tfidf_model.predict(val_tfidf))))

**Model Stacking**

In [None]:
# Model stacking: unweighted mean of the five models' predictions on the held-out split.
ridge_model_prediction = ridge_model.predict(val_x)
rf_model_prediction = rf_model.predict(val_x)
xgb_model_prediction = xgb_model.predict(val_x)
svr_model_prediction = svr_model.predict(val_x)
# `TfidfVectorizer` here is the fitted vectorizer instance created in the TF-IDF cell.
tfidf_model_prediction = tfidf_model.predict(TfidfVectorizer.transform(test_sentences))
predictions = (ridge_model_prediction + rf_model_prediction + xgb_model_prediction + svr_model_prediction +
               tfidf_model_prediction)/5
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is not accepted by recent scikit-learn releases.
print("Validation:", np.sqrt(mean_squared_error(val_y, predictions)))

# Submission

In [None]:
# Score the competition test set with the same pipeline used for validation.
# NOTE: the `val_*` names in this cell hold test-set data; in particular
# `val_features` is reassigned here and no longer holds the validation features.
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
test_data["excerpt"] = prep_text(test_data["excerpt"])
ids = test_data['id'].values
val_excerpt = test_data['excerpt'].values

# Text -> padded id sequences -> intermediate-layer features.
val_sequences = tokenizer.texts_to_sequences(val_excerpt)
val_padded = pad_sequences(val_sequences, maxlen=max_len, padding=padding, truncating=truncating)
val_features = feature_extractor.predict(val_padded)
df_val_features = pd.DataFrame(data=val_features)
# Every selected column except the trailing 'target' entry.
test_x = df_val_features[FEATURE_COLUMNS[:-1]]

# Model stacking: unweighted mean of the five models' predictions.
ridge_model_prediction = ridge_model.predict(test_x)
rf_model_prediction = rf_model.predict(test_x)
xgb_model_prediction = xgb_model.predict(test_x)
svr_model_prediction = svr_model.predict(test_x)
tfidf_model_prediction = tfidf_model.predict(TfidfVectorizer.transform(val_excerpt))
predictions = (
    ridge_model_prediction
    + rf_model_prediction
    + xgb_model_prediction
    + svr_model_prediction
    + tfidf_model_prediction
) / 5

# Create the submission frame (id, target) and write it out.
submission_df = pd.DataFrame(data=ids, columns=['id'])
submission_df['target'] = predictions
submission_df.to_csv('./submission.csv', index=False)