In [None]:
import pandas as pd
import numpy as np

In [None]:
import tensorflow as tf

In [None]:
df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')

In [None]:
df.head()

In [None]:
df = df.drop(columns=['id', 'url_legal', 'license', 'standard_error'])

In [None]:
print(df.isnull().sum())

In [None]:
X = df["excerpt"]
Y = df["target"]

In [None]:
import re
def text_cleaning(text):
  """Reduce ``text`` to its letter/punctuation runs, separated by single spaces.

  Every maximal run of characters from the set ``[a-zA-Z?!,.]`` is extracted;
  anything else (digits, whitespace, quotes, other symbols) only acts as a
  separator and is discarded. The extracted runs are joined with one space.

  Args:
    text: The string to clean.

  Returns:
    The cleaned string; empty when ``text`` contains no allowed character.
  """
  allowed_runs = re.finditer("[a-zA-Z?!,.]+", text)
  return " ".join(run.group(0) for run in allowed_runs)

In [None]:
X = X.apply(lambda x: text_cleaning(x))

In [None]:
X[0]

In [None]:
token = tf.keras.preprocessing.text.Tokenizer(lower = True)

token.fit_on_texts(X)

X_train = token.texts_to_sequences(X)

In [None]:
MAX_SEQ_LEN = 0
for i in X_train:
  if len(i) > MAX_SEQ_LEN:
    MAX_SEQ_LEN = len(i)

print(MAX_SEQ_LEN)

In [None]:
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen = 215, padding = 'post', truncating='post')

In [None]:
vocab_size = len(token.word_index) + 1
vocab_size

In [None]:
from tqdm import tqdm
# Map each GloVe token to its vector, read line by line from the text file.
embedding_vector = {}
# A context manager guarantees the file handle is closed once parsing finishes
# (or fails part-way); the encoding is stated explicitly instead of relying on
# the platform default.
with open('../input/glove840b300dtxt/glove.840B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        # Each line is "<token> <v1> <v2> ...", separated by single spaces.
        value = line.split(' ')
        word = value[0]
        coef = np.array(value[1:], dtype = 'float32')
        embedding_vector[word] = coef

In [None]:
# One 300-dimensional row per vocabulary index. Rows stay all-zero for words
# that have no entry in embedding_vector.
embedding_matrix = np.zeros(shape=(vocab_size, 300))
for term, row in tqdm(token.word_index.items()):
    pretrained = embedding_vector.get(term)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

In [None]:
def create_model(input_length = 215):
    """Build the (uncompiled) embedding + LSTM regression network.

    Layers, in order:
      1. Embedding initialised from the module-level ``embedding_matrix``
         (``vocab_size`` rows, 300 columns) and frozen (``trainable = False``).
      2. LSTM with 75 units.
      3. Dense layer with a single output unit.

    Args:
        input_length: Number of token ids per input sequence. Defaults to 215,
            the ``maxlen`` this notebook passes to ``pad_sequences`` for both
            the training and the test sequences. (The value used to be
            hard-coded as 200, which did not match that padding length.)

    Returns:
        The ``tf.keras`` Sequential model; the caller compiles and fits it.
    """
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Embedding(vocab_size, 300, weights = [embedding_matrix], input_length = input_length, trainable = False))
    model.add(tf.keras.layers.LSTM(75))
    model.add(tf.keras.layers.Dense(1))

    return model

In [None]:
model = create_model()
model.compile(optimizer = tf.keras.optimizers.Adam(beta_1=0.95),
              loss = 'mean_squared_error',
              metrics = [tf.keras.metrics.RootMeanSquaredError()]
             )

history = model.fit(X_train, Y, batch_size = 2272, epochs = 190, validation_split = 0.2)

In [None]:
model.summary()

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch, as recorded by model.fit.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
# 'val_loss' is measured on the validation_split hold-out taken from the
# training data, not on the test set, so the curve is labelled 'validation'.
ax.plot(history.history['val_loss'], label='validation')
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
# Labels are attached to the lines themselves, so the legend cannot drift out
# of sync with the plotting order.
ax.legend(loc='upper right')
plt.show()

In [None]:
test=pd.read_csv("../input/commonlitreadabilityprize/test.csv")

In [None]:
test.shape

In [None]:
X_test=test['excerpt']

In [None]:
print(X_test.isnull().sum())

In [None]:
X_test.head()

In [None]:
X_test = X_test.apply(lambda x: text_cleaning(x))

In [None]:
X_test = token.texts_to_sequences(X_test)

In [None]:
X_test=tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen = 215, padding = 'post', truncating='post')

In [None]:
y_pred=model.predict(X_test)

In [None]:
y_pred

In [None]:
test=pd.DataFrame(test,columns=['id'])


In [None]:
test['target']=0

In [None]:
test.head()

In [None]:
test.shape

In [None]:
# Write all predictions into the 'target' column (column position 1 of the
# submission frame) in one vectorised assignment. This replaces a per-row
# `iloc` loop that stored length-1 float arrays into a column initialised with
# the integer 0 and so depended on pandas upcasting the column implicitly.
# np.ravel flattens the prediction array to one value per row; the column is
# stored as float64.
test['target'] = np.ravel(y_pred).astype('float64')

In [None]:
test.head()

In [None]:
test.to_csv(r"./submission.csv",index=False)