# Import Dependencies

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

!pip install bs4 # for handling html and tags
!pip install contractions # for handling contractions
!pip install textsearch

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer

from tensorflow.keras import layers, Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, GlobalAveragePooling1D, Dense, Dropout 
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
import tensorflow.keras.backend as k
from keras import models

In [None]:
# Record the TensorFlow version this run used.
print(f"Tensorflow version {tf.__version__}")

In [None]:
# Pick a distribution strategy: use the TPU when one can be resolved and
# initialised, otherwise fall back to TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as tpu_error:
    # `Exception` (not a bare `except`) so KeyboardInterrupt / SystemExit still
    # propagate; the reason for the fallback is printed instead of hidden.
    print('TPU not available, using the default strategy:', tpu_error)
    strategy = tf.distribute.get_strategy()
print('Number of replicas:', strategy.num_replicas_in_sync)

# Loading the data

In [None]:
# Read the competition's training set, test set and sample submission.
train_data, test_data, sample_sub = map(
    pd.read_csv,
    (
        '../input/commonlitreadabilityprize/train.csv',
        '../input/commonlitreadabilityprize/test.csv',
        '../input/commonlitreadabilityprize/sample_submission.csv',
    ),
)

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# (rows, columns) of the training frame.
train_data.shape

# Preprocessing

In [None]:
from bs4 import BeautifulSoup
def strip_html_tags(text):
    """Return the visible text of an HTML fragment.

    `iframe` and `script` elements are removed together with their content,
    the remaining markup is dropped, and every run of carriage returns /
    line feeds is collapsed into a single newline.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Plain loop: the extraction is done for its side effect only.
    for element in soup(['iframe', 'script']):
        element.extract()

    stripped_text = soup.get_text()
    # Inside a character class `|` is a literal pipe, not alternation, so the
    # class lists only the two line-break characters.
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text


import unicodedata
def remove_accented_chars(text):
    """Return `text` reduced to ASCII.

    The text is NFKD-normalised so accented letters split into a base letter
    plus combining marks; everything that cannot be encoded as ASCII is then
    dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


def remove_special_chars(text, remove_digits=True):
    """Delete every character that is not a letter or whitespace.

    Args:
        text: Input string.
        remove_digits: When True (default) digits are deleted as well;
            when False digits are kept.

    Returns:
        The filtered string. Whitespace is left untouched.

    Relies on the module-level `re` import.
    """
    if remove_digits:
        pattern = r'[^a-zA-Z\s]'
    else:
        pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

from nltk.corpus import stopwords
# English stop words as a set: built once, constant-time membership checks.
STOPWORDS = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return `text` without the whitespace-separated tokens found in STOPWORDS.

    The comparison is case-sensitive and tokens are re-joined with single
    spaces.
    """
    return ' '.join(word for word in str(text).split() if word not in STOPWORDS)

In [None]:
import contractions
# Run contractions.fix over the excerpts of both frames.
for frame in (train_data, test_data):
    frame['excerpt'] = frame['excerpt'].apply(contractions.fix)

In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# `PorterStemmer.stem` and `WordNetLemmatizer.lemmatize` work on ONE word.
# Passing a whole excerpt treats the passage as a single "word", so the words
# inside it are never normalised. The helpers below apply them word by word and
# leave punctuation and spacing untouched.
_WORD_RE = re.compile(r"[^\W\d_]+")  # runs of letters
_stemmer = PorterStemmer()
_lemmatizer = WordNetLemmatizer()


def stem_text(text):
    """Return `text` with every run of letters replaced by its Porter stem."""
    return _WORD_RE.sub(lambda match: _stemmer.stem(match.group()), str(text))


def lemmatize_text(text):
    """Return `text` with every run of letters replaced by its WordNet lemma."""
    return _WORD_RE.sub(lambda match: _lemmatizer.lemmatize(match.group()), str(text))


# Same order as before: stem first, then lemmatize, on both frames.
for frame in (train_data, test_data):
    frame['excerpt'] = frame['excerpt'].apply(stem_text).apply(lemmatize_text)

In [None]:
# Preview the training frame again, now that the excerpt column has been rewritten.
train_data.head()

In [None]:
maxlen_ = 200      # length every sequence is padded / truncated to
max_words = 20000  # `num_words` limit handed to the tokenizer

train_texts = train_data['excerpt']

# Fit the vocabulary on the training excerpts.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index

# Encode the excerpts as integer sequences, padded at the end.
sequences = tokenizer.texts_to_sequences(train_texts)
train_data_preped = pad_sequences(sequences, maxlen=maxlen_, padding='post')

In [None]:
# Show only the first few encoded excerpts rather than the whole matrix.
train_data_preped[:5]

In [None]:
# Shape of the padded training matrix.
train_data_preped.shape

In [None]:
# Shape of the target column.
train_data['target'].shape

In [None]:
# Fixed seed so the train/validation split (and therefore the reported
# validation loss) is the same on every "Restart & Run All".
SEED = 42

# Hold out 15% of the encoded excerpts for validation.
X_train, X_val, y_train, y_val = train_test_split(train_data_preped,
                                                  train_data['target'],
                                                  test_size=0.15,
                                                  random_state=SEED)
print('Size of Train: ',X_train.shape)
print('Size of Validation: ',X_val.shape)

In [None]:
# X_train / X_val hold integer token ids that the Embedding layer uses as lookup
# indices. Min-max scaling them into [0, 1] destroys those ids (after the
# integer cast almost every token ends up as index 0), so no scaling is applied.
# `mm` is kept as an identity transformer because the inference cell calls
# `mm.transform` on the test sequences; this keeps train and test consistent.
mm = FunctionTransformer(validate=False)
X_train = mm.fit_transform(X_train)
X_val = mm.transform(X_val)

In [None]:
# Global batch size: 32 samples for each replica of the strategy.
BATCH_SIZE = strategy.num_replicas_in_sync * 32


def rmse(y_true, y_pred):
    """Root-mean-squared error between `y_true` and `y_pred`.

    Built from Keras backend ops; the mean is taken over all elements.
    """
    squared_error = k.square(y_pred - y_true)
    mean_squared_error = k.mean(squared_error)
    return k.sqrt(mean_squared_error)
    
# Callbacks
# Reduce the learning rate after 5 epochs without improvement (never below 1e-8).
rop = ReduceLROnPlateau(min_lr=0.00000001, patience=5)
# Keep only the best model by validation loss. Without `save_best_only` the file
# is overwritten every epoch, so the model loaded at inference would be the last
# epoch (after the early-stopping patience ran out) rather than the best one.
mc = ModelCheckpoint('model1.h5', monitor='val_loss', save_best_only=True,
                     save_freq='epoch')

# Earlystopping
# Stop training after 10 epochs without improvement in validation loss.
early_stopping = EarlyStopping(patience=10, monitor='val_loss')

# Model Building

In [None]:
# Build and compile the regressor inside the strategy scope so its variables
# are created under the selected distribution strategy.
with strategy.scope():
    # One padded sequence of `maxlen_` token ids per sample; the shape is given
    # as an explicit tuple instead of a bare int.
    inp = Input(shape=(maxlen_,))
    # 300-dimensional embedding over a vocabulary of `max_words` ids. The layer
    # is trainable by default; the former `x.trainable = True` only set an
    # attribute on the output tensor and had no effect.
    x = Embedding(max_words, 300)(inp)
    x = Dropout(0.4)(x)
    # Two stacked bidirectional LSTMs, both returning the full sequence.
    x = Bidirectional(LSTM(512,return_sequences=True))(x)
    x = Bidirectional(LSTM(1024,return_sequences=True))(x)
    # Average over the time axis to get one vector per excerpt.
    x = GlobalAveragePooling1D()(x)
    x = Dense(1024,activation='relu')(x)
    x = Dropout(0.4)(x)
    x = Dense(512,activation='relu')(x)
    # Single linear unit: the predicted target value.
    out = Dense(1, activation='linear')(x)

    model = Model(inp,out)

    model.compile(loss=rmse, optimizer=RMSprop(0.01))

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Train for up to 100 epochs, validating on the held-out split each epoch.
fit_callbacks = [rop, mc, early_stopping]
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=BATCH_SIZE,
    callbacks=fit_callbacks,
)

In [None]:
# loss = history.history['loss']
# val_loss = history.history['val_loss']
# epochs = range(1, len(loss) + 1)

# plt.figure(figsize=(10,5))
# plt.plot(epochs, loss, 'bo', label='Training loss')
# plt.plot(epochs, val_loss, 'b', label='Validation loss')
# plt.title('Training and validation loss')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend()
# plt.grid()
# plt.show()

# Inference

In [None]:
# `test_data` is NOT re-read here: it was loaded at the top of the notebook and
# its excerpts went through the same preprocessing as the training excerpts.
# Reloading the CSV would discard that preprocessing and feed the model text
# that looks different from what it was trained on.
sample_sub = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# (rows, columns) of the test frame.
test_data.shape

In [None]:
# Encode the test excerpts with the tokenizer that was fitted on the TRAINING
# excerpts. Fitting a new tokenizer on the test set would assign different
# integer ids to the same words, so the embedding rows learned during training
# would be looked up for the wrong words. `maxlen_` is reused from the training
# cell for the same reason.
sequences = tokenizer.texts_to_sequences(test_data['excerpt'])
test_data_preped = pad_sequences(sequences, maxlen=maxlen_, padding='post')

In [None]:
# Preview the test frame once more before prediction.
test_data.head()

In [None]:
# Pass the padded test sequences through `mm`, the transformer fitted on the training split.
test_datax = mm.transform(test_data_preped)

In [None]:
# Reload the checkpoint written during training. The custom `rmse` loss has to
# be supplied by name so the saved model can be deserialised.
custom_losses = {'rmse': rmse}
model = models.load_model('model1.h5', custom_objects=custom_losses)

# Predict the target for every test excerpt.
preds = model.predict(test_datax)

# Submission

In [None]:
# Submission frame: the test ids next to the model's predictions.
predictions = pd.DataFrame().assign(id=test_data['id'], target=preds)

In [None]:
# Write the submission file without the index column, then display the frame.
submission_path = "/kaggle/working/submission.csv"
predictions.to_csv(submission_path, index=False)
predictions