In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import re
from PIL import Image
from nltk.stem.wordnet import WordNetLemmatizer
# WordNet lemmatizer instance. No executable line in the visible part of this
# notebook calls it; it is only mentioned in commented-out code.
lem = WordNetLemmatizer()

In [None]:
from wordcloud import WordCloud
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import wordcloud 

In [None]:
# English stop words from NLTK, stored as a set for fast membership tests.
# Not used by any executable line visible in this notebook.
stop_words=set(stopwords.words("english"))

In [None]:
# Load the competition files: training data, test data and the sample
# submission. Paths are relative to the notebook's working directory.
df_train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
df_test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
df_sample = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")

In [None]:
# Preview the first rows of the sample submission.
df_sample.head()

In [None]:
# Show the column/dtype/non-null summary of each frame.
# DataFrame.info() writes the summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line
# after every summary.
print("Train Data")
df_train.info()
print()
print("Test Data")
df_test.info()
print()
print("Sample Data")
df_sample.info()

In [None]:
# Preview the first rows of the training data.
df_train.head()

In [None]:
# Count missing values in the training 'excerpt' column.
df_train['excerpt'].isnull().sum()

In [None]:
# Count missing values in the training 'target' column.
df_train['target'].isnull().sum()

In [None]:
# Report whether the training excerpts contain a URL-like token
# ("<word chars>://<non-space chars>"). The excerpts are joined with spaces, so
# a match can never span two excerpts.
# The pattern is a raw string so that \w and \S reach the regex engine verbatim
# instead of being parsed as (invalid) Python string escapes, which raises a
# warning on recent Python versions. "/" needs no escaping in a Python regex.
if re.search(r'\w+://\S+', ' '.join(df_train['excerpt'])):
    print("Dataset contain hyperlinks")
else:
    print("Dataset does not contain hyperlinks")

In [None]:
# Report whether the training excerpts contain an "@mention"-style tag:
# an "@" followed by one or more non-whitespace characters.
# The pattern is a raw string so that \S reaches the regex engine verbatim
# instead of being parsed as an (invalid) Python string escape; the
# single-element class [\S] is written as the equivalent \S.
if re.search(r'@\S+', ' '.join(df_train['excerpt'])):
    print("Dataset contain tags")
else:
    print("Dataset does not contain tags")

In [None]:
# Matches every character that is NOT an ASCII letter, a space or a tab.
regx_pattern = "[^A-Za-z \t]"
# Matches the listed symbol characters plus tab and newline.
# The hyphen is escaped so it is a literal "-": unescaped, "+-/" is a character
# range that also covers "," and ".", which this set otherwise leaves out.
# Written as a raw string so the regex escapes (\-, \[, \], \t, \n) are passed
# through to the regex engine unchanged.
regx_pattern2 = r"[#$%&()*+\-/:<=>@\[\]^_{|}~\t\n]"

In [None]:
def process_text(text):
    """Return a normalised, space-joined token string for one excerpt.

    The text is stripped of surrounding whitespace and lower-cased, every
    character matched by the module-level ``regx_pattern`` is replaced with a
    space, and the result is tokenised with NLTK's ``word_tokenize``.

    Args:
        text: The raw excerpt as a ``str``.

    Returns:
        The tokens joined by single spaces.
    """
    lowered = text.strip().lower()
    scrubbed = re.sub(regx_pattern, " ", lowered)
    return ' '.join(word_tokenize(scrubbed))

In [None]:
# Add a 'cleaned_excerpt' column to the training and test frames by running
# every raw excerpt through process_text.
for frame in (df_train, df_test):
    frame['cleaned_excerpt'] = list(map(process_text, frame['excerpt']))

In [None]:
# Preview the training data again to inspect the 'cleaned_excerpt' column.
df_train.head()

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model

In [None]:
# Compare the vocabulary size (distinct whitespace-separated tokens) of the
# training excerpts before and after cleaning.
for label, column in (
    ("Unique tokens in unprocessed text :", 'excerpt'),
    ("Unique tokens in processed text   :", 'cleaned_excerpt'),
):
    vocabulary = set(" ".join(df_train[column]).split())
    print(label, len(vocabulary))

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, AutoModelForMaskedLM, TFDistilBertForSequenceClassification, DistilBertTokenizerFast, BertTokenizer, BertTokenizerFast, TFBertForSequenceClassification

In [None]:
# Load the BERT tokenizer from a local 'bert-base-uncased' directory under the
# input folder (a relative filesystem path).
tokenizer = BertTokenizer.from_pretrained('../input/bertbaseuncased/bert-base-uncased')

In [None]:
# Training inputs (cleaned excerpts) and their targets, each converted from a
# DataFrame column to a plain list.
X_train_final = list(df_train['cleaned_excerpt'])
y_train_final = list(df_train['target'])

In [None]:
# Tokenize all training excerpts in one call with truncation and padding
# enabled.
# NOTE(review): no max_length is given; presumably the tokenizer falls back to
# the model's maximum input length and pads to the longest excerpt — confirm.
train_encodings_final = tokenizer(X_train_final, 
                                truncation=True, 
                                padding=True)

In [None]:
# Build a tf.data.Dataset of (features, target) pairs: the tokenizer output
# converted to a plain dict, paired element-wise with the target list.
train_dataset_final = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings_final),
    y_train_final
))

In [None]:
# Shuffle with a buffer as large as the training set, then group into batches
# of 16. No seed is passed to shuffle() here.
# Note: this reassigns train_dataset_final, so re-running the cell batches the
# already-batched dataset again.
train_dataset_final = train_dataset_final.shuffle(len(X_train_final)).batch(16)

In [None]:
# Load BERT with a sequence-classification head from the same local directory
# as the tokenizer. num_labels=1 requests a single output value per excerpt,
# which this notebook treats as the predicted target.
final_model = TFBertForSequenceClassification.from_pretrained('../input/bertbaseuncased/bert-base-uncased', 
                                                                  num_labels=1)

In [None]:
# Configure training: Adam optimizer with learning rate 5e-5, mean-squared-error
# loss, and root-mean-squared error reported as the metric.
final_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5), 
    loss=tf.keras.losses.MeanSquaredError(name = 'mse'),
    metrics = [tf.keras.metrics.RootMeanSquaredError(name = 'rmse')],
)

In [None]:
# Train on the shuffled, batched training set for 8 epochs; the value returned
# by fit() is kept in final_history. No validation data is passed.
final_history = final_model.fit(train_dataset_final, 
                                epochs = 8) # alternative setting left by the author: epochs = 35

In [None]:
# Test inputs (cleaned excerpts) as a plain list.
X_test_final = list(df_test['cleaned_excerpt'])

In [None]:
# Tokenize the test excerpts with the same tokenizer and the same
# truncation/padding options used for the training excerpts.
test_encodings_final = tokenizer(X_test_final, 
                                 truncation=True, 
                                 padding=True)

In [None]:
# Features-only test dataset: a one-element tuple holding the encodings dict.
# The placeholder label component below is commented out.
test_dataset_final = tf.data.Dataset.from_tensor_slices(
    (dict(test_encodings_final),
     #np.zeros(shape=(len(X_test_final),))
    ))

In [None]:
# Run inference over the test set one example per batch (batch size 1).
sample_pred = final_model.predict(test_dataset_final.batch(1))

In [None]:
# Assemble the submission frame: a 'target' column built from the predicted
# logits and the 'id' column taken from the test frame, concatenated side by
# side (axis=1 aligns the rows on the index).
# NOTE(review): assumes sample_pred.logits has one column per row of df_test
# and that df_test still has its default index — confirm.
df_pred = pd.DataFrame(sample_pred.logits,columns=['target'])
df_id = pd.DataFrame(df_test['id'], columns=['id'])
df_submission = pd.concat([df_id,df_pred], axis =1)
# Display the assembled submission as the cell output.
df_submission

In [None]:
# Write the submission to 'submission.csv' in the working directory, without
# the DataFrame index column.
df_submission.to_csv('submission.csv', index = False)