# Importing Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from collections import defaultdict
import string
import tensorflow as tf
import re
from tensorflow import keras

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

import transformers
from transformers import AutoTokenizer, TFAutoModel

# Preparing the Dataset 

In [None]:
# Load the competition CSVs, using the 'id' column as the row index.
train = pd.read_csv('../input/nlp-getting-started/train.csv', index_col='id')
test = pd.read_csv('../input/nlp-getting-started/test.csv', index_col='id')

# Keep a handle on the labels before trimming columns.
y = train['target']

# 'location' and 'keyword' are not used by the rest of this notebook.
for _frame in (train, test):
    _frame.drop(columns=['location', 'keyword'], inplace=True)

In [None]:
#Use regex to clean the data
def remove_url(text):
    """Return *text* with every http(s):// or www. URL deleted.

    A URL is taken to extend up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_punct(text):
    """Return *text* with every character of ``string.punctuation`` deleted."""
    # Mapping a character to None in a translation table deletes it.
    delete_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(delete_punctuation)
def remove_html(text):
    """Return *text* with every ``<...>`` tag deleted.

    The match is non-greedy, so each tag is removed on its own and the text
    between tags is kept.
    """
    return re.sub(r'<.*?>', '', text)

# Code-point ranges treated as emoji / pictographic symbols.
# The ranges are listed explicitly instead of one span from U+24C2 to U+1F251:
# that span also contains CJK, Kana, Hangul and many other ordinary letters,
# so it silently deleted non-emoji text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U000025A0-\U000025FF"  # geometric shapes
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return *text* with emoji and pictographic symbols deleted.

    Only the symbol blocks listed in ``_EMOJI_PATTERN`` are removed; letters of
    any script (including CJK) are preserved.
    """
    return _EMOJI_PATTERN.sub(r'', text)
# Ordered (pattern, replacement) rules used by decontraction().
# Order matters: irregular whole-word forms come before the generic suffixes,
# and a longer form always precedes the shorter form it contains
# (e.g. "n't've" before "n't"), otherwise the longer rule could never match.
_CONTRACTION_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        # irregular forms
        (r"won't've", " will not have"),
        (r"won't", " will not"),
        (r"can't've", " can not have"),
        (r"can't", " can not"),
        (r"don't", " do not"),
        (r"ma'am", " madam"),
        (r"let's", " let us"),
        (r"ain't", " am not"),
        (r"sha'n't", " shall not"),
        (r"shan't", " shall not"),
        (r"o'clock", " of the clock"),
        (r"y'all", " you all"),
        # generic suffixes
        (r"n't've", " not have"),
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d've", " would have"),
        (r"'d", " would"),
        (r"'ll've", " will have"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
)


def decontraction(text):
    """Expand English contractions in *text* (e.g. "won't" -> " will not").

    Matching is case-insensitive, so "Can't" is expanded the same way as
    "can't"; the replacement text is always lower case.  Every replacement
    starts with a space, so the result can contain leading or doubled spaces.
    Only the straight apostrophe (') is recognised, which means this must run
    before any step that strips punctuation.

    Args:
        text: the string to expand.

    Returns:
        The string with all known contractions expanded.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    return text

def seperate_alphanumeric(text):
    """Split *text* into runs of letters and runs of digits, joined by spaces.

    Anything that is neither a letter nor a digit (whitespace, punctuation,
    underscores) acts as a separator and is dropped, so "abc123" becomes
    "abc 123".
    """
    pieces = (match.group() for match in re.finditer(r"[^\W\d_]+|\d+", text))
    return " ".join(pieces)

def cont_rep_char(text):
    """Replacement callback for ``re.sub``: cap a matched run at two characters.

    Args:
        text: the ``re.Match`` object passed in by ``re.sub``.

    Returns:
        The first two characters of the matched text, or the whole match when
        it is shorter than that.  A string is always returned, so a short
        match is left unchanged rather than yielding ``None``.
    """
    tchr = text.group(0)
    # Slicing is safe for any length: a one-character match is returned as is.
    return tchr[:2]

def unique_char(rep, text):
    """Substitute every run of a repeated word character in *text* with *rep*.

    *rep* is passed straight to ``re.sub`` and may be a replacement string or
    a callable that receives the match object.
    """
    repeated_run = re.compile(r'(\w)\1+')
    return repeated_run.sub(rep, text)

# Cleaning steps, applied in this order to every tweet of both data sets.
# Contractions are expanded BEFORE punctuation is stripped: remove_punct
# deletes apostrophes, and once they are gone none of the decontraction
# patterns can match.
_CLEANING_STEPS = (
    remove_url,
    decontraction,
    remove_punct,
    remove_emoji,
    seperate_alphanumeric,
    lambda text: unique_char(cont_rep_char, text),
)

# Run the identical pipeline on train and test so both see the same cleaning.
for _frame in (train, test):
    for _step in _CLEANING_STEPS:
        _frame['text'] = _frame['text'].apply(_step)

In [None]:
# One-hot encode the target column.  The encoder expects a 2-D array, hence
# the reshape to a single column.
y = np.asarray(y).reshape(-1, 1)

# The `sparse` keyword of OneHotEncoder was renamed to `sparse_output` and
# later removed, so neither spelling works on every scikit-learn version.
# Using the default (sparse) output and densifying it explicitly does.
onehot_encoder = OneHotEncoder()
Y = onehot_encoder.fit_transform(y).toarray()

# Hold out 20% of the tweets for validation; the fixed seed keeps the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train.text, Y, random_state=10, test_size=0.2, shuffle=True
)

In [None]:
# Sanity check: show the shapes of the train/validation texts and labels.
print(X_train.shape,X_val.shape,y_train.shape,y_val.shape)

# Prepare the Tokens

In [None]:
# Name of the pretrained checkpoint; passed to both AutoTokenizer and
# TFAutoModel below so tokenizer and model stay in sync.
model_checkpoint = "distilbert-base-uncased"
# Batch size used for the training, validation and test tf.data pipelines.
batch_size = 16

In [None]:
# Load the tokenizer that belongs to model_checkpoint (fast implementation
# requested via use_fast=True).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [None]:
# Quick demo: call the tokenizer on two strings and display the result as the
# cell output.  NOTE(review): presumably encoded as a sentence pair - confirm
# against the tokenizer documentation.
tokenizer("Hello, this one sentence!", "And this sentence goes with it.")

In [None]:
def regular_encode(texts, tokenizer, maxlen=512):
    """Tokenize *texts* into a rectangular array of input ids.

    Every sequence is padded up to, and truncated down to, exactly *maxlen*
    tokens.  Truncation is requested explicitly: without it a text longer than
    *maxlen* would produce rows of differing length, which cannot form a
    regular 2-D array.

    Args:
        texts: the strings to encode.
        tokenizer: an object exposing ``batch_encode_plus`` (e.g. a Hugging
            Face tokenizer).
        maxlen: fixed length of every encoded sequence.

    Returns:
        ``np.ndarray`` built from the ``input_ids`` of the encoding.
    """
    enc_di = tokenizer.batch_encode_plus(
        texts,
        return_token_type_ids=False,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length`.
        padding='max_length',
        truncation=True,
        max_length=maxlen,
    )

    return np.array(enc_di['input_ids'])

In [None]:
# Turn the train / validation texts into fixed-length (512) input-id arrays.
X_train_t = regular_encode(X_train.tolist(), tokenizer, maxlen=512)
X_val_t = regular_encode(X_val.tolist(), tokenizer, maxlen=512)

In [None]:
# Let tf.data choose the prefetch buffer size.
AUTO = tf.data.experimental.AUTOTUNE

# Training pipeline: repeats indefinitely (the number of steps per epoch is
# given to fit()), shuffles with a 1995-element buffer, then batches.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_t, y_train))
train_dataset = train_dataset.repeat().shuffle(1995)
train_dataset = train_dataset.batch(batch_size).prefetch(AUTO)

# Validation pipeline: fixed order, batched and cached.
valid_dataset = tf.data.Dataset.from_tensor_slices((X_val_t, y_val))
valid_dataset = valid_dataset.batch(batch_size).cache().prefetch(AUTO)

# Preparing the Model

In [None]:
def build_model(transformer, max_len=160):
    """Build and compile a two-class classifier on top of *transformer*.

    The model feeds token ids through *transformer*, takes the vector at
    sequence position 0 of its first output, and applies a 2-unit softmax
    layer.  It is compiled with Adam (learning rate 1e-5), categorical
    cross-entropy and an accuracy metric.

    Args:
        transformer: callable Keras-compatible layer/model; element ``[0]`` of
            its output is indexed as ``[:, 0, :]``.  Presumably a Hugging Face
            TF model whose first output is the last hidden state - confirm.
        max_len: length of the input id sequences the model accepts.

    Returns:
        The compiled ``Model``.
    """
    # NOTE(review): no attention mask is passed, so padding positions are
    # presumably attended to like real tokens - confirm whether that is wanted.
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Use the representation at the first sequence position as the summary.
    cls_token = sequence_output[:, 0, :]
    out = Dense(2, activation='softmax')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(
        Adam(learning_rate=1e-5),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
# Load the pretrained transformer for model_checkpoint and wrap it in the
# classifier.  max_len=512 matches the maxlen passed to regular_encode when
# the inputs are tokenized in this notebook.
transformer_layer = TFAutoModel.from_pretrained(model_checkpoint)
model_base = build_model(transformer_layer, max_len=512)
# Print the layer overview of the assembled model.
model_base.summary()

# Training

In [None]:
# train_dataset repeats forever, so the length of an epoch has to be given
# explicitly: one pass over the training rows in full batches.
n_steps = len(X_train) // batch_size
model_base.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=3,
    steps_per_epoch=n_steps,
)

# Predicting on new tweets

In [None]:
# Encode the test tweets exactly like the training data and batch them.
X_test = regular_encode(list(test.text), tokenizer, maxlen=512)
test1 = tf.data.Dataset.from_tensor_slices(X_test).batch(batch_size)

# Predicted class = index of the larger of the two softmax outputs.
class_probabilities = model_base.predict(test1, verbose=0)
pred = np.argmax(class_probabilities, axis=-1).astype('int16')

# Show the first few predictions as the cell output.
pred[:5]

In [None]:
# Fill the sample submission with the predictions and write it out.
# NOTE(review): assumes the rows of sample_submission.csv are in the same
# order as test.csv - confirm.
res = pd.read_csv('../input/nlp-getting-started/sample_submission.csv', index_col=None)
res = res.assign(target=pred)
res.to_csv('submission.csv', index=False)