# Classifying the tweets using BERT

In [None]:
# Download tokenization.py so that the later `import tokenization` can find it.
# NOTE(review): this pulls from the unpinned `master` branch and `--quiet` hides
# download errors; confirm the path still exists upstream or pin to a release tag.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import string
import tqdm
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from wordcloud import WordCloud
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
import tokenization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

### Loading data

In [None]:
# Read the train and test tweet CSVs from the ../input dataset directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tweets-with-sarcasm-and-irony/train.csv",
        "../input/tweets-with-sarcasm-and-irony/test.csv",
    )
)

### Remove recurring tweets to prevent ambiguity

In [None]:
# Pull the 'tweets' column of each frame out as a plain Python list.
train_tweets, test_tweets = (
    frame['tweets'].tolist() for frame in (train_data, test_data)
)

In [None]:
def keep_uniques(array, df):
    """Drop every row of ``df`` whose tweet occurs more than once in ``array``.

    All copies of a repeated tweet are removed; no representative is kept.

    Args:
        array: Iterable of hashable tweet values in which repeats are counted
            (in this notebook, ``df['tweets'].tolist()``).
        df: DataFrame with a ``'tweets'`` column. It is modified in place.

    Returns:
        The same ``df`` object, after the in-place drop.
    """
    # One pass to find the repeated values, instead of calling
    # ``array.count(...)`` for every element (quadratic in len(array)).
    seen, repeated = set(), set()
    for tweet in array:
        if tweet in seen:
            repeated.add(tweet)
        else:
            seen.add(tweet)

    # A single vectorised membership test replaces one full-column comparison
    # per repeated tweet. ``isin`` would also match missing values, which an
    # ``==`` comparison never does, so rows with a missing tweet are kept.
    tweets = df['tweets']
    to_drop = tweets.isin(repeated) & tweets.notna()
    df.drop(df.index[to_drop], inplace=True)
    return df

In [None]:
# keep_uniques edits each frame in place and returns that same object, so the
# re-assignment simply keeps both names bound to the filtered frames.
train_data, test_data = (
    keep_uniques(train_tweets, train_data),
    keep_uniques(test_tweets, test_data),
)

In [None]:
# Number of distinct tweet values in the training frame (a missing value counts as one).
train_data['tweets'].nunique(dropna=False)

In [None]:
# Number of distinct tweet values in the test frame (a missing value counts as one).
test_data['tweets'].nunique(dropna=False)

### Dataset details

In [None]:
# Summary statistics of the training frame.
train_data.describe()

In [None]:
# Shuffle the rows of both frames. The seed is fixed so that the row order, and
# everything that depends on it (such as the positional `validation_split` used
# when fitting the model), is the same on every Restart & Run All.
SHUFFLE_SEED = 42
train_data = train_data.sample(frac=1, random_state=SHUFFLE_SEED)
test_data = test_data.sample(frac=1, random_state=SHUFFLE_SEED)

In [None]:
# First rows of the shuffled training frame.
train_data.head()

In [None]:
# First rows of the shuffled test frame.
test_data.head()

In [None]:
# Number of training rows per class label.
train_data['class'].value_counts()

Here we see that the `regular` class has 18k tweets, which makes our dataset imbalanced, so we randomly delete some tweets from this class.

In [None]:
# All training rows whose class label is 'regular'.
temp = train_data[train_data['class'].eq('regular')]

In [None]:
# The tweets of those 'regular' rows as a plain Python list.
lis = temp.loc[:, 'tweets'].tolist()

In [None]:
import random

# Randomly thin out the 'regular' class to reduce the class imbalance.
# The sample is drawn without replacement from `lis` itself, so it can never
# index past the end of the list and never picks the same position twice;
# the sample size is capped at len(lis). A seeded generator keeps the
# resulting training set identical between runs.
N_REGULAR_TO_DROP = 3600
rng = random.Random(42)
reg_del = rng.sample(lis, min(N_REGULAR_TO_DROP, len(lis)))

# Remove all selected tweets with one membership test instead of one
# full-column comparison (and one drop) per tweet.
train_data.drop(train_data[train_data['tweets'].isin(reg_del)].index, inplace=True)

In [None]:
# Training rows per class label after removing part of the 'regular' class.
train_data['class'].value_counts()

In [None]:
# Test rows per class label.
test_data['class'].value_counts()

## Data Cleaning and Preprocessing

In [None]:
def remove_URL(text):
    """Return ``text`` (coerced to ``str``) with http(s):// and www. links removed."""
    return re.sub(r'https?://\S+|www\.\S+', r'', str(text))


def remove_emoji(text):
    """Strip emoji and pictographic symbols from ``text``.

    Args:
        text: String to clean.

    Returns:
        ``text`` with every run of characters from the blocks listed in the
        pattern removed. All other characters, including non-Latin scripts,
        are left untouched.
    """
    # The ranges are deliberately narrow. A single span from U+24C2 up to
    # U+1F251 would also cover CJK, Hangul, fullwidth forms and most of the
    # rest of the BMP, deleting ordinary non-English text along with emoji.
    emoji_pattern = re.compile(
        '['
        '\U0001F600-\U0001F64F'  # emoticons
        '\U0001F300-\U0001F5FF'  # symbols & pictographs
        '\U0001F680-\U0001F6FF'  # transport & map symbols
        '\U0001F1E0-\U0001F1FF'  # regional indicators (flags)
        '\U0001F900-\U0001F9FF'  # supplemental symbols & pictographs
        '\U0001F200-\U0001F251'  # enclosed ideographic supplement
        '\U00002600-\U000026FF'  # miscellaneous symbols
        '\U00002702-\U000027B0'  # dingbats
        '\U00002B00-\U00002BFF'  # miscellaneous symbols and arrows
        '\U000024C2'             # circled latin capital letter M
        '\U0000FE0F'             # variation selector-16 (emoji presentation)
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_mentions(text):
    """Remove ``@handle`` mentions from ``text``.

    Args:
        text: String to clean.

    Returns:
        ``text`` with every ``@`` followed by one or more ASCII letters,
        digits or underscores removed.
    """
    # Handles may contain underscores; without `_` in the character class,
    # "@some_user" would be cut at the underscore and leave "_user" behind.
    ment = re.compile(r"(@[A-Za-z0-9_]+)")
    return ment.sub(r'', text)


def remove_html(text):
    """Delete ``<...>`` tags and ``&name;`` / ``&#NNN;`` / ``&#xHH;`` entities from ``text``."""
    tag_or_entity = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'
    return re.compile(tag_or_entity).sub('', text)


def remove_punct(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed."""
    return ''.join(ch for ch in text if ch not in string.punctuation)

In [None]:
# Build `clean_text` for the training frame: run the raw tweets through each
# cleaning step in order, finishing with lower-casing.
clean_series = train_data['tweets']
for cleaning_step in (remove_URL, remove_emoji, remove_html,
                      remove_mentions, remove_punct, str.lower):
    clean_series = clean_series.apply(cleaning_step)
train_data['clean_text'] = clean_series

In [None]:
# Remove English stopwords from every cleaned training tweet.
# The stopword set is built once up front; building it inside the word filter
# would reload and re-hash the same list for every single word.
english_stopwords = set(nltk.corpus.stopwords.words('english'))

cleaned = [
    ' '.join(word for word in text.split() if word not in english_stopwords)
    for text in train_data['clean_text'].tolist()
]

train_data['clean_text'] = cleaned
    

In [None]:
# First rows of the training frame, now including the `clean_text` column.
train_data.head()

In [None]:
# Build `clean_text` for the test frame: run the raw tweets through each
# cleaning step in order, finishing with lower-casing.
clean_series = test_data['tweets']
for cleaning_step in (remove_URL, remove_emoji, remove_html,
                      remove_mentions, remove_punct, str.lower):
    clean_series = clean_series.apply(cleaning_step)
test_data['clean_text'] = clean_series

In [None]:
# Remove English stopwords from every cleaned test tweet.
# The stopword set is built once up front; building it inside the word filter
# would reload and re-hash the same list for every single word.
english_stopwords = set(nltk.corpus.stopwords.words('english'))

cleaned = [
    ' '.join(word for word in text.split() if word not in english_stopwords)
    for text in test_data['clean_text'].tolist()
]

test_data['clean_text'] = cleaned

In [None]:
# First rows of the test frame, now including the `clean_text` column.
test_data.head()

In [None]:
# Discard every test row that has a missing value in any column.
test_data = test_data.dropna(axis=0, how='any')

### Basic EDA

In [None]:
# Bar chart of the number of training tweets per class.
sns.set(rc={'figure.figsize': (10, 10)})
# The column is passed as `x=`: in current seaborn only `data` may be given
# positionally, so a bare positional Series is no longer read as the x variable.
sns.countplot(x=train_data['class'])

### Word cloud for regular class

In [None]:
from wordcloud import WordCloud

# English stopword list passed to WordCloud (the later word-cloud cells read it too).
stopwords = nltk.corpus.stopwords.words('english')

# Join every cleaned 'regular' tweet into one string and render its word cloud.
text = ' '.join(train_data.loc[train_data['class'] == 'regular', 'clean_text'])
wc = WordCloud(background_color='white', stopwords=stopwords).generate(text)

plt.figure(figsize=(12, 6))
plt.imshow(wc)

### Word cloud for irony class

In [None]:
# Join every cleaned 'irony' tweet into one string and render its word cloud.
text = ' '.join(train_data.loc[train_data['class'] == 'irony', 'clean_text'])
wc = WordCloud(background_color='white', stopwords=stopwords).generate(text)

plt.figure(figsize=(12, 6))
plt.imshow(wc)

### Word cloud for sarcasm class

In [None]:
# Join every cleaned 'sarcasm' tweet into one string and render its word cloud.
text = ' '.join(train_data.loc[train_data['class'] == 'sarcasm', 'clean_text'])
wc = WordCloud(background_color='white', stopwords=stopwords).generate(text)

plt.figure(figsize=(12, 6))
plt.imshow(wc)

### Word cloud for figurative class

In [None]:
# Join every cleaned 'figurative' tweet into one string and render its word cloud.
text = ' '.join(train_data.loc[train_data['class'] == 'figurative', 'clean_text'])
wc = WordCloud(background_color='white', stopwords=stopwords).generate(text)

plt.figure(figsize=(12, 6))
plt.imshow(wc)

## BERT Embeddings

In [None]:
def bert_encode(texts, tokenizer, max_len=512):
    """Turn raw texts into the three fixed-length integer inputs fed to BERT.

    Each text is tokenized, cut to at most ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with zeros up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row (expected to be at least 2).

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of NumPy arrays with one row
        per text. Masks are 1 on real tokens and 0 on padding; segment ids
        are all zeros.
    """
    token_rows, mask_rows, segment_rows = [], [], []

    for raw_text in tqdm(texts):
        # Reserve two positions for the [CLS] and [SEP] markers.
        pieces = tokenizer.tokenize(raw_text)[:max_len - 2]
        sequence = ["[CLS]", *pieces, "[SEP]"]
        n_real = len(sequence)
        n_pad = max_len - n_real

        token_rows.append(tokenizer.convert_tokens_to_ids(sequence) + [0] * n_pad)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

### Get BERT Model from TFHub

In [None]:
%%time
# Load the BERT encoder (uncased, L-24 / H-1024 / A-16) from TF Hub as a Keras
# layer. trainable=True marks the layer as trainable so it can be fine-tuned.
url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(url, trainable=True)

In [None]:
# Build the tokenizer from the vocabulary file and lower-casing flag stored on
# the loaded BERT layer.
resolved = bert_layer.resolved_object
vocab_fl = resolved.vocab_file.asset_path.numpy()
lower_case = resolved.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_fl, lower_case)

### Encoding the texts

In [None]:
def encode_target(t_class):
    """Map a class name to its integer label.

    The argument is coerced to ``str`` first; a name outside the four known
    classes raises ``KeyError``.
    """
    label_of = {
        name: idx
        for idx, name in enumerate(('irony', 'sarcasm', 'regular', 'figurative'))
    }
    return label_of[str(t_class)]

In [None]:
# Integer-encode the class label of every row into a new `target` column.
train_data["target"] = train_data['class'].apply(encode_target)
test_data["target"] = test_data['class'].apply(encode_target)

In [None]:
%%time
# Encode the cleaned tweets into (token ids, masks, segment ids) rows of length 160
# and take the integer targets as label arrays.
train_input = bert_encode(train_data['clean_text'].values, tokenizer, max_len=160)
test_input = bert_encode(test_data['clean_text'].values, tokenizer, max_len=160)
train_labels = train_data['target'].values
test_labels = test_data['target'].values

### Fine-tuned model

In [None]:
def build_model(transformer, max_len=512, num_classes=4, learning_rate=1e-5):
    """Build and compile a softmax classifier on top of a BERT Keras layer.

    Args:
        transformer: Callable Keras layer that takes
            ``[input_word_ids, input_mask, segment_ids]`` and returns two
            outputs, the second of which is used as the per-token sequence
            output.
        max_len: Length of each of the three integer input sequences.
        num_classes: Number of output classes (units of the softmax layer).
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``Model`` with three int32 inputs and one
        ``(batch, num_classes)`` softmax output, trained with sparse
        categorical cross-entropy and reporting accuracy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name='input_word_ids')
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name='input_mask')
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name='segment_ids')

    # Only the second output (the sequence output) is used; the first is discarded.
    _, seq_op = transformer([input_word_ids, input_mask, segment_ids])
    # Take the vector at position 0 of every sequence, i.e. the slot where the
    # encoder input carries the [CLS] token.
    class_tkn = seq_op[:, 0, :]
    # Classification head: one softmax unit per class.
    op = Dense(num_classes, activation='softmax')(class_tkn)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=op)
    model.compile(
        optimizer=Adam(learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
# Build the classifier on top of the loaded BERT layer; 160 is the same
# max_len that was used when encoding the inputs.
model = build_model(transformer=bert_layer, max_len=160)
model.summary()

In [None]:
# Write the model to model.h5 only when the validation loss improves.
checkpoint = ModelCheckpoint(filepath='model.h5', monitor='val_loss', save_best_only=True)

# Fine-tune for 4 epochs, holding out 7% of the training data for validation.
train_history = model.fit(
    x=train_input,
    y=train_labels,
    batch_size=16,
    epochs=4,
    validation_split=0.07,
    callbacks=[checkpoint],
)

In [None]:
# Evaluate on the test set and print the metric at index 1 as a percentage.
# NOTE(review): this scores the in-memory model as left by the last epoch; the
# checkpoint written to model.h5 is not reloaded in the visible cells.
scores = model.evaluate(test_input, test_labels, verbose=0)
metric_name, metric_value = model.metrics_names[1], scores[1]
print("%s: %.2f%%" % (metric_name, metric_value*100))