In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV files and the model
# checkpoint used by later cells are read from / written to paths under this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install transformers
import torch
import random
import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
import plotly.graph_objects as go

In [None]:
!pip install tensorflow
import os
from glob import glob
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import seaborn as sns
import re,nltk,json
from bs4 import BeautifulSoup
### ML Librarires--------------------
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score
from sklearn.metrics import average_precision_score,roc_auc_score, roc_curve, precision_recall_curve
###-------------------------------------------
#from keras.utils.vis_utils import plot_model
# Seed NumPy's global RNG for the preprocessing steps.
# NOTE: a later cell re-seeds NumPy (together with `random` and torch) with 17 before training.
np.random.seed(42)
import string, spacy,unicodedata, random
class color:
    """ANSI escape sequences for styling text printed to a terminal or notebook.

    Prefix a string with one of the codes and append ``color.END`` to reset the
    styling, e.g. ``color.BOLD + 'title' + color.END``.
    """

    # Reset all styling back to the default.
    END = '\033[0m'

    # Emphasis.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours.
    DARKCYAN = '\033[36m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation / misuse warnings raised by the
# libraries used below, which can mask real problems while debugging.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the train and test CSV files from the mounted Drive (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/CM_MEMES-master/train_data.csv',
        '/content/drive/MyDrive/CM_MEMES-master/test_data.csv',
    )
)

# Report how many rows each split contains.
print("Number of Training Data: ", train_data.shape[0])
print("Number of Test Data: ", test_data.shape[0])

In [None]:
# Encode sarcasm labels: 0 for sarcastic, 1 for non-sarcastic
def _encode_sarcasm_label(value):
    """Map a raw label to its integer class: 'sarcasm' -> 0, anything else -> 1.

    Values that are already encoded (0 or 1) are returned unchanged. Without
    this guard, running the cell a second time compares the integers against
    the string 'sarcasm' and silently turns every label into 1.
    """
    if not isinstance(value, str) and value in (0, 1):
        return int(value)
    return 0 if value == 'sarcasm' else 1


train_data['label'] = train_data['label'].apply(_encode_sarcasm_label)
test_data['label'] = test_data['label'].apply(_encode_sarcasm_label)

# Display the updated DataFrames
print("Train data:\n", train_data.head())
print("\nTest data:\n", test_data.head())

In [None]:
def text_cleaning(row):
    """Normalise a raw caption into lower-cased, punctuation-free text.

    Steps, in order: strip HTML markup, remove URLs, turn line breaks and the
    '—' / '।' separators into spaces, delete ASCII punctuation
    (``string.punctuation``), collapse whitespace runs into single spaces,
    trim, and lower-case.

    Args:
        row: The raw caption. Any non-string value (for example the NaN that
            pandas uses for a missing cell) is treated as an empty caption
            instead of raising inside the HTML parser.

    Returns:
        The cleaned caption as a ``str``; ``''`` when there is nothing to clean.
    """
    if not isinstance(row, str):
        return ''

    # Remove HTML tags.
    text = BeautifulSoup(row, 'html.parser').get_text()

    # Remove URLs.
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE)

    # Replace separators with a space. A line break must become a space rather
    # than be deleted, otherwise the last word of one line is fused with the
    # first word of the next ("hello\nworld" -> "helloworld").
    for separator in ('\n', '—', '।'):
        text = text.replace(separator, ' ')

    # Delete ASCII punctuation.
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Collapse every run of whitespace into one space and trim both ends.
    text = re.sub(r'\s+', ' ', text).strip()

    return text.lower()

In [None]:
# Store the normalised caption of every row in a new 'cleaned' column.
# NOTE(review): the tokenisation cell further down encodes the raw 'captions'
# column, so this 'cleaned' column is currently not fed to the model.
train_data['cleaned'] = train_data['captions'].map(text_cleaning)
test_data['cleaned'] = test_data['captions'].map(text_cleaning)

In [None]:
# Preview the first rows of the training frame, including the new 'cleaned' column.
train_data.head()

**BERT**

In [None]:
# Tokenizer belonging to the 'bert-base-uncased' checkpoint; do_lower_case=True
# asks it to lower-case the input text.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
def _encode_captions(captions):
    """Tokenise captions into padded, length-capped PyTorch tensors.

    Args:
        captions: Iterable of caption strings (e.g. a DataFrame column's values).

    Returns:
        The tokenizer's batch encoding; its 'input_ids' and 'attention_mask'
        entries are read below.
    """
    return tokenizer.batch_encode_plus(
        list(captions),
        add_special_tokens=True,     # set this so BERT knows when a sentence ends and begins
        return_attention_mask=True,  # marks real tokens vs. padding positions
        padding=True,                # pad every caption to the longest one in this call
        truncation=True,             # without this, max_length is not enforced
        max_length=256,              # upper bound on the encoded length
        return_tensors='pt',
    )


# NOTE(review): the raw 'captions' column is encoded here, not the 'cleaned'
# column produced by text_cleaning — confirm that this is intended.
encoded_data_train = _encode_captions(train_data.captions.values)
encoded_data_test = _encode_captions(test_data.captions.values)

# get the parts from the encoding that need to train the model
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_data.label.values)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(test_data.label.values)



In [None]:
# Number of examples per optimisation / evaluation step.
BATCH_SIZE = 8

# The encodings are already tensors (return_tensors='pt'), so they are passed
# through as-is; wrapping them in torch.tensor(...) again only makes an extra
# copy and triggers PyTorch's copy-construct warning.
dataset_train = TensorDataset(input_ids_train,
                              attention_masks_train,
                              labels_train)

valid_dataset = TensorDataset(input_ids_test,
                              attention_masks_test,
                              labels_test)

# Training batches are reshuffled every epoch; evaluation keeps the file order.
dataloader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)
dataloader_test = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Sequence classifier on top of the 'bert-base-uncased' checkpoint, with one
# output per distinct value in the training labels. Attention maps and hidden
# states are not returned.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=False,
    output_attentions=False,
    num_labels=len(train_data.label.unique()),
)

In [None]:
# Use PyTorch's own AdamW: the AdamW class shipped with transformers is
# deprecated in favour of torch.optim.AdamW. weight_decay is set to 0.0
# explicitly because that was the default of the transformers implementation
# (torch's default is 0.01), so the optimisation itself is unchanged.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-5,  # 2e-5 > 5e -5
    eps=1e-8,
    weight_decay=0.0,
)

In [None]:
epochs = 1

# The training loop calls scheduler.step() once per batch, so the linear decay
# has to span every batch of every epoch. With the previous hard-coded value of
# 5 the learning rate reached 0 after five batches and the remaining batches
# no longer updated the model.
total_training_steps = len(dataloader_train) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [None]:
def auc(preds, labels):
    """ROC AUC of the model's class scores against the true labels.

    ROC AUC measures how well continuous scores *rank* the examples. Feeding it
    arg-maxed hard predictions (as this function used to) reduces the curve to
    a single operating point, so the scores themselves are used instead.

    Args:
        preds: Array of shape (n_samples, n_classes) holding one score (logit)
            per class, as returned by ``evaluate``.
        labels: Array of true integer class ids; flattened before use.

    Returns:
        The value returned by ``roc_auc_score``.

    Raises:
        ValueError: propagated from ``roc_auc_score``, e.g. when ``labels``
            contains a single class only.
    """
    scores = np.asarray(preds, dtype=float)
    labels_flat = np.asarray(labels).flatten()

    if scores.shape[1] == 2:
        # Binary case: the margin logit[1] - logit[0] orders the examples
        # exactly like the softmax probability of class 1.
        return roc_auc_score(labels_flat, scores[:, 1] - scores[:, 0])

    # More than two classes: roc_auc_score needs per-class probabilities that
    # sum to 1, so apply a numerically stable softmax first.
    shifted = scores - scores.max(axis=1, keepdims=True)
    probabilities = np.exp(shifted)
    probabilities /= probabilities.sum(axis=1, keepdims=True)
    return roc_auc_score(labels_flat, probabilities, multi_class='ovr')

In [None]:
# Use one fixed seed for every random number generator involved in training:
# torch (CPU and all CUDA devices), NumPy and Python's `random` module.
# NOTE(review): this cell runs after the model has been created, so any random
# initialisation done while building the model is not covered by this seed.
seed_val = 17
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
np.random.seed(seed_val)
random.seed(seed_val)

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Move the model's parameters to the chosen device and show which one is used.
model.to(device)
print(device)

In [None]:
def evaluate(dataloader_test):
    """Run the global ``model`` over a data loader without updating it.

    The model is switched to evaluation mode and every batch is moved to the
    global ``device``. Each batch must provide, in this order: input ids,
    attention mask, labels.

    Args:
        dataloader_test: Sized iterable of batches (``len()`` is used to
            average the loss).

    Returns:
        A tuple ``(average_loss, logits, labels)`` where ``average_loss`` is
        the mean of the per-batch losses and ``logits`` / ``labels`` are NumPy
        arrays concatenated over all batches along axis 0.
    """
    model.eval()

    running_loss = 0.0
    logit_chunks = []
    label_chunks = []

    for raw_batch in tqdm(dataloader_test):
        on_device = [tensor.to(device) for tensor in raw_batch]
        targets = on_device[2]

        # Gradients are not needed for evaluation.
        with torch.no_grad():
            outputs = model(
                input_ids=on_device[0],
                attention_mask=on_device[1],
                labels=targets,
            )

        # The model returns the loss first and the logits second.
        running_loss += outputs[0].item()
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(targets.cpu().numpy())

    average_loss = running_loss / len(dataloader_test)
    all_logits = np.concatenate(logit_chunks, axis=0)
    all_labels = np.concatenate(label_chunks, axis=0)

    return average_loss, all_logits, all_labels

In [None]:
# Fine-tune the model: one optimiser step and one scheduler step per batch,
# a checkpoint after every epoch, then an evaluation on the test loader.
checkpoint_path = '/content/drive/MyDrive/CM_MEMES-master/model/bertmodel.h5'

# torch.save does not create missing directories; create the folder up front
# so that a finished epoch is not lost to a failed save.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

for epoch in tqdm(range(1, epochs+1)):
    model.train() # put model in training mode

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train,
                        desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)
    for batch in progress_bar:

        model.zero_grad() # set the gradient to zero

        # Each batch holds three tensors: input ids, attention mask, labels.
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        # get the outputs; passing labels makes the model return the loss first
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

        # BERT returns loss and logits
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward() # backpropagation

        # Rescale the gradients when their overall norm exceeds 1.0
        # (guards against exploding gradients).
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the loss of this batch as reported by the model. It used to be
        # divided by len(batch), which is the number of tensors in the batch
        # (3), not the number of examples.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

    torch.save(model.state_dict(), checkpoint_path)
    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_value = evaluate(dataloader_test)
    val_AUC = auc(predictions, true_value)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'AUC Score: {val_AUC}')

In [None]:
# Build a fresh classifier with the same configuration as the one that was
# trained and place it on the active device. Chaining .to(device) into the
# assignment keeps the cell from echoing the model's repr.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    output_hidden_states=False,
    output_attentions=False,
    num_labels=len(train_data.label.unique()),
).to(device)

In [None]:
# Restore the fine-tuned weights written by the training loop. The checkpoint
# is read onto the CPU first; load_state_dict then copies the values into the
# model's existing parameters.
model.load_state_dict(
    torch.load(
        '/content/drive/MyDrive/CM_MEMES-master/model/bertmodel.h5',
        map_location='cpu',
    )
)

In [None]:
# Score the test loader with the restored model; the average loss is discarded,
# the logits and true labels are kept for the report below.
_, predictions, true_vals = evaluate(dataloader_test)

In [None]:
# Same evaluation on the training loader. That loader was built with
# shuffle=True, so the rows come back in a shuffled order; logits and labels
# are still aligned with each other because they are collected batch by batch.
_, train_predictions, train_true_vals = evaluate(dataloader_train)

In [None]:
from sklearn import metrics

# The predicted class is the index of the largest logit. Thresholding the
# class-1 logit at 0 (the previous approach) ignores the class-0 logit and
# mislabels every example where both logits share the same sign.
pred = np.argmax(predictions, axis=1)

# classification_report expects (y_true, y_pred) in that order; swapping them
# exchanges precision and recall. Class ids follow the label encoding used
# above: 0 = sarcastic, 1 = non-sarcastic.
print(metrics.classification_report(
    true_vals,
    pred,
    labels=[0, 1],
    target_names=['sarcastic', 'non-sarcastic'],
))