In [1]:
import pandas as pd
import torch
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Fetch the NLTK resources requested by this notebook, one download call each.
for _nltk_resource in ('punkt', 'stopwords'):
    nltk.download(_nltk_resource)
import matplotlib.pyplot as plt
import wandb

from transformers import AlbertTokenizer, TFAlbertModel
import numpy as np
import itertools

[nltk_data] Downloading package punkt to /home/student/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/student/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Data pre-processing and data loading functions
# Sample dataset: keeps only the first 3 data rows read from each Excel file


"""
Loading dataset from excel

Return:
    Data file read from Pandas    
"""

# Read the training and validation spreadsheets and keep only the first three
# rows of each as a small sample.
train = pd.read_excel('../dataset/Project_Datasets/Constraint_English_Train.xlsx').head(3)
val = pd.read_excel('../dataset/Project_Datasets/Constraint_English_Val.xlsx').head(3)
    

"""
Clean Text function

Args:
    string: Each line of tweets from the excel file
    
Output:
    The processed tweets according to cleanText criteria
"""

# English stop words held in a set, used by cleanText to filter tokens.
stops = {*stopwords.words("english")}
def cleanText(string, stop_words=None):
    """Normalise one tweet into lower-case text with stop words removed.

    Steps, in order: lower-case and collapse whitespace, blank out links (any
    run of non-space characters beginning at ``http`` or ``www``), drop
    ``&amp``, spell a remaining ``&`` as "and", replace every run of
    non-alphanumeric characters (punctuation, emojis, ...) with a space, and
    finally remove stop words.

    Args:
        string: Raw tweet text. ``None`` and float NaN (e.g. a missing cell in
            a loaded spreadsheet) are treated as an empty tweet; any other
            non-string value is converted with ``str()``.
        stop_words: Optional collection of words to drop. When omitted, the
            module-level ``stops`` set is used.

    Returns:
        The cleaned tweet as a single space-separated string ("" when nothing
        is left).
    """
    # NaN is the only float that is not equal to itself.
    if string is None or (isinstance(string, float) and string != string):
        return ""
    if not isinstance(string, str):
        string = str(string)
    if stop_words is None:
        stop_words = stops

    text = " ".join(string.lower().split())     # lower-case, collapse whitespace
    text = re.sub(r"http\S+", " ", text)        # remove links
    text = re.sub(r"www\S+", " ", text)         # remove links
    text = text.replace("&amp", " ")            # drop the HTML-escaped ampersand
    text = text.replace("&", " and ")           # spell out a bare "&"
    text = re.sub(r"[^0-9a-zA-Z]+", " ", text)  # keep only letters and digits
    return " ".join(word for word in text.split() if word not in stop_words)


"""
Data loader function

Args:
    input_data: Dataset excel file read from Pandas
    
Output: 
    List of lists, each list is of [tweets, label]
"""  

def load_data(input_data):
    """Convert a dataset DataFrame into cleaned ``[tweet, label]`` pairs.

    Args:
        input_data: DataFrame read from the dataset Excel file. The tweet text
            is taken from the second column and the label from the third,
            both by position.

    Returns:
        List of ``[tweets, label_out]`` lists, where ``tweets`` is the output
        of ``cleanText`` and ``label_out`` is 1 when the label equals "real"
        and 0 otherwise.
    """
    output = []
    # Plain tuples give true positional access; integer keys on the Series
    # produced by iterrows() rely on a deprecated positional fallback.
    for row in input_data.itertuples(index=False, name=None):
        tweets = cleanText(row[1])
        label_out = 1 if row[2] == "real" else 0
        output.append([tweets, label_out])
    return output


# Sanity check: print the cleaned [tweet, label] pairs for the sample training rows.
print(load_data(train))

[['cdc currently reports 99031 deaths general discrepancies death counts different sources small explicable death toll stands roughly 100000 people today', 1], ['states reported 1121 deaths small rise last tuesday southern states reported 640 deaths', 1], ['politically correct woman almost uses pandemic excuse reuse plastic bag coronavirus nashville', 0]]


In [3]:
# Loading Train, Val data in Pandas.DataFrame format

# Pass the column names to the constructor so that an empty dataset still
# yields a frame with "tweets"/"labels" columns instead of raising when the
# names are assigned afterwards.
train_data = load_data(train)
train_df = pd.DataFrame(train_data, columns=["tweets", "labels"])

val_data = load_data(val)
val_df = pd.DataFrame(val_data, columns=["tweets", "labels"])

In [14]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

"""
The following set of hyperparameters are the optimized hyperparameters
"""
# Training/evaluation settings for the ALBERT classifier.
model_args = ClassificationArgs()
model_args.train_batch_size = 64
model_args.learning_rate = 2.15e-5
model_args.max_seq_length = 256
model_args.num_train_epochs = 42
model_args.overwrite_output_dir = True

# NOTE(review): no training call appears between this constructor and the
# predict calls later in this cell, and the load log reports a newly
# initialised classifier head -- confirm the model is trained (or a fine-tuned
# checkpoint is loaded) before its predictions are interpreted.
model = ClassificationModel(
    'albert',
    'albert-base-v2',
    num_labels=2,
    # Use the GPU only when one is present so the notebook also runs on
    # CPU-only machines instead of failing at construction.
    use_cuda=torch.cuda.is_available(),
    args=model_args,
)

# Evaluating the Model

# Read the labelled test spreadsheet and keep two sample rows (positions 1 and 2).
# NOTE(review): the slice starts at 1, so the first row is skipped, unlike the
# train/val samples -- confirm this is intended.
test = pd.read_excel('../dataset/Project_Datasets/english_test_with_labels.xlsx').iloc[1:3]

"""
Predict function 

Input:
    input_data: The test dataset to be used for evaluation
    
Output:
    List of lists with each line in the form of [tweet, ground_truth, prediction]
        for line in the test dataset excel
"""
def predict(input_data):
    """Run the classifier on every row of a test DataFrame.

    Args:
        input_data: Test dataset DataFrame. The tweet text is read from the
            second column and the ground-truth label from the third, both by
            position.

    Returns:
        List of ``[tweet, ground_truth, prediction]`` lists, one per row.
        ``tweet`` is the cleaned text and ``prediction`` is the one-element
        slice of the model's predictions belonging to that row. An empty
        DataFrame yields an empty list without calling the model.
    """
    # Plain tuples give true positional access; integer keys on the Series
    # produced by iterrows() rely on a deprecated positional fallback.
    rows = list(input_data.itertuples(index=False, name=None))
    if not rows:
        return []

    tweets = [cleanText(row[1]) for row in rows]
    # One batched call instead of one model invocation per row.
    predictions, _ = model.predict(tweets)
    # Slice (rather than index) so each entry stays a one-element sequence,
    # matching what a single-text predict call returns.
    return [
        [tweet, row[2], predictions[i:i + 1]]
        for i, (tweet, row) in enumerate(zip(tweets, rows))
    ]

pred_output = predict(test)

# Display the sample prediction results
print("------ Evaluate predication ------")
print(" Test text - Ground truth - Prediction ")
for tweet, gt_label, prediction in pred_output:
    # Each prediction is a one-element sequence; extract the scalar explicitly
    # instead of relying on implicit array-to-int conversion inside "%d".
    print("%s - %s - %d\n" % (tweet, gt_label, np.ravel(prediction)[0]))


Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You sho

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

------ Evaluate predication ------
 Test text - Ground truth - Prediction 
alfalfa cure covid 19 - fake - 0

president trump asked would catch coronavirus donaldtrump coronavirus - fake - 0

