# Twitter Sentiment Analysis

### Benchmarking different Tokenizers and Classifiers

### Importing python libraries

In [2]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.stem.porter import * 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from sklearn.model_selection import train_test_split
import gensim
import torch
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from tqdm import tqdm 
from gensim.models.doc2vec import TaggedDocument
import time
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
import torch.nn as nn
from transformers import BertModel,BertConfig,BertForMaskedLM, BertForSequenceClassification
import random
import torch.nn.functional as F


### Loading Data
We used Sentiment140 dataset from stanford

In [3]:
## update path according to your directory
data1 = pd.read_csv('/scratch/pm3140/training.1600000.processed.noemoticon.csv',encoding='ISO-8859-1',header=None,names=['sentiment','id','date','query','account_id','tweet'])

pd.options.mode.chained_assignment = None  # default='warn'
data1 = data1.sample(frac = 1) ## shuffling the data
data_copy = data1 ## making a copy of the data
data_copy = data1[:1000]  ## Uncomment for selecting a subset of the dataset
data_copy.head()

Unnamed: 0,sentiment,id,date,query,account_id,tweet
369726,0,2049957854,Fri Jun 05 17:48:24 PDT 2009,NO_QUERY,hepkitten,@ryanlrussell its graduation and wedding seaso...
582096,0,2214498731,Wed Jun 17 16:49:48 PDT 2009,NO_QUERY,ICJenny,i missed adam lambert's live chat on comcast b...
1373836,4,2051364957,Fri Jun 05 21:48:38 PDT 2009,NO_QUERY,debbbbbie,i really want @ddlovato 's people to email me ...
1346806,4,2044384786,Fri Jun 05 09:16:47 PDT 2009,NO_QUERY,geonz,Just remembered FONGING. Must bring materials...
1047292,4,1957752145,Fri May 29 01:29:54 PDT 2009,NO_QUERY,darrenporter,@NikkiPilkington cool...... thanks


### Shuffling, Relabeling data. 
0 - negative tweet
1 - positive tweet

In [4]:
data_copy['sentiment'].replace({4: 1}, inplace=True) ## positive tweets are labeled 4, mapping them to 1
data_copy.head()

Unnamed: 0,sentiment,id,date,query,account_id,tweet
369726,0,2049957854,Fri Jun 05 17:48:24 PDT 2009,NO_QUERY,hepkitten,@ryanlrussell its graduation and wedding seaso...
582096,0,2214498731,Wed Jun 17 16:49:48 PDT 2009,NO_QUERY,ICJenny,i missed adam lambert's live chat on comcast b...
1373836,1,2051364957,Fri Jun 05 21:48:38 PDT 2009,NO_QUERY,debbbbbie,i really want @ddlovato 's people to email me ...
1346806,1,2044384786,Fri Jun 05 09:16:47 PDT 2009,NO_QUERY,geonz,Just remembered FONGING. Must bring materials...
1047292,1,1957752145,Fri May 29 01:29:54 PDT 2009,NO_QUERY,darrenporter,@NikkiPilkington cool...... thanks


### Data preprocessing

#### Function to remove unimportant patterns in the tweet

In [5]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
    return input_txt

In [6]:
%%time 
#Removing Twitter Handles
data_copy['clean_tweet'] = np.vectorize(remove_pattern)(data_copy['tweet'], "@[\w]*") 
#Removing Punctuations, Numbers, and Special Characters
data_copy.clean_tweet = data_copy.clean_tweet.str.replace("[^a-zA-Z#]", " ")
# Removing Short/abbrevated Words
data_copy.clean_tweet = data_copy.clean_tweet.apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))

CPU times: user 215 ms, sys: 85.1 ms, total: 300 ms
Wall time: 363 ms




In [7]:
## removing columns not required for classification 
data_copy = data_copy.drop(['query','date','account_id'],axis = 1) 
data_copy.head(10)

Unnamed: 0,sentiment,id,tweet,clean_tweet
369726,0,2049957854,@ryanlrussell its graduation and wedding seaso...,graduation wedding season work week runs thru ...
582096,0,2214498731,i missed adam lambert's live chat on comcast b...,missed adam lambert live chat comcast because ...
1373836,1,2051364957,i really want @ddlovato 's people to email me ...,really want people email back cause being musi...
1346806,1,2044384786,Just remembered FONGING. Must bring materials...,Just remembered FONGING Must bring materials G...
1047292,1,1957752145,@NikkiPilkington cool...... thanks,cool thanks
385859,0,2053684801,@Nanette1 I know I was gonna stop at duncan do...,know gonna stop duncan donuts cause never them...
1425895,1,2059132991,Just back from an excellent concert from the G...,Just back from excellent concert from Gardiner...
1197511,1,1984985574,Once again Minia put together a meal worthy of...,Once again Minia together meal worthy royalty ...
399163,0,2057079950,@DanaCortez Hope you get to hug him and kiss ...,Hope kiss soon
902104,1,1694365323,@FaithfulChosen I'm fine thx. Had 3 days off a...,fine days took advantage them tired Waiting pi...


#### Data normalization

In [8]:
%%time
## converting tweets into tokens
tokenized_tweet = data_copy.clean_tweet.apply(lambda x: x.split())
stemmer = PorterStemmer() 

##The Porter stemming algorithm (or 'Porter stemmer') is a process for removing the 
## commoner morphological and inflexional endings from words in English
tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
tokenized_tweet.head()

## reforming tweet from tokens
for i in range(len(tokenized_tweet)):
    tokenized_tweet.iloc[i] = ' '.join(tokenized_tweet.iloc[i])    
# data_copy['clean_tweet'] = tokenized_tweet

CPU times: user 130 ms, sys: 1.85 ms, total: 132 ms
Wall time: 131 ms


In [9]:
data_copy.head(10)

Unnamed: 0,sentiment,id,tweet,clean_tweet
369726,0,2049957854,@ryanlrussell its graduation and wedding seaso...,graduation wedding season work week runs thru ...
582096,0,2214498731,i missed adam lambert's live chat on comcast b...,missed adam lambert live chat comcast because ...
1373836,1,2051364957,i really want @ddlovato 's people to email me ...,really want people email back cause being musi...
1346806,1,2044384786,Just remembered FONGING. Must bring materials...,Just remembered FONGING Must bring materials G...
1047292,1,1957752145,@NikkiPilkington cool...... thanks,cool thanks
385859,0,2053684801,@Nanette1 I know I was gonna stop at duncan do...,know gonna stop duncan donuts cause never them...
1425895,1,2059132991,Just back from an excellent concert from the G...,Just back from excellent concert from Gardiner...
1197511,1,1984985574,Once again Minia put together a meal worthy of...,Once again Minia together meal worthy royalty ...
399163,0,2057079950,@DanaCortez Hope you get to hug him and kiss ...,Hope kiss soon
902104,1,1694365323,@FaithfulChosen I'm fine thx. Had 3 days off a...,fine days took advantage them tired Waiting pi...


#### Checking if GPU is available

In [10]:
## use cuda if available
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla V100-PCIE-32GB


#### Splitting data into X and Y

In [11]:
### data(tweets) and labels
x = data_copy.clean_tweet.values
y = data_copy.sentiment.values

### Classifiers

Function to fit different ML algorithms on the vectorized data

In [12]:
def classify(x_train,x_val,y_train,y_val):
    # Logistic Regression
    start = time.time()
    lreg = LogisticRegression(solver='lbfgs') 
    lreg.fit(x_train, y_train) 
    stop = time.time()
    print('time taken:',stop-start)
    prediction = lreg.predict_proba(x_val) # predicting on the validation set 
    preds = prediction[:,1] >= 0.4 # if prediction is greater than or equal to 0.4 than 1 else 0 
    preds = preds.astype(int)

    ## Logistic Regression Accuracy
    print('Logistic regression Accuracy and F1 Score:')
    print(accuracy_score(preds,y_val)) # calculating accuracy for the validation set
    print(f1_score(preds,y_val)) # calculating f1 score for the validation set

    ## SVM
    start = time.time()
    clf_svm = SVC(kernel='rbf', C=1, probability=True,max_iter=10000)
    clf_svm.fit(x_train, y_train) 
    stop = time.time()
    print('time taken:',stop-start)
    prediction = clf_svm.predict_proba(x_val) # predicting on the validation set 
    preds = prediction[:,1] >= 0.4 # if prediction is greater than or equal to 0.4 than 1 else 0 
    preds = preds.astype(int)

    ## SVM accuracy
    print('SVM Accuracy and F1 Score:')
    print(accuracy_score(preds,y_val)) # calculating accuracy for the validation set
    print(f1_score(preds,y_val)) # calculating f1 score for the validation set


    # ### Random Forrest
    start = time.time()
    clf_rf= RandomForestClassifier(n_estimators=100)
    clf_rf.fit(x_train, y_train) 
    stop = time.time()
    print('time taken:',stop-start)
    prediction = clf_rf.predict_proba(x_val) # predicting on the validation set 
    preds = prediction[:,1] >= 0.4 # if prediction is greater than or equal to 0.4 than 1 else 0 
    preds = preds.astype(int)

    ## Random Forrest accuracy
    print('Random Forrest Accuracy and F1 Score:')
    print(accuracy_score(preds,y_val)) # calculating accuracy for the validation set
    print(f1_score(preds,y_val)) # calculating f1 score for the validation set




## Tokenizers

### TF-IDF Tokenizer

In [13]:
%%time
x_train,x_val,y_train,y_val = train_test_split(x,y,shuffle=True,test_size=0.2)
tf_idf = TfidfVectorizer(smooth_idf=False)
x_train = tf_idf.fit_transform(x_train)
x_val = tf_idf.transform(x_val)


CPU times: user 18.4 ms, sys: 760 µs, total: 19.2 ms
Wall time: 17.7 ms


#### Classification

In [14]:
classify(x_train,x_val,y_train,y_val)

time taken: 0.41712021827697754
Logistic regression Accuracy and F1 Score:
0.61
0.7022900763358778
time taken: 0.2205650806427002
SVM Accuracy and F1 Score:
0.625
0.6781115879828327
time taken: 0.22633790969848633
Random Forrest Accuracy and F1 Score:
0.565
0.5876777251184834


### Bag of words tokenizer

In [15]:
%%time
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
x_bow = bow_vectorizer.fit_transform(x)
x_train,x_val,y_train,y_val = train_test_split(x_bow,y,shuffle=True,test_size=0.2)

CPU times: user 15.6 ms, sys: 257 µs, total: 15.9 ms
Wall time: 14.5 ms


#### Classsification

In [16]:
classify(x_train,x_val,y_train,y_val)

time taken: 0.019404172897338867
Logistic regression Accuracy and F1 Score:
0.655
0.7136929460580913
time taken: 0.13352560997009277
SVM Accuracy and F1 Score:
0.585
0.6693227091633466
time taken: 0.21800565719604492
Random Forrest Accuracy and F1 Score:
0.595
0.6610878661087866


### Word2Vec Tokenizer

In [17]:
%%time
# tokenizing clean tweet
tokenized_tweet = data_copy['clean_tweet'].apply(lambda x: x.split()) 

## defining the word2vec model
model_w2v = gensim.models.Word2Vec(tokenized_tweet,vector_size=200,window=5,
            min_count=2,sg = 1,hs = 0,negative = 10,workers= 32
) 

## training the word2vec model
model_w2v.train(tokenized_tweet, total_examples= len(data_copy['clean_tweet']), epochs=100)

CPU times: user 4.06 s, sys: 382 ms, total: 4.45 s
Wall time: 4.17 s


(401883, 694700)

In [20]:
def find_vector(tokens, size):
    vector_ = np.zeros(size).reshape((1, size))
    count = 0
    for word in tokens:
        try:
            vector_ += model_w2v.wv[word].reshape((1, size)) ## vector for the word 1x200
            count += 1.
        except KeyError:  # token is not in vocabulary
            continue
    if count != 0: ## normalizing the vector
        vector_ /= count
    return vector_

In [21]:
wordvec_arrays = np.zeros((len(tokenized_tweet), 200)) 
### forming the dataframe of vectors representing the tweets
for i in range(len(tokenized_tweet)):
    wordvec_arrays[i,:] = find_vector(tokenized_tweet.iloc[i], 200)
wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.shape
x_train,x_val,y_train,y_val = train_test_split(wordvec_df,y,shuffle=True,test_size=0.2)

#### Classification

In [22]:
classify(x_train,x_val,y_train,y_val)

time taken: 0.12934613227844238
Logistic regression Accuracy and F1 Score:
0.56
0.6206896551724138
time taken: 0.37081217765808105
SVM Accuracy and F1 Score:
0.57
0.6416666666666667
time taken: 0.46596598625183105
Random Forrest Accuracy and F1 Score:
0.565
0.6614785992217899


### Doc2vec Tokenizer

In [23]:
tqdm.pandas(desc="progress-bar") 
# label all the tweets
def add_label(tweet):
    output = []
    for i, s in zip(tweet.index, tweet):
        output.append(TaggedDocument(s, ["tweet_" + str(i)]))
    return output

labeled_tweets = add_label(tokenized_tweet) 

In [24]:
%%time
### defining model paramters
model_d2v = gensim.models.Doc2Vec(dm=1,dm_mean=1,vector_size=200,window=5, 
                                  negative=7,min_count=5,workers=32,alpha=0.1)

## forming vocabulary of words
model_d2v.build_vocab([i for i in tqdm(labeled_tweets)])

## training the doc2vec model
model_d2v.train(labeled_tweets, total_examples= len(data_copy['clean_tweet']), epochs=100)

100%|██████████| 1000/1000 [00:00<00:00, 2820648.29it/s]


CPU times: user 4.06 s, sys: 349 ms, total: 4.41 s
Wall time: 4.21 s


In [25]:
docvec_arrays = np.zeros((len(tokenized_tweet), 200)) 

### forming the dataframe of vectors representing the tweets
for i in range(len(x)):
    docvec_arrays[i,:] = model_d2v.dv[i].reshape((1,200))    

docvec_df = pd.DataFrame(docvec_arrays) 
docvec_df.shape
x_train,x_val,y_train,y_val = train_test_split(docvec_df,y,shuffle=True,test_size=0.2)

### Classification

In [26]:
classify(x_train,x_val,y_train,y_val)

time taken: 0.045105934143066406
Logistic regression Accuracy and F1 Score:
0.635
0.6919831223628692
time taken: 0.30423951148986816
SVM Accuracy and F1 Score:
0.585
0.6914498141263941
time taken: 0.481886625289917
Random Forrest Accuracy and F1 Score:
0.545
0.6539923954372623


### Bert Tokenizer

In [27]:
def bert_preprocessing(text):
    #Removing Twitter Handles
    text = re.sub(r'(@.*?)[\s]', ' ', text)
    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)
    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [28]:


# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Function to tokenize a set of texts
def bert_tokens(data):
    input_ids = []
    attention_masks = []

    for d_ in data:
        encoded_data = tokenizer.encode_plus(
            text=bert_preprocessing(d_),  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,            
            pad_to_max_length=True,         # Pad sentence to max length
            return_attention_mask=True      # Return attention mask
            )
        
        # Add the outputs to the lists
        input_ids.append(encoded_data.get('input_ids'))
        attention_masks.append(encoded_data.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [29]:
# Encoding the tweets
start = time.time()
encoded_tweets = [tokenizer.encode(data, add_special_tokens=True) for data in data_copy.tweet]
# Find the maximum length
max_len = max([len(sent) for sent in encoded_tweets])
print('Max length: ', max_len)
stop = time.time()
print('time taken:',stop-start)

Max length:  82
time taken: 0.3108561038970947


In [30]:
X_train, X_val, y_train, y_val =train_test_split(x, y, test_size=0.1, random_state=2020)

In [31]:
MAX_LEN = max_len
token_ids = list(bert_tokens([x[0]])[0].squeeze().numpy())
print('Tweet: ', x[0])
print('Token ids: ', token_ids)

## tokenizing data
train_inputs, train_masks = bert_tokens(X_train)
val_inputs, val_masks = bert_tokens(X_val)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Tweet:  graduation wedding season work week runs thru weekends
Token ids:  [101, 7665, 5030, 2161, 2147, 2733, 3216, 27046, 13499, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


### Classification

In [32]:
## converting tensors to numpy for classification with ML algorithms
x_train_n = train_inputs.cpu().detach().numpy()
x_val_n = val_inputs.cpu().detach().numpy()

classify(x_train_n,x_val_n,y_train,y_val)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


time taken: 0.08398175239562988
Logistic regression Accuracy and F1 Score:
0.45
0.5925925925925926
time taken: 0.22464585304260254
SVM Accuracy and F1 Score:
0.44
0.6111111111111112
time taken: 0.16375088691711426
Random Forrest Accuracy and F1 Score:
0.46
0.5781249999999999


### Fine Tuned Bert Classifier

#### Data loaders

In [33]:
# Convert data to Tensors
train_labels = torch.tensor(y_train)
val_labels = torch.tensor(y_val)

batch_size = 32

# Training data dataloader
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation data dataloader
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

#### Bert classifier class

In [34]:
%%time
class BertClassifier(nn.Module):
    def __init__(self, freeze_bert=False):

        super(BertClassifier, self).__init__()
        D_in, H, D_out = 768, 50, 2 # Hidden size of BERT, hidden size of our classifier, and number of labels
        
        ## using pretrained bert model
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        
        
        # defining classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            #nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        
        outputs = self.bert(input_ids=input_ids,attention_mask=attention_mask)
        
        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        logits = self.classifier(last_hidden_state_cls)

        return logits

CPU times: user 48 µs, sys: 5 µs, total: 53 µs
Wall time: 57.5 µs


#### Function to initialize bert model

In [35]:
def create_model(epochs=4):
    bert_classifier = BertClassifier(freeze_bert=False)
    bert_classifier.to(device)
    
    optimizer = AdamW(bert_classifier.parameters(),lr=5e-5,eps=1e-8  )

    total_steps = len(train_dataloader) * epochs

    scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0, num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

#### Train and evaluation functions

In [36]:
loss_fn = nn.CrossEntropyLoss()
def set_seed(seed_value=4):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    print("Training started...\n")
    for epoch_i in range(epochs):
        
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        t0_epoch, t0_batch = time.time(), time.time()

        total_loss, batch_loss, batch_counts = 0, 0, 0

        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            model.zero_grad()

            logits = model(b_input_ids, b_attn_mask)

            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            loss.backward()

            # Clip gradients to 1.0 to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            # Print the loss values after 50 batches, you can change this value
            if (step % 200 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                time_elapsed = time.time() - t0_batch

                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        
        if evaluation == True:
            val_loss, val_accuracy = evaluate(model, val_dataloader)
            __, train_accuracy = evaluate(model, train_dataloader)

            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
            print("Train Accuracy:",train_accuracy)
        print("\n")
    
    print("Training complete!")


def evaluate(model, val_dataloader):
    model.eval()

    val_accuracy = []
    val_loss = []

    for batch in val_dataloader:
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        preds = torch.argmax(logits, dim=1).flatten()

        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

#### Training the bert classifier

In [37]:
%%time
set_seed(4)    # Set seed for reproducibility
bert_classifier, optimizer, scheduler = create_model(epochs=5)
train(bert_classifier, train_dataloader, val_dataloader, epochs=5, evaluation=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training started...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   28    |   0.677812   |     -      |     -     |   13.35  
----------------------------------------------------------------------
   1    |    -    |   0.677812   |  0.682677  |   55.47   |   17.06  
----------------------------------------------------------------------
Train Accuracy: 76.29310344827586


 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   2    |   28    |   0.535442   |     -      |     -     |   10.03  
----------------------------------------------------------------------
   2    |    -    |   0.535442   |  0.741207  |   58.59   |   13.75  
----------------------------------------------------------------------
Train Accuracy: 87.06896551724138


 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  

#### Prediction function for the BERT Classifier

In [38]:
def pred_fun(model, test_dataloader):
    model.eval()

    all_logits = []

    for batch in test_dataloader:
        b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)[:2]

        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        all_logits.append(logits)
    
    all_logits = torch.cat(all_logits, dim=0)

    probs = F.softmax(all_logits, dim=1).cpu().numpy()

    return probs

#### Calculating accuracy and F1 score

In [39]:
probs = pred_fun(bert_classifier, val_dataloader)
preds = probs[:,1] >= 0.4 # if prediction is greater than or equal to 0.4 than 1 else 0 
preds = preds.astype(int)
# Evaluate the Bert classifier
print('Accuracy and F1 score of bert classifier:')
print(accuracy_score(preds, y_val))
print(f1_score(preds, y_val))

Accuracy and F1 score of bert classifier:
0.66
0.6458333333333334
