<a href="https://colab.research.google.com/github/renuchaurasia/renu_INFO5731_Spring2023/blob/main/project_info5731/INFO_5731_Modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Country Image Evaluation by Aspect Based Sentiment Analysis


In [None]:
# installing libraries
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Imports

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import BertModel, BertTokenizer

from collections import OrderedDict


In [None]:
# Read the training tweets (with aspect columns) and the tweets to be scored.
# NOTE(review): both paths assume Google Drive is already mounted at /content/drive.

train_csv_path = "/content/drive/MyDrive/Colab Notebooks/cleaned_train_text.csv"
predict_csv_path = "/content/drive/MyDrive/Colab Notebooks/cleaned_predict_text.csv"

df_train = pd.read_csv(train_csv_path, header=0)
df_predict = pd.read_csv(predict_csv_path, header=0)

# Cell output: (rows, columns) of each frame.
df_train.shape, df_predict.shape

((193, 10), (134201, 6))

In [None]:
df_train.head()

Unnamed: 0,Full Text,Sentiment,Domain,Date,Page Type,Country Character,Country Competence,People Character,People Competence,text
0,RT @StephenMcDonell #China’s Henan province to...,negative,twitter.com,59:36.0,twitter,0.0,0.0,-1.0,0.0,china s henan province today everywhere we re ...
1,RT @Bsintash #VirusThreatInCamps We call all i...,neutral,twitter.com,58:18.0,twitter,-1.0,0.0,0.0,0.0,virusthreatincamps call international media ou...
2,RT @RT_com #Spain confirms first case of fast-...,neutral,twitter.com,57:39.0,twitter,0.0,0.0,0.0,-1.0,spain confirms first case fastspreading corona...
3,@kerrya11en That is amazing and cute.I am 100%...,positive,twitter.com,56:54.0,twitter,0.0,0.0,1.0,0.0,amazing cutei supporting guys hope fight virus...
4,RT @VistaPResearch United joins Delta and Amer...,neutral,twitter.com,56:46.0,twitter,0.0,-1.0,0.0,0.0,united joins delta american halting flights ch...


In [None]:
df_predict.head()

Unnamed: 0,Full Text,Sentiment,Domain,Date,Page Type,text
0,AMANPURI EXCHANGE has the highest referral fee...,neutral,twitter.com,00:23.0,twitter,amanpuri exchange highest referral fee industr...
1,RT @SenRubioPress While the #coronavirus sprea...,neutral,twitter.com,00:22.0,twitter,coronavirus spreads must continue take necessa...
2,RT @cjwerleman “Our only sin was having accept...,neutral,twitter.com,00:14.0,twitter,our sin accepted islam we re religious family ...
3,RT @le_french_mulu For the people who want to ...,negative,twitter.com,00:14.0,twitter,people want know like empty mall china coronav...
4,RT @SenRubioPress Confirmed #coronavirus cases...,neutral,twitter.com,00:04.0,twitter,confirmed coronavirus cases xinjiang highlight...


In [None]:
# Keep only the cleaned tweet text, as a one-column frame, for scoring.
predict_df = df_predict['text'].to_frame()
predict_df.head()

Unnamed: 0,text
0,amanpuri exchange highest referral fee industr...
1,coronavirus spreads must continue take necessa...
2,our sin accepted islam we re religious family ...
3,people want know like empty mall china coronav...
4,confirmed coronavirus cases xinjiang highlight...


In [None]:
predict_df.columns

Index(['text'], dtype='object')

In [None]:
df_train.columns

Index(['Full Text', 'Sentiment', 'Domain', 'Date', 'Page Type',
       'Country Character', 'Country Competence', 'People Character',
       'People Competence', 'text'],
      dtype='object')

In [None]:
# data['text'] = data['text_modified']

# data.drop(columns=['Unnamed: 0', 'Full Text', 'Sentiment', 'text_modified'], inplace=True)
# data.columns
# data

# Implementation

In [None]:
# Generating fixed-length input_ids / attention_mask and aspect targets for the BERT transformer

class AspectDetectionDataset(Dataset):
    """Map-style dataset turning one tweet row into BERT inputs plus aspect targets.

    Args:
        data: DataFrame with a 'text' column and the four columns listed in
            ``ASPECT_COLUMNS`` (values -1, 0 or 1 in the training file).
        tokenizer: object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: every item is padded/truncated to this many tokens so that
            the default DataLoader collate can stack items into a batch.

    Each item is a dict with:
        'input_ids', 'attention_mask': 1-D tensors of length ``max_len``.
        'aspects': float32 tensor with the raw aspect values.
        'labels': the same values rescaled with (x + 1) / 2 so they lie in
            [0, 1], which is the target range nn.BCELoss requires. With the
            -1/0/1 encoding this gives 0 / 0.5 / 1, lining up with the
            0.33 / 0.66 cut-offs used when scores are mapped back to sentiments.
            NOTE(review): confirm this target encoding is the intended one.
    """

    ASPECT_COLUMNS = ['Country Character', 'Country Competence', 'People Character', 'People Competence']

    def __init__(self, data, tokenizer, max_len=128):
        # Rows are fetched by position, so drop whatever index labels the
        # caller's frame carries (filtered / shuffled frames are not 0..n-1).
        self.data = data.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        # An empty cleaned tweet is read back from CSV as NaN; tokenize it as ''.
        raw_text = row['text']
        text = '' if pd.isna(raw_text) else str(raw_text)

        # A row of a mixed-type frame is object-typed; force a numeric tensor so
        # the batch can be collated and fed to the loss.
        aspects = torch.tensor(row[self.ASPECT_COLUMNS].to_numpy(dtype='float32'))

        tokens = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': tokens['input_ids'].squeeze(0),
            'attention_mask': tokens['attention_mask'].squeeze(0),
            'aspects': aspects,
            'labels': (aspects + 1.0) / 2.0,
        }


In [None]:
# Using Pre-trained BERT model for Sentiment detection for the predefined aspects

class AspectDetectionModel(nn.Module):
    """BERT encoder followed by a linear + sigmoid head, one output per aspect.

    Args:
        input_size: in_features of the linear head; must equal the encoder's
            hidden size (768 for 'bert-base-uncased').
        num_aspects: number of outputs produced per tweet.
    """

    def __init__(self, input_size, num_aspects):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.linear = nn.Linear(input_size, num_aspects)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask=None):
        """Return a (batch, num_aspects) tensor of sigmoid scores in (0, 1).

        The tweet representation is the mean of the encoder's last hidden
        states over real tokens only. Padding positions (attention_mask == 0)
        are excluded so the score of a tweet does not depend on how much
        padding was added to it. Without a mask, all positions are averaged.
        """
        # Pass input through BERT encoder
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state

        if attention_mask is None:
            tweet_representation = hidden_states.mean(dim=1)
        else:
            mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
            # clamp guards against division by zero for an all-padding row
            token_counts = mask.sum(dim=1).clamp(min=1.0)
            tweet_representation = (hidden_states * mask).sum(dim=1) / token_counts

        # Pass tweet representation through linear layer and sigmoid activation function
        return self.sigmoid(self.linear(tweet_representation))

In [None]:
# Shared constants
MAX_LEN = 128        # not referenced by the other visible cells
BATCH_SIZE = 32

# Training hyper-parameters
num_epochs = 5
batch_size = BATCH_SIZE   # lower-case alias used when the training DataLoader is built
learning_rate = 0.0001

In [None]:
# Load the pretrained tokenizer, then wrap the training frame in the dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset = AspectDetectionDataset(data=df_train, tokenizer=tokenizer)

In [None]:
# Batch the training dataset, reshuffled on every pass
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=batch_size)

In [None]:
# Build the model: a linear head with 768 input features and 4 aspect outputs
model = AspectDetectionModel(num_aspects=4, input_size=768)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Adam over all model parameters; binary cross-entropy as the training loss
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)
criterion = nn.BCELoss()

In [None]:
# Model training
def train(model, train_loader, num_epochs, optimizer, criterion):
    """Fit ``model`` on ``train_loader`` for ``num_epochs`` epochs.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        train_loader: iterable of dict batches with the keys 'input_ids',
            'attention_mask' and 'labels'.
        num_epochs: number of full passes over ``train_loader``.
        optimizer: optimizer bound to ``model``'s parameters.
        criterion: loss called as ``criterion(outputs, labels)``.

    Returns:
        The same ``model`` instance, updated in place.

    Progress is printed every 10 batches and once more at the end of each
    epoch, so loaders with fewer than 10 batches still report their loss.
    """
    model.train()

    # Batches are moved to wherever the model lives (no-op on CPU).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    for epoch in range(num_epochs):
        window_loss = 0.0   # loss accumulated since the last 10-batch report
        epoch_loss = 0.0
        num_batches = 0
        for i, batch in enumerate(train_loader):
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']
            if device is not None:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)
            optimizer.zero_grad()

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Float targets (e.g. float64 coming from pandas/NumPy) must share the
            # outputs' dtype; integer class-index targets are left untouched.
            if labels.is_floating_point():
                labels = labels.to(outputs.dtype)
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            # Print running loss
            loss_value = loss.item()
            window_loss += loss_value
            epoch_loss += loss_value
            num_batches += 1
            if (i+1) % 10 == 0:
                batch_loss = window_loss / 10
                print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{i+1}], Loss: {batch_loss:.4f}")
                window_loss = 0.0

        if num_batches:
            print(f"Epoch [{epoch+1}/{num_epochs}], Average loss: {epoch_loss / num_batches:.4f}")

    print('Finished training')

    return model

In [None]:
# Prediction
def predict(model, data_loader):
    """Run ``model`` over every batch of ``data_loader`` without gradients.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: iterable of dict batches with the keys 'input_ids' and
            'attention_mask'.

    Returns:
        NumPy array with the model outputs of all batches stacked along
        axis 0, in loader order. An empty loader yields an empty array
        instead of raising.
    """
    model.eval()

    # Batches are moved to wherever the model lives (no-op on CPU).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    preds = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            if device is not None:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds.append(outputs.cpu().numpy())

    if not preds:
        # np.concatenate refuses an empty list
        return np.empty((0, 0), dtype=np.float32)
    return np.concatenate(preds, axis=0)


In [None]:
# Create Data Loader

def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=True, num_workers=4):
    """Wrap ``df`` in an AspectDetectionDataset and return a DataLoader over it.

    Args:
        df: DataFrame in the layout AspectDetectionDataset reads (a 'text'
            column plus the four aspect columns).
        tokenizer: tokenizer handed to the dataset.
        max_len: token length to pad/truncate to. It is stored on the dataset
            as the ``max_len`` attribute; whether it is applied depends on the
            dataset's ``__getitem__``.
        batch_size: number of items per batch.
        shuffle: reshuffle on every pass (default True, as before). Pass False
            when the output order must match the row order of ``df``.
        num_workers: DataLoader worker processes (default 4, as before).
    """
    # The dataset is built from the frame itself, not from per-column arrays.
    ds = AspectDetectionDataset(df, tokenizer)
    # Set as an attribute rather than a constructor argument so this works
    # whether or not the dataset's constructor accepts a length.
    ds.max_len = max_len
    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers
    )


In [None]:
# Empty frame that collects one row per scored tweet: four aspect columns plus the tweet text
result_columns = ['Country Character', 'Country Competence', 'People Character', 'People Competence', 'Tweet']
df_result = pd.DataFrame(columns=result_columns)


In [None]:
df_result.head()

Unnamed: 0,Country Character,Country Competence,People Character,People Competence,Tweet


In [None]:
# result_dict = {'Country Character': [],
#               'Country Competence': [],
#               'People Character': [],
#               'People Competence': []
#               }

In [None]:
#  Function to make predictions on a single tweet

label_names = ['Country Character', 'Country Competence', 'People Character', 'People Competence']


def _score_to_sentiment(score, low=0.33, high=0.66):
    """Bucket a score in [0, 1] into -1 (<= low), 0 (<= high) or 1 (> high)."""
    if score <= low:
        return -1
    if score <= high:
        return 0
    return 1


def predict_sentiment(tweet_text, max_length=64):
    """Score one tweet and append the result as a new row of ``df_result``.

    Uses the module-level ``tokenizer``, ``model`` and ``df_result``.

    Args:
        tweet_text: cleaned tweet text.
        max_length: token length the tweet is padded/truncated to.

    Returns:
        Always 0; the result is delivered through the row appended to
        ``df_result`` (four aspect sentiments in ``label_names`` order,
        followed by the tweet text).

    The model already ends in a sigmoid, so output i is used directly as the
    score of ``label_names[i]``. No softmax is applied on top: that would
    force the four scores to sum to 1, pulling them towards 0.25 and
    therefore into the -1 bucket. Each score stays paired with its own
    output position.
    """
    # A missing tweet (NaN after reading the CSV) is tokenized as an empty string.
    if isinstance(tweet_text, str):
        text = tweet_text
    else:
        text = '' if pd.isna(tweet_text) else str(tweet_text)

    # Tokenize the tweet text and add the special tokens
    encoded_tweet = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_tweet['input_ids']
    attention_mask = encoded_tweet['attention_mask']

    # Inputs must live on the same device as the model (no-op on CPU).
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)
        attention_mask = attention_mask.to(first_param.device)

    # Inference mode: disables dropout so repeated calls give the same scores.
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # One sigmoid score per aspect, in label_names order
    scores = outputs[0].cpu().tolist()

    # Converting scores into sentiments, then appending the tweet text
    row_sentiments = [_score_to_sentiment(score) for score in scores]
    row_sentiments.append(tweet_text)

    # Appending row to result Dataframe
    df_result.loc[len(df_result.index)] = row_sentiments

    return 0

In [None]:
# Score the first 1000 tweets; every call appends one row to df_result.
# NOTE(review): no call to train() is visible in this notebook before this cell —
# confirm the model has been fitted before trusting these predictions.

_ = predict_df['text'].iloc[:1000].apply(predict_sentiment)


# for i in range(0, predict_df.shape[0], 1000):
#   start_index = df_result2.shape[0]
  # df_result2 = predict_df['text'][start_index:i].apply(predict_sentiment)

In [None]:
df_result.head()

Unnamed: 0,Country Character,Country Competence,People Character,People Competence,Tweet
0,-1,-1,-1,-1,amanpuri exchange highest referral fee industr...
1,-1,-1,-1,-1,coronavirus spreads must continue take necessa...
2,-1,-1,-1,-1,our sin accepted islam we re religious family ...
3,-1,-1,-1,-1,people want know like empty mall china coronav...
4,-1,-1,-1,-1,confirmed coronavirus cases xinjiang highlight...


In [None]:
df_predict.head()

Unnamed: 0,Full Text,Sentiment,Domain,Date,Page Type,text
0,AMANPURI EXCHANGE has the highest referral fee...,neutral,twitter.com,00:23.0,twitter,amanpuri exchange highest referral fee industr...
1,RT @SenRubioPress While the #coronavirus sprea...,neutral,twitter.com,00:22.0,twitter,coronavirus spreads must continue take necessa...
2,RT @cjwerleman “Our only sin was having accept...,neutral,twitter.com,00:14.0,twitter,our sin accepted islam we re religious family ...
3,RT @le_french_mulu For the people who want to ...,negative,twitter.com,00:14.0,twitter,people want know like empty mall china coronav...
4,RT @SenRubioPress Confirmed #coronavirus cases...,neutral,twitter.com,00:04.0,twitter,confirmed coronavirus cases xinjiang highlight...


In [None]:
# Copy the source metadata of the first 1000 tweets next to their predictions
# (column assignment from a Series aligns on the row index).
for meta_column in ['Domain', 'Date', 'Page Type', 'Sentiment']:
    df_result[meta_column] = df_predict[meta_column].iloc[:1000]


df_result.shape

(1000, 9)

In [None]:
df_result.head()

Unnamed: 0,Country Character,Country Competence,People Character,People Competence,Tweet,Domain,Date,Page Type,Sentiment
0,-1,-1,-1,-1,amanpuri exchange highest referral fee industr...,twitter.com,00:23.0,twitter,neutral
1,-1,-1,-1,-1,coronavirus spreads must continue take necessa...,twitter.com,00:22.0,twitter,neutral
2,-1,-1,-1,-1,our sin accepted islam we re religious family ...,twitter.com,00:14.0,twitter,neutral
3,-1,-1,-1,-1,people want know like empty mall china coronav...,twitter.com,00:14.0,twitter,negative
4,-1,-1,-1,-1,confirmed coronavirus cases xinjiang highlight...,twitter.com,00:04.0,twitter,neutral


In [None]:
# Write the scored tweets to Drive with a fixed column order and no index column
output_columns = ['Date', 'Domain', 'Page Type', 'Tweet', 'Country Character', 'Country Competence', 'People Character', 'People Competence', 'Sentiment']
df_result.to_csv("/content/drive/MyDrive/Colab Notebooks/final_csv.csv", columns=output_columns, index=False)
