# Train a sentiment model

## Get data

In [40]:
from google.colab import drive
# Mount Google Drive at /content/drive so the data files stored there can be read.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
import pandas as pd
import numpy as np
import csv
import warnings; warnings.filterwarnings("ignore")

# Seed NumPy's global RNG; trainTestSplit shuffles with np.random.shuffle.
np.random.seed(618)

# Directory prefix that is concatenated with file names when loading data.
# NOTE(review): this is a relative path while Drive is mounted at "/content/drive";
# it presumably relies on the working directory being "/content" — confirm.
file_path = "drive/MyDrive/Academics/MIT 6.7900/"

def whiteSpaceFix(sent: str) -> str:
    '''
    Collapse every run of whitespace in `sent` into a single space and
    strip leading / trailing whitespace.
    '''
    words = sent.split()
    return " ".join(words)

def getData(file_path: str, size: int):
    '''
    Read up to `size` negative and `size` positive tweets from a labelled CSV.

    The first column of each row is the raw sentiment label ("0" = negative,
    "4" = positive; "2" and any other value are skipped) and the last column
    is the tweet text, which is whitespace-normalised with whiteSpaceFix.

    file_path: path of the CSV file (read as latin-1)
    size: maximum number of tweets to keep per class
    return: list of (text, label) tuples in file order, label 0 = negative,
            1 = positive. If the file runs out before both classes reach
            `size`, whatever was collected is returned (possibly an empty list).
    '''
    label_map = {"0": 0, "4": 1}
    # how many more rows are still wanted for each raw label
    remaining = {"0": size, "4": size}
    tweets = []

    # newline='' lets the csv module handle line endings inside quoted fields
    with open(file_path, 'r', encoding='latin-1', newline='') as f:
        for row in csv.reader(f):
            if remaining["0"] <= 0 and remaining["4"] <= 0:
                break

            # csv.reader yields an empty list for a blank line
            if not row:
                continue

            raw_label = row[0]
            if remaining.get(raw_label, 0) <= 0:
                # neutral / unknown label, or this class is already full
                continue

            tweets.append((whiteSpaceFix(row[-1]), label_map[raw_label]))
            remaining[raw_label] -= 1

    # always hand back a list, even when the file ended before the quotas were met
    return tweets

def trainTestSplit(data, train_size = 0.8, val_size = 0.9, shuffle = True):
    '''
    Split labelled sentences into train / validation / test parts.

    data: a list of (sentence, label) tuples; it is not modified
    train_size: proportion of data in training set
    val_size: cumulative proportion at which the validation part ends, i.e.
              validation is data[train_size:val_size] and test is the rest
    shuffle: if True, shuffle (a copy of) the data with NumPy's global RNG first
    return: train_sent, train_label, val_sent, val_label, test_sent, test_label
            (each a tuple; six empty tuples when `data` is empty)
    '''
    assert(val_size > train_size)

    # shuffle a copy so the caller's list keeps its original order
    data = list(data)
    if shuffle:
        np.random.shuffle(data)

    if not data:
        # zip(*[]) cannot be unpacked into two names, so handle this explicitly
        return (), (), (), (), (), ()

    train_end = int(len(data) * train_size)
    val_end = int(len(data) * val_size)

    sent, label = zip(*data)

    return (sent[:train_end], label[:train_end],
            sent[train_end:val_end], label[train_end:val_end],
            sent[val_end:], label[val_end:])

# load a class-balanced sample of labelled tweets (20000 per class)
tweets = getData(file_path + "twitter_labelled.csv", 20000)

# split into train / validation / test parts
(train_sent, train_label,
 val_sent, val_label,
 test_sent, test_label) = trainTestSplit(tweets)

# show the first ten training sentences, each followed by its label
for sent, label in zip(train_sent[:10], train_label[:10]):
    print(f"{sent}\n{label}")

On my way 2 work. Raining here in k.c guess my k.c twiggs know that. Lol 8 days till I'm 30 wtf!
0
Almost done with Rant. Beautiful book. You guys should read it
1
Goodnight, can't wait for school in the morning!
1
@Mileycyrus i love jesus too and im getting ready for school (&lt;3)
1
@ben_mayer at least you get tan lines. i usually just burn
0
@Carm823 sure if you wanna drop them off at georgia sure
1
I'm getting ready to start another day of conference calls... woo hooo
1
@mnowluck idiot I am not @kima. I am @Mizohican Im in Mumbai only, where the heck are you?
1
@tasaljayyousi i have a test the second we get back btw, did you get tweetdeck?
0
@phillyan man ur picture is so small on my phone... I can't tell fropm that...give me a hint
1


## Word embedding

In [42]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 7.1 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 68.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 50.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.24.0


In [43]:
from transformers import DistilBertTokenizer
# Tokenizer loaded from the "distilbert-base-cased" checkpoint; tokenizer_function uses it as a global.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased")

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [44]:
import torch

def tokenizer_function(data, max_length = 64):
    '''
    Encode sentences as fixed-length rows of token ids with the global `tokenizer`.

    data: sequence (list / tuple) of str
    max_length: every sentence is truncated or padded to exactly this many ids
    return: numpy integer array of shape (len(data), max_length).
            Only the input ids are returned; the attention mask is not.
    '''
    sentences = list(data)

    if not sentences:
        # nothing to encode: return an empty array with the usual row width
        return np.empty((0, max_length), dtype = np.int64)

    # one batched call; padding="max_length" replaces the deprecated pad_to_max_length=True
    encoding = tokenizer(sentences, truncation = True, padding = "max_length",
                         max_length = max_length, return_tensors = 'pt')

    return encoding["input_ids"].numpy()

# encode the train, validation and test sentences as token-id arrays
train_sent_tk, val_sent_tk, test_sent_tk = (
    tokenizer_function(split) for split in (train_sent, val_sent, test_sent)
)

## Train a simple logistic regression

In [45]:
from sklearn.linear_model import LogisticRegression

def logisticRegression(train_X, train_y, test_X, test_y):
    '''
    Fit an L1-penalised logistic regression (saga solver) on the training
    data and score it on both splits.

    return: model, train score, test score
    '''
    # solver and penalty do not make much difference on results!
    classifier = LogisticRegression(penalty = "l1", solver = "saga")
    classifier.fit(train_X, train_y)

    train_accuracy = classifier.score(train_X, train_y)
    test_accuracy = classifier.score(test_X, test_y)

    return classifier, train_accuracy, test_accuracy

# fit the baseline on the raw token ids and score it on the test split
model, train_score, test_score = logisticRegression(
    train_X = train_sent_tk,
    train_y = train_label,
    test_X = test_sent_tk,
    test_y = test_label,
)

# only slightly better than random guessing!
print(f"train score is {round(train_score, 2)}, test score is {round(test_score, 2)}")

train score is 0.57, test score is 0.56


## Leverage DistilBertForSequenceClassification

In [46]:
from transformers import DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
# run on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [47]:
# prepare data
def prepareData(sent, label):
    '''
    Pair token-id rows with their labels in a TensorDataset.

    sent: array-like of token-id rows
    label: array-like of labels, one per row of `sent`
    return: TensorDataset yielding (sent_tensor, label_tensor) items
    '''
    sent_tensor = torch.tensor(sent)
    label_tensor = torch.tensor(label)
    return TensorDataset(sent_tensor, label_tensor)

# wrap each split as a dataset of (token ids, label) pairs
train_data = prepareData(train_sent_tk, train_label)
val_data = prepareData(val_sent_tk, val_label)
test_data = prepareData(test_sent_tk, test_label)

# training batches are reshuffled on every pass and an incomplete final batch is dropped
train_dataloader = DataLoader(train_data, batch_size = 16, shuffle = True, drop_last = True)

# evaluation must visit every example exactly once and in dataset order: dropping the
# last batch would leave examples unscored while the loops divide by the full dataset
# size, and shuffling would misalign the returned predictions with the input sentences
val_dataloader = DataLoader(val_data, batch_size = 16, shuffle = False, drop_last = False)
test_dataloader = DataLoader(test_data, batch_size = 16, shuffle = False, drop_last = False)

In [121]:
def train_loop(dataloader, model, optimizer, print_size = 10):
    '''
    Run one pass over `dataloader`, updating `model` after every batch.

    dataloader: yields (sent, label) batches
    model: called as model(input_ids=..., labels=...); its output must expose `.loss`.
           Only input ids and labels are passed (no attention mask).
    optimizer: optimizer stepping the model's parameters
    print_size: print the batch loss every `print_size` batches;
                0 or None disables printing
    return: list with the loss of every batch, in order
    '''
    train_loss = []
    model.train()

    for i, (sent, label) in enumerate(dataloader):
        sent = sent.to(device)
        label = label.to(device)

        # clear the gradients of every model parameter once per step; these are
        # exactly the parameters clip_grad_norm_ looks at below
        model.zero_grad()

        pred = model(input_ids = sent, labels = label)
        loss = pred.loss
        train_loss.append(loss.item())

        loss.backward()
        # cap the global gradient norm at 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # guard against print_size == 0, which would otherwise divide by zero
        if print_size and (i % print_size == 0):
            print(f"loss = {round(loss.item(), 3)}")

    return train_loss

def val_loop(dataloader, model):
    '''
    Evaluate `model` on every batch of `dataloader` without updating it.

    dataloader: yields (sent, label) batches
    model: called as model(sent, labels=label); its output must expose
           `.loss` and `.logits`
    return: (mean of the per-batch losses, accuracy over the examples that were
            actually evaluated); (nan, nan) if the dataloader yields no batch
    '''
    total_loss, n_correct, n_seen, n_batches = 0.0, 0, 0, 0

    model.eval() # put model to evaluation mode

    with torch.no_grad():
        for sent, label in dataloader:
            sent = sent.to(device)
            label = label.to(device)

            pred = model(sent, labels = label)

            total_loss += pred.loss.item()
            n_correct += (pred.logits.argmax(dim = 1) == label).sum().item()
            # count what was really scored: a loader built with drop_last=True
            # yields fewer examples than len(dataloader.dataset)
            n_seen += label.size(0)
            n_batches += 1

    if n_batches == 0:
        return float("nan"), float("nan")

    return total_loss / n_batches, n_correct / n_seen

In [122]:
from torch import optim
from transformers import DistilBertForSequenceClassification, AdamW
from tqdm import tqdm
# Seed torch's global RNG for this run.
torch.manual_seed(618)

# Fine-tuning hyperparameters: learning rate, number of passes over the training
# data, and the eps value handed to the optimizer.
lr = 2e-5
epochs = 3
eps = 1e-8

# Load the "distilbert-base-cased" checkpoint with a 2-label classification head.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased', 
                                                            num_labels=2, output_attentions = False, 
                                                            output_hidden_states=False)
model.to(device)
# AdamW is the class imported from transformers in this cell.
# NOTE(review): `optim` is imported from torch above but is not used in the visible code.
optimizer = AdamW(model.parameters(), lr = lr, eps = eps)

# train_loss_arr collects every per-batch training loss across all epochs;
# val_loss_arr / val_acc_arr receive one value per epoch.
train_loss_arr, val_loss_arr, val_acc_arr = [], [], []
for epoch in tqdm(range(epochs)):
    print(f"training epoch {epoch+1}")
    # the last argument (100) is print_size: a loss line is printed every 100 batches
    train_loss = train_loop(train_dataloader, model, optimizer, 100)
    print(f"running validation {epoch+1}")
    val_loss, val_acc = val_loop(val_dataloader, model)
    val_loss_arr.append(val_loss)
    val_acc_arr.append(val_acc)
    print(f"validation loss = {round(val_loss, 3)}")
    train_loss_arr.extend(train_loss)
# NOTE(review): in the saved output the validation loss rises every epoch
# (0.412, 0.425, 0.506) and the model left in `model` is the one from the last
# epoch — consider keeping the epoch with the best validation loss.
print("Done!")

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.b

training epoch 1
loss = 0.685
loss = 0.691
loss = 0.434
loss = 0.654
loss = 0.614
loss = 0.423
loss = 0.318
loss = 0.454
loss = 0.377
loss = 0.523
loss = 0.533
loss = 0.515
loss = 0.448
loss = 0.351
loss = 0.401
loss = 0.257
loss = 0.53
loss = 0.506
loss = 0.526
loss = 0.43
running validation 1


 33%|███▎      | 1/3 [03:33<07:06, 213.01s/it]

validation loss = 0.412
training epoch 2
loss = 0.156
loss = 0.321
loss = 0.244
loss = 0.234
loss = 0.226
loss = 0.31
loss = 0.408
loss = 0.148
loss = 0.486
loss = 0.12
loss = 0.359
loss = 0.688
loss = 0.253
loss = 0.261
loss = 0.179
loss = 0.361
loss = 0.29
loss = 0.488
loss = 0.266
loss = 0.348
running validation 2


 67%|██████▋   | 2/3 [07:18<03:40, 220.40s/it]

validation loss = 0.425
training epoch 3
loss = 0.203
loss = 0.012
loss = 0.106
loss = 0.111
loss = 0.038
loss = 0.416
loss = 0.015
loss = 0.355
loss = 0.232
loss = 0.097
loss = 0.189
loss = 0.031
loss = 0.326
loss = 0.204
loss = 0.141
loss = 0.163
loss = 0.409
loss = 0.419
loss = 0.036
loss = 0.203
running validation 3


100%|██████████| 3/3 [11:06<00:00, 222.14s/it]

validation loss = 0.506
Done!





In [123]:
# test

def test_loop(dataloader, model):
    '''
    Evaluate `model` on every batch of `dataloader` and collect its raw outputs.

    dataloader: yields (sent, label) batches
    model: called as model(sent, labels=label); its output must expose
           `.loss` and `.logits`
    return: (mean of the per-batch losses, accuracy over the examples that were
            actually evaluated, list with one logits array per evaluated example
            in the order the dataloader produced them);
            (nan, nan, []) if the dataloader yields no batch
    '''
    total_loss, n_correct, n_seen, n_batches = 0.0, 0, 0, 0
    total_pred = []

    model.eval() # put model to evaluation mode

    with torch.no_grad():
        for sent, label in dataloader:
            sent = sent.to(device)
            label = label.to(device)

            pred = model(sent, labels = label)
            logits = pred.logits

            total_loss += pred.loss.item()
            n_correct += (logits.argmax(dim = 1) == label).sum().item()
            # count what was really scored: a loader built with drop_last=True
            # yields fewer examples than len(dataloader.dataset)
            n_seen += label.size(0)
            n_batches += 1
            total_pred.extend(logits.detach().cpu().numpy())

    if n_batches == 0:
        return float("nan"), float("nan"), total_pred

    return total_loss / n_batches, n_correct / n_seen, total_pred

# Score the fine-tuned model on the held-out test split; test_pred holds the
# per-example logits returned by test_loop.
test_loss, test_acc, test_pred = test_loop(test_dataloader, model)

print(f"test accuracy is {round(test_acc, 3)}")

test accuracy is 0.81


# Apply the model to our data

## Load data

In [51]:
# keep the case id, the two vote counts and the three raw tweet columns;
# missing cells are replaced by 0
df = pd.read_excel(file_path + "elections_with_tweets.xlsx")[
    ["Case Number", "Votes for Labor Union1", "Votes Against",
     "Tweets - Union", "Tweets - Labor Org", "Tweets - Case Name"]
].fillna(0)

# append the three score columns, initialised to NaN
df = df.assign(Score_Union = np.nan, Score_LabOrg = np.nan, Score_Name = np.nan)

df.head()

Unnamed: 0,Case Number,Votes for Labor Union1,Votes Against,Tweets - Union,Tweets - Labor Org,Tweets - Case Name,Score_Union,Score_LabOrg,Score_Name
0,01-RC-090869,74.0,45.0,['@notch Preprocess all the meshes into a unio...,0,"['Volunteers, food needed for Thanksgiving - b...",,,
1,10-RC-090329,106.0,83.0,"[""I'm at eCO Credit Union (Alabaster, AL) 4sq....",0,0,,,
2,18-RC-090794,36.0,78.0,0,0,0,,,
3,29-RD-091658,114.0,107.0,"[""I'm at Grand Union Hotel (New York, NY) 4sq....",0,0,,,
4,09-RC-090819,28.0,62.0,['@t_marsh83 @frontmantrue @fredthompson agree...,0,0,,,


## Calculate scores

\begin{equation}
    \text{Score} = \begin{cases}\frac{\#\text{ positive tweets}}{\#\text{ total tweets}}, &\text{if } \#\text{ total tweets} > 0 \\
    \text{NaN}, &\text{otherwise}\end{cases}
\end{equation}

In [52]:
def parseString(tweets):
    '''
    Turn the text form of a Python list of tweets into a list of str.

    tweets: str holding a list literal of quoted strings
    return: list of str, one per tweet

    The text is parsed as a Python literal first, so commas, quotes and escape
    sequences inside a tweet are handled correctly. If that fails (for example
    because the text is malformed), the previous quote-and-comma splitting
    heuristic is used instead.
    '''
    import ast

    try:
        parsed = ast.literal_eval(tweets)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return [str(tweet) for tweet in parsed]

    # fallback heuristic: split on a closing quote followed by a comma, then drop
    # the two leading characters of every piece and the two trailing characters
    # of the last piece
    tweets = tweets.replace("\",", "\',")
    tweets_split = tweets.split("\',")
    tweets_parsed = [tweet[2:] for tweet in tweets_split]
    tweets_parsed[-1] = tweets_parsed[-1][:-2]
    return tweets_parsed

def calcScore(data, model, batch_size = 16):
    '''
    Share of tweets that `model` assigns to label 1 (positive).

    data: the text form of a list of tweets (parsed with parseString), an actual
          list / tuple of str, or a placeholder such as 0, NaN or None meaning
          "no tweets"
    model: classifier called as model(token_ids); its output must expose `.logits`
           (two output labels are assumed: the predicted label indices are summed)
    batch_size: number of tweets sent through the model at once
    return: float between 0 and 1, or NaN when there is no tweet to score
    '''
    if isinstance(data, str):
        tweets = parseString(data)
    elif isinstance(data, (list, tuple)):
        tweets = list(data)
    else:
        # 0 / NaN / None placeholders: nothing to score
        return np.nan

    tweets = [whiteSpaceFix(tweet) for tweet in tweets]
    # an empty string is not a tweet and must not count towards the denominator
    tweets = [tweet for tweet in tweets if tweet]

    if not tweets:
        return np.nan

    data_tk = tokenizer_function(tweets)

    data_loader = DataLoader(torch.tensor(data_tk), batch_size = batch_size, shuffle = False)

    model.eval()

    n_positive = 0

    with torch.no_grad():
        for sent in data_loader:
            logits = model(sent.to(device)).logits
            # predicted label is 0 or 1, so the sum counts the positive predictions
            n_positive += logits.argmax(dim = 1).sum().item()

    return n_positive / len(tweets)

# sanity check on a single cell: first row, column position 3 of df
calcScore(df.iloc[0, 3], model)

0.4

In [53]:
# (tweet column, score column) pairs. Positions are looked up by name so the loop
# keeps working if the column selection or order of df changes, and a missing
# column fails immediately with a KeyError.
score_targets = [("Tweets - Union", "Score_Union"),
                 ("Tweets - Labor Org", "Score_LabOrg"),
                 ("Tweets - Case Name", "Score_Name")]
score_positions = [(df.columns.get_loc(tweet_col), df.columns.get_loc(score_col))
                   for tweet_col, score_col in score_targets]

for i in range(len(df)):
    # lightweight progress indicator: one dot per 100 rows
    if (i % 100 == 0):
        print(".", end = "")
    for tweet_pos, score_pos in score_positions:
        df.iloc[i, score_pos] = calcScore(df.iloc[i, tweet_pos], model)

..................

In [120]:
# True where the union received strictly more votes for than against
vote_for = df["Votes for Labor Union1"] > df["Votes Against"]
# row-wise mean of the three sentiment scores
score_for = df[["Score_Union", "Score_LabOrg", "Score_Name"]].mean(axis = 1)
# score_for = df["Score_Name"]

# drop the cases without any score, then predict a win when the mean score exceeds 0.5
compare = (
    pd.DataFrame({"vote_for": list(vote_for), "score_for": score_for})
    .dropna()
    .assign(pred = lambda frame: frame["score_for"] > 0.5)
)

# share of cases where the prediction matches the actual vote outcome
sum(compare["pred"] == compare["vote_for"]) / len(compare)

0.5596491228070175