# Word Embedding

## Get data

In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [8]:
import pandas as pd
import numpy as np
import csv
import warnings; warnings.filterwarnings("ignore")

np.random.seed(618)

file_path = "drive/MyDrive/Academics/MIT 6.7900/"

df = pd.read_excel(file_path + "elections_with_cleaned_tweets.xlsx")
df = df[["Case Number", "Votes for Labor Union1", "Votes Against", 
         "Tweets - Union", "Tweets - Labor Org", "Tweets - Case Name"]]
df.fillna(0, inplace = True)
df = df[df["Tweets - Case Name"] != 0]

df["Outcome"] = df["Votes for Labor Union1"] > df["Votes Against"]

df.head()

Unnamed: 0,Case Number,Votes for Labor Union1,Votes Against,Tweets - Union,Tweets - Labor Org,Tweets - Case Name,Outcome
0,01-RC-090869,74.0,45.0,[' Preprocess all the meshes into a union of c...,0,"['Volunteers, food needed for Thanksgiving - b...",True
7,28-RC-092433,88.0,56.0,"[""I'm at UNLV Student Union (Las Vegas, NV) 4s...",0,"[""Antes de salir a los Latin Grammy! Con todo ...",True
9,21-RC-092165,48.0,90.0,['Union © 🚆 🇺🇸 @ Union Station instagr.am/p/Sj...,0,['🏥 @ Chapman Medical Center instagr.am/p/RKN...,False
12,13-RC-091872,97.0,59.0,"[' "" ESPN SC NFL - Paul Tagliabue, lawyers for...",0,['chicago jobs Group Worker Aide - YMCA of Met...,True
15,03-RC-090207,37.0,60.0,"[""At End of the World Get together on using so...",0,"[""I'm at Major League Lacrosse (Boston, MA) 4s...",False


In [9]:
# train val test split
from sklearn.utils import shuffle

def trainValTestSplit(df, train_size = 0.8, val_size = 0.9):
    assert(val_size > train_size)
    l = len(df)
    df = shuffle(df)
    df_train, df_val, df_test = df.iloc[:int(l * train_size),:], \
                                df.iloc[int(l * train_size):int(l * val_size), :], \
                                df.iloc[int(l * val_size):, :]
    return df_train, df_val, df_test

df_train, df_val, df_test = trainValTestSplit(df)

## Embedding with BERT

In [4]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 13.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 61.6 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 66.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [5]:
from transformers import DistilBertTokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased")

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [10]:
import torch

def tokenizerFunction(data):
    '''
    data: list of str
    return: list of list of int
    '''

    tokenized_data = []

    for i, sent in enumerate(data):
        this_encoding = tokenizer.encode_plus(sent, truncation = True, pad_to_max_length = True,
                                              max_length = 64, return_attention_mask = True,
                                              return_tensors = 'pt')
        tokenized_data.append(this_encoding["input_ids"])
    
    tokenized_data = torch.cat(tokenized_data, dim = 0)

    return tokenized_data.numpy()

In [11]:
def whiteSpaceFix(sent: str) -> str:
    return " ".join(sent.split())

def parseString(tweets):
    '''
    return: list of lists (tweets)
    '''
    tweets = tweets.replace("\",", "\',")
    tweets_split = tweets.split("\',")
    tweets_parsed = [tweet[2:] for tweet in tweets_split]
    tweets_parsed[-1] = tweets_parsed[-1][:-2]
    tweets_parsed = [whiteSpaceFix(tweet) for tweet in tweets_parsed]
    return tweets_parsed

def tokenizeDataset(df):
    '''
    df: pandas DataFrame
    return: numpy array of all tokenized tweets, 
            numpy array of labels (P/F),
            numpy array of length
    '''
    df_tk = tokenizerFunction(parseString(df.iloc[0,5]))
    label = np.array([df.iloc[0,6]] * df_tk.shape[0])
    election = np.zeros(len(df))
    election[0] = len(label)
    
    for i in range(1, len(df)):
        if df.iloc[i,5] == 0:
            continue
        
        df_tk_local = tokenizerFunction(parseString(df.iloc[i,5]))
        df_tk = np.vstack((df_tk, df_tk_local))
        
        label_local = np.array([df.iloc[i,6]] * df_tk_local.shape[0])
        label = np.hstack((label, label_local))
        
        election[i] = len(label_local)

    return df_tk, label, election

df_train_tk, train_label, train_election = tokenizeDataset(df_train)
df_val_tk, val_label, val_election = tokenizeDataset(df_val)
df_test_tk, test_label, test_election = tokenizeDataset(df_test)

# Baseline: Gaussian Naive Bayes

In [12]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score

model_NB = GaussianNB()
model_NB.fit(df_train_tk, train_label)

GaussianNB()

In [13]:
def calcMetrics(y_pred, y_true):
    '''
    return: accuracy, f1 score
    '''
    acc = len(y_pred[y_pred == y_true]) / len(y_pred)
    f1 = f1_score(y_true, y_pred)

    return acc, f1

## Method 1: Evaluate each tweet then vote by simple majority

In [14]:
def evaluateEach(label_pred, label_true, election):
    '''
    label_pred: predicted array of labels (P/F)
    label_true: true array of labels (P/F)
    election: # of tweets per election
    return: accuracy in terms of election, f1 score
    '''
    election = election.astype(int)
    election_label_pred, election_label_true = [], []

    for i in range(len(election)):
        if election[i] == 0:
            continue

        start, end = 0, election[i]
        if i != 0:
            start, end = sum(election[:i]), sum(election[:i+1])
        
        local_label_pred = label_pred[start:end]
        local_label_true = label_true[start:end]

        election_label_true.append(local_label_true[0])

        local_election_label_pred = False
        if len(local_label_pred[local_label_pred]) > len(local_label_pred[~local_label_pred]):
            local_election_label_pred = True
        election_label_pred.append(local_election_label_pred)
    
    assert len(election_label_pred) == len(election_label_true)

    election_label_pred = np.array(election_label_pred)
    election_label_true = np.array(election_label_true)

    acc, f1 = calcMetrics(election_label_pred, election_label_true)

    return acc, f1

In [15]:
val_label_pred = model_NB.predict(df_val_tk)
evaluateEach(val_label_pred, val_label, val_election)

(0.42857142857142855, 0.5294117647058824)

## Method 2: Evalute by vector mean of each election

In [16]:
def evaluateMean(df_tk, label_true, election, model):
    '''
    df_tk: tokenized tweets
    label_true: true label
    election: position
    model: fitted model
    '''
    election = election.astype(int)
    election_label_pred, election_label_true = [], []

    for i in range(len(election)):
        if election[i] == 0:
            continue

        start, end = 0, election[i]
        if i != 0:
            start, end = sum(election[:i]), sum(election[:i+1])
        
        local_df_tk = df_tk[start:end, :]
        local_label_true = label_true[start:end]

        election_label_true.append(local_label_true[0])

        local_df_tk_mean = local_df_tk.mean(axis = 0)
        local_label_pred = model.predict(np.array([local_df_tk_mean]))
        election_label_pred.extend(local_label_pred)

    assert len(election_label_pred) == len(election_label_true)

    election_label_pred = np.array(election_label_pred)
    election_label_true = np.array(election_label_true)

    acc, f1 = calcMetrics(election_label_pred, election_label_true)

    return acc, f1

In [17]:
evaluateMean(df_val_tk, val_label, val_election, model_NB)

(0.32142857142857145, 0.3870967741935484)

## Compare: randomly generate outcome predictions

In [18]:
def evaluateRandom(label_true, election):
    election = election.astype(int)
    election_label_true = []

    for i in range(len(election)):
        if election[i] == 0:
            continue

        start, end = 0, election[i]
        if i != 0:
            start, end = sum(election[:i]), sum(election[:i+1])

        local_label_true = label_true[start:end]

        election_label_true.append(local_label_true[0])
    
    election_label_pred = np.random.randint(0, 2, len(election_label_true))

    return calcMetrics(election_label_pred, election_label_true)

In [20]:
acc, f1 = 0, 0
for i in range(300):
    local_acc, local_f1 = evaluateRandom(val_label, val_election)
    acc += local_acc
    f1 += local_f1
print(acc / 300)
print(f1 / 300)

0.5004761904761905
0.6062037585520911
