# Tweet Sentiment Classification
Using a pretrained transformer model from Hugging Face: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment

In [None]:
! pip install transformers &> /dev/null

In [None]:
# Mount Google Drive at /content/drive so the project data folder can be read
# and written like a local directory (Colab-only; requires user authorization).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import time
import glob
import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import trange

import torch
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

import gc # garbage collector
import warnings
warnings.filterwarnings('ignore')

device: cuda


In [None]:
# Project root on the mounted Google Drive; every data path below is joined onto it.
# The path is relative, so it resolves against the kernel's current working directory.
# NOTE(review): presumably relies on the kernel starting in /content — confirm.
# Each collaborator has a different Drive layout; keep exactly one line active.
# cwd = "drive/MyDrive/Topic Modeling" # Ximin
cwd = "drive/Shareddrives/AV_Twitter_Project/Topic Modeling" # Ran

In [None]:
# Load the pretrained Twitter RoBERTa sentiment checkpoint and its tokenizer
# from the Hugging Face hub (downloaded on first use).
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# Move the weights to the selected device so inputs sent there can be scored.
model = model.to(device)

Downloading:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
def remove_tags(df):
    """Remove tweet-processor lib tags from a Series of strings.

    Every ``$`` character and every occurrence of the placeholder words
    ``URL``, ``HASHTAG``, ``MENTION``, ``RESERVED``, ``EMOJI``, ``SMILEY`` and
    ``NUMBER`` is deleted, so a tag written as ``$URL$`` disappears entirely.
    Matching is plain substring matching: the same words are also removed
    when they appear inside ordinary text.

    Parameters
    ----------
    df : pandas.Series
        Series of strings (despite the name, the ``.str`` accessor is used,
        so this is a Series rather than a DataFrame).

    Returns
    -------
    pandas.Series
        A new Series with the tags removed; the input is left unmodified.
    """
    # "$" is a regex metacharacter (end-of-string anchor) and the default of
    # `regex` in `Series.str.replace` differs between pandas versions, so
    # literal matching is requested explicitly for every pattern.
    tags = ("$", "URL", "HASHTAG", "MENTION", "RESERVED", "EMOJI", "SMILEY", "NUMBER")
    for tag in tags:
        df = df.str.replace(tag, "", regex=False)
    return df

def inference(df, batch_size=32, debug=False, max_length=512):
    """Score every tweet in ``df`` with the sentiment model.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``tweet_id`` and ``tokens``. ``tokens`` is
        cast to ``str`` and cleaned with ``remove_tags`` before tokenization.
    batch_size : int, default 32
        Number of tweets sent through the model per forward pass.
    debug : bool, default False
        When True the ``tokens`` column is kept in the result for inspection.
    max_length : int, default 512
        Inputs longer than this many tokens are truncated so they cannot
        exceed the model's position limit.

    Returns
    -------
    pandas.DataFrame
        Columns ``tweet_id`` (plus ``tokens`` when ``debug``), ``negative``,
        ``neutral`` and ``positive``; one row per input row, index reset.
        The three sentiment columns are softmax probabilities.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    label_cols = ["negative", "neutral", "positive"]

    # Reset the index so the positional probabilities line up with the rows
    # when the two frames are concatenated column-wise below.
    df = df.reset_index(drop=True)
    text = remove_tags(df["tokens"].astype(str)).values.tolist()

    probs = []
    # Inference only: disabling autograd avoids building a graph per batch,
    # which saves memory and time.
    with torch.no_grad():
        for start in range(0, len(text), batch_size):
            data_batch = text[start:start + batch_size]
            encoded_input = tokenizer(
                data_batch,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            )
            encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

            output = model(**encoded_input)
            # output[0] holds the logits; softmax over the last axis turns
            # them into per-class probabilities.
            probs_batch = torch.softmax(output[0], dim=-1).cpu().numpy()
            probs.append(probs_batch)

    if probs:
        probs = np.concatenate(probs, axis=0)
    else:
        # Empty input: np.concatenate([]) would raise, so build a 0-row array.
        probs = np.empty((0, len(label_cols)), dtype=np.float32)

    df_sentiment = pd.DataFrame(probs, columns=label_cols)
    df = pd.concat([df, df_sentiment], axis=1)

    id_cols = ["tweet_id", "tokens"] if debug else ["tweet_id"]
    return df[id_cols + label_cols]

In [None]:
# batch inference: score every raw CSV and write one sentiment CSV per input file
debug = False  # True: only the first 100 rows of the first file are processed
batch_size = 32
# glob returns files in arbitrary order; sort so runs (and the log) are reproducible
data_paths = sorted(glob.glob(os.path.join(cwd, "data/opinion_modeling/raw_csv/", "*.csv")))
save_path = os.path.join(cwd, "data/opinion_modeling/sentiment/")
# makedirs also creates missing parent folders and is a no-op when the folder exists
os.makedirs(save_path, exist_ok=True)

start_time = time.time()
for i, data_path in enumerate(data_paths):
    filename = os.path.basename(data_path)
    # lineterminator="\n" is passed explicitly to the CSV parser
    # NOTE(review): presumably needed because some tweets contain bare "\r" — confirm
    df = pd.read_csv(data_path, lineterminator="\n")
    if debug:
        df = df.iloc[:100]

    print(f"{i}, {filename}, size: {df.shape}, time: {time.time() - start_time:.2f}")

    df = inference(df, batch_size, debug)
    # The output keeps the input's file name so results can be matched back to it
    df.to_csv(os.path.join(save_path, filename), index=False)

    if debug:
        break

print(f"done, time: {time.time() - start_time:.2f}")

0, confused_tesla_incident_in_february.csv, size: (67689, 15)
1, mcity_driverless_shuttle_launches_on_um_north_campus.csv, size: (85512, 15)
2, las_vegas_autonomous_shuttle_crash.csv, size: (44651, 15)
3, introduction_of_tesla_autopilot.csv, size: (32521, 15)
4, world_first_production_car_to_offer_level_3_automation.csv, size: (43103, 15)
5, google_lexus_suv.csv, size: (51748, 15)
6, tesla_model_x_fatality.csv, size: (129713, 15)
7, march_1st_2019_highway_crash_fatality.csv, size: (93357, 15)
8, tesla_model_s_fatal_crash_in_china.csv, size: (36941, 15)
9, uber_self_driving_volvo_crash.csv, size: (117840, 15)
10, university_of_michigan_mcity.csv, size: (42768, 15)
11, first_automated_vehicle_crash_fatality_takes_place.csv, size: (92817, 15)
done, time: 5102.16
