In [None]:
# ! pip install transformers

In [1]:
import csv
import glob
import os
import urllib.request

import numpy as np
import pandas as pd
import torch
from IPython.display import clear_output
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Mask user mentions and links in ``text`` with fixed placeholders.

    The text is split on single spaces. A token that starts with ``@`` and
    is longer than one character becomes ``@user``; a token that starts with
    ``http`` becomes ``http``. Every other token is kept as-is, and the
    tokens are re-joined with single spaces.
    """
    def _mask(token):
        # A bare "@" is not treated as a mention.
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Download the label mapping. The file is parsed as tab-separated text and the
# second field of each row is taken as the class label.
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
# The response is fully read above, so parsing happens after it is closed.
csvreader = csv.reader(html, delimiter='\t')
# Rows with fewer than two fields (e.g. a trailing blank line) are skipped.
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# save_pretrained(MODEL) writes to a local directory whose path is identical to
# the hub id, so later from_pretrained(MODEL) calls resolve to that directory.
# The tokenizer must be saved there as well; otherwise the next fresh run
# cannot find the tokenizer files in the local directory.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)



In [4]:
def getSentimentScores(text, max_length=512):
    """Return the class probabilities the module-level model assigns to ``text``.

    The text is passed through ``preprocess``, tokenized with the module-level
    ``tokenizer``, and scored by the module-level ``model``. The first row of
    the model's first output is converted to probabilities with softmax.

    Args:
        text: The raw text to score (must be a ``str``; ``preprocess`` calls
            ``.split`` on it).
        max_length: Maximum number of tokens passed to the model; longer
            inputs are truncated instead of raising inside the model.
            512 is assumed to be the limit of the RoBERTa-base checkpoint
            loaded above -- adjust if a different model is used.

    Returns:
        dict mapping each entry of ``labels`` to its probability rounded to
        four decimals, inserted in order of decreasing probability.
    """
    text = preprocess(text)
    encoded_input = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: skip building the autograd graph for every tweet.
    with torch.no_grad():
        output = model(**encoded_input)
    scores = softmax(output[0][0].numpy())

    sentiments = {}
    # argsort is ascending, so reverse it to visit the most likely class first.
    for class_idx in np.argsort(scores)[::-1]:
        sentiments[labels[class_idx]] = np.round(float(scores[class_idx]), 4)
    return sentiments

    

In [5]:
# Glob pattern selecting the input CSV files to process.
path = "./bangalore/*.csv"

In [6]:
# Collect every path matching the glob pattern, in sorted order.
fileList = glob.glob(path)
fileList.sort()

In [25]:
# Resume point: files before this position in fileList are skipped.
# Set to 0 to process every file.
offset = 319
for jdx, file in enumerate(fileList[offset:]):
    df = pd.read_csv(file)
    sentiments = []
    for idx, tweet in enumerate(df["tweet"]):
        clear_output(wait=True)
        print(f"File #{offset+jdx+1}/{len(fileList)}; Tweet #{idx+1}/{df.shape[0]}")
        try:
            sentiments.append(getSentimentScores(tweet))
        except Exception:
            # Keep exactly one entry per tweet. Dropping failed tweets would
            # shift every later score onto the wrong row when joined by
            # position below; an empty dict yields NaN scores for this row.
            sentiments.append({})
    # read_csv gives df a default 0..n-1 index, matching the list positions.
    sent_df = df.join(pd.DataFrame(sentiments))
    out_path = file.replace('bangalore', 'bangalore-sentiment')
    out_dir = os.path.dirname(out_path)
    if out_dir:
        # to_csv does not create missing directories.
        os.makedirs(out_dir, exist_ok=True)
    sent_df.to_csv(out_path, index=False)

File #1339/1339; Tweet #99/99
