In [None]:
!pip install tensorflow
!pip install tokenizers
!pip install transformers

In [1]:
import collections
from collections import defaultdict
import csv
import functools
import itertools
import re
import string
import timeit
from timeit import default_timer
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModel, TFAutoModel

from utils import *
from topic_utils import *

### Data

In [2]:
users = UsersData('data/users')
tweets = TweetsData('data/tweets')

In [3]:
df = tweets.df.loc[:][:]
user_df = users.df.loc[:][:]

In [4]:
campaign = df[df['campaign'] == 'iran202012']
campaign_users = user_df[user_df['campaign'] == 'iran202012']

# Sentiment Analysis

Download the Cardiff NLP Twitter pre-trained roBERTa base model:

In [5]:
task = 'sentiment'

MODEL = "cardiffnlp/twitter-roberta-base-{}".format(task)
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)

tokenizer = AutoTokenizer.from_pretrained(MODEL)
tokenizer.add_tokens(["[HTAG]", "[URL]", "[AT]"])

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


3

In [6]:
# label mapping
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"

with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')

labels = [row[1] for row in csvreader if len(row) > 1]
labels

['negative', 'neutral', 'positive']

Choose level of analysis: here we will test adding a sentiment label to individual tweets.

In [9]:
string_agg = lambda x: list(x)

user_tweets = (campaign
               .groupby(['userid'])[['tweet_text']]
               .agg(string_agg)
              )

In [10]:
documents = (user_tweets
             .loc[:]['tweet_text']
            )

In [11]:
test = documents[0][:10]
test

['America and the way its government treats its people must be reformed https://t.co/DupN3cE2dd',
 'https://t.co/ee5KZJMm5R',
 'Portland Police Bureau is preparing for a variety of mass gathering events being planned for Saturday, September 26, 2020. At this time, one group has been announced they will hold an event at Delta Park at noon.  #PoliceLivesMatter  #Polizeigewalt  #Portland  #PortlandProtests https://t.co/3JSoa0UcOL',
 'Why lies ..... why hypocrisy .... why security forces .... why racism ..... why cut budgets .... to deceive people    No Trump       No No   No #ARMY  #COVIDー19  #Biden #Trump https://t.co/v1WYXyidzJ',
 'Difference #Biden https://t.co/Vl5lB7q2l6',
 'He has destroyed every busines......🐂🐃🐃🐃🐃🐃 #TrumpMeltdown https://t.co/tKrPEdujCZ',
 'Even if the power to negotiate deals is handed over to the presidency, Congress will still have the final say, and judging by the current political climate, Johnson will not fire him this time #Trump  #Covid_19  #DemocracyDay htt

In [12]:
def get_sentiment(documents):
    
    sentiments = []

    for doc in documents:
        encoded_input = tokenizer(doc, return_tensors='tf')
        output = model(encoded_input)
        scores = output[0][0].numpy()
        scores = softmax(scores)

        sentiments.append(labels[np.argmax(scores)])

    return dict(zip(documents, sentiments))

In [13]:
with_sentiment = get_sentiment(test)
with_sentiment

{'America and the way its government treats its people must be reformed https://t.co/DupN3cE2dd': 'negative',
 'https://t.co/ee5KZJMm5R': 'neutral',
 'Portland Police Bureau is preparing for a variety of mass gathering events being planned for Saturday, September 26, 2020. At this time, one group has been announced they will hold an event at Delta Park at noon.  #PoliceLivesMatter  #Polizeigewalt  #Portland  #PortlandProtests https://t.co/3JSoa0UcOL': 'neutral',
 'Why lies ..... why hypocrisy .... why security forces .... why racism ..... why cut budgets .... to deceive people    No Trump       No No   No #ARMY  #COVIDー19  #Biden #Trump https://t.co/v1WYXyidzJ': 'negative',
 'Difference #Biden https://t.co/Vl5lB7q2l6': 'neutral',
 'He has destroyed every busines......🐂🐃🐃🐃🐃🐃 #TrumpMeltdown https://t.co/tKrPEdujCZ': 'negative',
 'Even if the power to negotiate deals is handed over to the presidency, Congress will still have the final say, and judging by the current political climate, Joh

Find percent positive/negative/neutral for all campaign tweets:

In [30]:
def get_sentiment_apply(document):
    """Usage: df.apply(get_sentiment_apply)"""
    encoded_input = tokenizer(document, return_tensors='tf')
    output = model(encoded_input)
    scores = output[0][0].numpy()
    scores = softmax(scores)

    return labels[np.argmax(scores)]#, np.max(scores)

In [37]:
t1 = default_timer()

sentiment = (campaign['tweet_text']
             .iloc[:1000]
             .apply(get_sentiment_apply)
            )
print('elapsed: {}'.format(default_timer() - t1))

elapsed: 146.13301213801606


In [53]:
df_abbrv = pd.DataFrame(campaign['tweet_text'].iloc[:1000])
df_abbrv['sentiment'] = sentiment

In [54]:
df_abbrv

Unnamed: 0_level_0,tweet_text,sentiment
tweetid,Unnamed: 1_level_1,Unnamed: 2_level_1
1271764746983952390,Tous ces gens qui s'en prennent à #Colbert alo...,neutral
907991739713118208,Why there's a bird sound? And why it's upside ...,neutral
1277789135470768129,Malawi is using bamboo to fight #climatechange...,positive
137282411095539712,terpikir terus bayanganmu..,neutral
1314271851988873251,😷Right in the middle of an illness😷 ..... #Bla...,neutral
...,...,...
1273570739535392768,"Over a thousand turn out to mourn Mr. Leung, t...",negative
1143385775842762752,#TheInvestigation is a magnificent 70-minute t...,positive
1178802943816032256,Pompeo took part in Trump-Zelensky call: repor...,negative
1142062694046019591,President Donald Trump downplayed Iran’s attac...,negative


## Cosine similarity

In [56]:
def get_embedding(text, model):
    text = preprocess_string(text)    
    encoded_input = tokenizer(text, return_tensors='pt')
    
    features = model(**encoded_input)
    features = features[0].detach().cpu().numpy() 
    features_mean = np.mean(features[0], axis=0) 
    
    return features_mean

In [65]:
query = documents.sample()[0][0]
data = documents.head(100)[0]
data

['America and the way its government treats its people must be reformed https://t.co/DupN3cE2dd',
 'https://t.co/ee5KZJMm5R',
 'Portland Police Bureau is preparing for a variety of mass gathering events being planned for Saturday, September 26, 2020. At this time, one group has been announced they will hold an event at Delta Park at noon.  #PoliceLivesMatter  #Polizeigewalt  #Portland  #PortlandProtests https://t.co/3JSoa0UcOL',
 'Why lies ..... why hypocrisy .... why security forces .... why racism ..... why cut budgets .... to deceive people    No Trump       No No   No #ARMY  #COVIDー19  #Biden #Trump https://t.co/v1WYXyidzJ',
 'Difference #Biden https://t.co/Vl5lB7q2l6',
 'He has destroyed every busines......🐂🐃🐃🐃🐃🐃 #TrumpMeltdown https://t.co/tKrPEdujCZ',
 'Even if the power to negotiate deals is handed over to the presidency, Congress will still have the final say, and judging by the current political climate, Johnson will not fire him this time #Trump  #Covid_19  #DemocracyDay htt

In [66]:
d = defaultdict(int)

for tweet in data:
    similarity = 1 - cosine(get_embedding(query, model), get_embedding(tweet, model))
    d[tweet] = similarity

print('Most similar to "{}"'.format(query))
for idx, x in enumerate(sorted(d.items(), key=lambda x: x[1], reverse=True)):
    print(idx + 1, x[0])
    if idx > 10:
        break

ValueError: Data of type <class 'torch.Tensor'> is not allowed only (<class 'tensorflow.python.framework.ops.Tensor'>, <class 'bool'>, <class 'int'>, <class 'transformers.file_utils.ModelOutput'>, <class 'tuple'>, <class 'list'>, <class 'dict'>, <class 'numpy.ndarray'>) is accepted for attention_mask.

## Other datasets and models to use

In [None]:
from transformers.datasets import load_dataset

# Arabic Jordanian General Tweets (AJGT) Corpus
dataset = load_dataset("ajgt_twitter_ar")
# tweet_eval
dataset = load_dataset("tweet_eval", "emoji")
# hate speech detection (primarily English)
dataset = load_dataset("tweets_hate_speech_detection")