In [None]:
!pip install tensorflow
!pip install tokenizers
!pip install transformers

In [1]:
import collections
from collections import defaultdict
import csv
import functools
import itertools
import re
import string
import timeit
from timeit import default_timer
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModel, TFAutoModel

from utils import *
from topic_utils import *

### Data

In [2]:
users = UsersData('data/users')
tweets = TweetsData('data/tweets')

In [3]:
df = tweets.df.loc[:][:]
user_df = users.df.loc[:][:]

In [4]:
campaign = df[df['campaign'] == 'iran202012']
campaign_users = user_df[user_df['campaign'] == 'iran202012']

# Sentiment Analysis

Download the Cardiff NLP Twitter pre-trained roBERTa base model:

In [5]:
task = 'sentiment'

MODEL = "cardiffnlp/twitter-roberta-base-{}".format(task)
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [14]:
model.config

RobertaConfig {
  "_name_or_path": "cardiffnlp/twitter-roberta-base-sentiment",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [None]:
def get_sentiment(documents):
    
    sentiments = []

    for doc in documents:
        encoded_input = tokenizer(doc, return_tensors='tf')
        output = model(encoded_input)
        scores = output[0][0].numpy()
        scores = softmax(scores)

        sentiments.append(labels[np.argmax(scores)])

    return dict(zip(documents, sentiments))

In [50]:
def get_model(model, task):
    model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)

    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    tokenizer.add_tokens(['[HTAG]', '[URL]', '[AT]'])
    
    return model, tokenizer
    
def get_labels(task):
    mapping_link = ('https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{}/mapping.txt'.format(task))
    with urllib.request.urlopen(mapping_link) as f:
        html = f.read().decode('utf-8').split("\n")
        csvreader = csv.reader(html, delimiter='\t')
    return [row[1] for row in csvreader if len(row) > 1]
    
def predict_sentiment(document):
    # preprocess
    document = preprocess_string(document)
    
    # tokenize
    encoded_input = tokenizer(document, return_tensors='tf')
    
    # apply model
    output = model(encoded_input)
    scores = output[0][0].numpy()
    scores = softmax(scores)

    return labels[np.argmax(scores)]#, np.max(scores)

In [8]:
def get_sentiment_apply(document):
    """Usage: df.apply(get_sentiment_apply)"""
    encoded_input = tokenizer(document, return_tensors='tf')
    output = model(encoded_input)
    scores = output[0][0].numpy()
    scores = softmax(scores)

    return labels[np.argmax(scores)]#, np.max(scores)

In [7]:
task = 'sentiment'
MODEL = 'cardiffnlp/twitter-roberta-base-{}'.format(task)

model, tokenizer = get_model(MODEL, task)

labels = get_labels(task)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Apply only to English tweets (model has been trained on English):

In [61]:
documents = (campaign
 .groupby('tweet_language')
 .get_group('en')['tweet_text']
).head(100)

documents.iloc[0]

"Why there's a bird sound? And why it's upside down? https://t.co/FK8pOY2PqU"

In [62]:
# todo: make this much faster
t1 = default_timer()

sentiment = (documents
             .apply(preprocess_string)
             .apply(predict_sentiment)
            )
print('elapsed: {}'.format(default_timer() - t1))

elapsed: 11.718736473005265


In [63]:
df_abbrv = pd.DataFrame(documents)
df_abbrv['sentiment'] = sentiment
df_abbrv

Unnamed: 0_level_0,tweet_text,sentiment
tweetid,Unnamed: 1_level_1,Unnamed: 2_level_1
907991739713118208,Why there's a bird sound? And why it's upside ...,neutral
1277789135470768129,Malawi is using bamboo to fight #climatechange...,positive
1314271851988873251,😷Right in the middle of an illness😷 ..... #Bla...,negative
1298084197773520898,"Scotland’s Climate Assembly, doing politics di...",neutral
1291999805770694656,It's an urgent warning from climate scientists...,neutral
...,...,...
1305107719553257473,Donald Trump is the number one culprit in the ...,negative
1056696287427522561,RT @ClevelandClinic: The surprising health ben...,positive
1239511478904016897,#BernieWon 11th Democratic debate &amp; as a w...,negative
1150032184780234752,First set: *Williams 0-2 Halep. Let's Go Seren...,neutral


In [64]:
(df_abbrv
.groupby('sentiment')
.get_group('negative')
).head(10)

Unnamed: 0_level_0,tweet_text,sentiment
tweetid,Unnamed: 1_level_1,Unnamed: 2_level_1
1314271851988873251,😷Right in the middle of an illness😷 ..... #Bla...,negative
1281483637997940736,Economic inequality is out of control... This ...,negative
1302115414919401473,DR. Anthony Fauci: I congratulate Russians to ...,negative
1267652700537315331,I can't breath... #HumanRights #HumanRightsVi...,negative
1289552966752460802,"Alexander Vindman retires,cites 'bullying' by ...",negative
1269827106042720258,This is what oppression of a black live looks ...,negative
1222143276951601153,"I don't understand his language, but my heart ...",negative
1299400387078881281,can't name one because he ruined everything fo...,negative
1300830299786543104,The Role of Law Enforcement in Supporting Pede...,negative
1302039770076516354,A significant majority of Americans believe Pr...,negative


## Event-related sentiment
- get 

In [65]:
grouped = (campaign
 .reset_index()
 .set_index('tweet_time')
 .resample('D')[['tweetid','userid','tweet_text']]
)

top_day = (grouped
 .size()
 .sort_values(ascending=False)
).idxmax()

top_day_tweets = grouped.get_group(top_day)
top_day_tweets.head(5)

Unnamed: 0_level_0,tweetid,userid,tweet_text
tweet_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-08-17 00:02:00,1295149036681474048,213589457,RT @irXcL8Y80STibhjlS3a48BahFUg3yqPT+XtTxkpLL8...
2020-08-17 00:02:00,1295149057372033024,jWkurFac8vhkrkKlTqvhjp4rwEVXCuZ7uOhVc+bs1ZY=,@safewordthe2nd @Mahyargdrz عه نه! چرا!؟ پس کی...
2020-08-17 00:15:00,1295152130546900995,213589457,Una fuente saudí afirma que el príncipe herede...
2020-08-17 00:30:00,1295155883819667458,etDaWEjMPleueDrpkatPUSCApc6yU8W95+yZYWzVxSY=,Dirgahayu RI ke 75 Jangan sia-siain jasa pah...
2020-08-17 00:30:00,1295155884205789185,etDaWEjMPleueDrpkatPUSCApc6yU8W95+yZYWzVxSY=,"""Kita berutang kepada Palestina. Palestina yan..."
