In [1]:
import tweepy
import pandas as pd
import numpy as np

In [2]:
with open('keys','r') as key_file:
    key = key_file.readline().strip()
    secret = key_file.readline().strip()
    bearer = key_file.readline().strip()

client = tweepy.Client(bearer)

In [3]:
# this a data source, given by any entity that uses the service
subjects = []
with open('subjects.txt','r') as subjects_file:
    subjects = [x.strip() for x in subjects_file.readlines()]

subjects

['nike', 'McDonalds']

In [4]:
# exemplify

query = f'@{subjects[0]}'

count = 10

search_result = client.search_recent_tweets(query=query,max_results=count)

In [27]:
df = pd.DataFrame(search_result.data,columns=['id','text'])
df['text'] = df['text'].astype(str)
df['subject']="nike"

### Sentiment Scoring

In [6]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
from scipy.special import softmax
import csv
import urllib.request

def preprocess(text):
    new_text = []
 
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [8]:
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}" # load once

tokenizer_sentiment = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# pre trained model
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

In [9]:
text = [preprocess(t) for t in list(df.text.values)]
encoded_input = tokenizer_sentiment(text, return_tensors='pt', truncation=True, padding=True)
output = model(**encoded_input)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [10]:
scores = output[0].detach().numpy()
scores = softmax(scores,axis=1)
ranking = np.argsort(scores)
ranking = ranking[:][::-1]

df['sentiment']=[labels[x] for x in np.argmax(scores,axis=1)]
df['negative']=scores[:,0]
df['neutral']=scores[:,1]
df['positive']=scores[:,2]


# for printing
# for t in range(scores.shape[0]):
#     print(f'text: {df.text.iloc[t]}')
#     for i in range(scores[0].shape[0]):
#         l = labels[ranking[t][i]]
#         s = scores[t][ranking[t][i]]
#         print(f"{i+1}) {l} {np.round(float(s), 4)}")

### Emotion Scoring

In [11]:
task='emotion'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}" # load once

tokenizer_emotion = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# pre trained model
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

In [12]:
text = [preprocess(t) for t in list(df.text.values)]
encoded_input = tokenizer_emotion(text, return_tensors='pt', padding=True)
output = model(**encoded_input)

In [13]:
scores = output[0].detach().numpy()
scores = softmax(scores,axis=1)
ranking = np.argsort(scores)
ranking = ranking[:][::-1]

df['emotion']=[labels[x] for x in np.argmax(scores,axis=1)]
df['anger']=scores[:,0]
df['joy']=scores[:,1]
df['optimism']=scores[:,2]
df['sadness']=scores[:,3]


# for printing
for t in range(3):
    print(f'text: {df.text.iloc[t]}')
    for i in range(scores[0].shape[0]):
        l = labels[ranking[t][i]]
        s = scores[t][ranking[t][i]]
        print(f"{i+1}) {l} {np.round(float(s), 4)}")

text: Sponsor me @Crocs @Nike https://t.co/SQpbd2x2MC
1) anger 0.0923
2) sadness 0.0504
3) optimism 0.3455
4) joy 0.5118
text: @PadrinoProxies @wytmerj @hoopszilla_ @ye4us @Nike @adidas https://t.co/LWZQlVZvVa
1) optimism 0.0578
2) sadness 0.0423
3) anger 0.0461
4) joy 0.8538
text: Bring this back for a LeBron commercial @Nike @KingJames https://t.co/QD8bCFpa2Z
1) sadness 0.0196
2) anger 0.0265
3) optimism 0.0577
4) joy 0.8962


##### 

In [29]:
df.text[0]

'Sponsor me @Crocs @Nike https://t.co/SQpbd2x2MC'

In [32]:
df.to_csv('tweets.csv',sep=';',index=False, quotechar='"')