In [1]:
import tweepy
import pandas as pd
import numpy as np
import csv
import urllib.request

In [2]:
# Fetch the label names for the selected TweetEval task.

task = 'sentiment'

labels = []
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as response:
    html = response.read().decode('utf-8').split("\n")

# The mapping is parsed as tab-separated rows; the label is the second field.
# Rows with fewer than two fields are skipped.
csvreader = csv.reader(html, delimiter='\t')
labels = [fields[1] for fields in csvreader if len(fields) > 1]

In [3]:
# The 'keys' file holds one value per line, read in this order: key, secret, bearer.
with open('keys', 'r') as key_file:
    key, secret, bearer = (key_file.readline().strip() for _ in range(3))

# Only the bearer token is handed to the client.
client = tweepy.Client(bearer_token=bearer)

In [4]:
# This is a data source, given by any entity that uses the service:
# one subject per line of 'subjects.txt', surrounding whitespace removed.
subjects = []
with open('subjects.txt', 'r') as subjects_file:
    subjects = [line.strip() for line in subjects_file]

subjects

['nike', 'adidas', 'mcdonalds', 'starbucks', 'samsung', 'apple']

### Use Case: Starbucks

In [14]:
def preprocess(text):
    """Clean a tweet: remove mentions and links, and spell out ``&amp;``.

    Newlines become spaces and the text is split on single spaces. Tokens
    starting with ``@`` (longer than one character) and tokens starting with
    ``http`` are masked with placeholders, then every occurrence of the
    placeholder strings is deleted from the re-joined text. Finally ``&amp;``
    is replaced by `` and ``. The text is stripped after each replacement;
    inner runs of spaces left behind by removed tokens are kept.
    """
    def _mask(token):
        # Mentions first: '@user' never starts with 'http', so order is safe.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    flattened = text.replace('\n', ' ')
    cleaned = " ".join(_mask(token) for token in flattened.split(" "))

    # Applied in this exact order, stripping after every step.
    substitutions = (
        ("@user", ""),
        ("http", ""),
        ("&amp;", " and "),  # caveat
    )
    for needle, replacement in substitutions:
        cleaned = cleaned.replace(needle, replacement).strip()

    return cleaned

def dummy_sentiment(data, labels):
    """Attach random placeholder sentiment columns to ``data``.

    One 3-way probability vector is drawn per row from a flat Dirichlet
    distribution (global NumPy RNG) and the rows are shuffled. ``data`` is
    modified in place and also returned, gaining the columns ``sentiment``
    (the entry of ``labels`` at the highest-scoring position), ``negative``,
    ``neutral`` and ``positive``.
    """
    n_rows = data.shape[0]
    scores = np.random.dirichlet(np.ones(3), size=n_rows)
    np.random.shuffle(scores)

    winners = np.argmax(scores, axis=1)
    data['sentiment'] = [labels[idx] for idx in winners]
    for position, column in enumerate(('negative', 'neutral', 'positive')):
        data[column] = scores[:, position]

    return data

def fetch_tt_data(brand,query='', size='small', days=0, save=False, dummy=False, labels=None):
    '''
    Fetch recent twitter data and return it as a DataFrame
    ---
    brand: str
        Value stored in the ``brand`` column; also used in the CSV file name when saving.
    query: str
        Search query forwarded to ``client.search_recent_tweets``.
    size: str
        One of 'small', 'medium', 'large': upper bound on the number of tweets
        fetched (10000, 50000 and 100000 respectively).
    days: int
        Minimum tweet age. E.g., if days=2, will fetch tweets that are at least 2 days old, and up to a maximum of 7
    save: bool
        If true, write the result, without the raw ``text`` column, to
        ``datasets/tweets_{brand}_{size}.csv``.
    dummy: bool
        If true, add random placeholder sentiment columns via ``dummy_sentiment``.
    labels: list of str
        Label names used by ``dummy_sentiment``; needs at least 3 entries when dummy=True.

    Returns the DataFrame (raw ``text`` included). It is empty, with no
    columns, when the search returned no tweets.
    '''
    from pathlib import Path  # local import: only needed for the save step

    sizes = {
        'small': 10000,
        'medium': 50000,
        'large': 100000
    }

    labels = [] if labels is None else labels
    if dummy and len(labels) < 3:
        # Fail before spending API quota: dummy_sentiment indexes labels[0..2].
        raise ValueError('dummy=True requires at least 3 labels')

    #query = '(@starbucks OR starbucks)'

    # Timezone-aware UTC on purpose: tweepy serialises a naive datetime as if it
    # were already UTC, so a naive local timestamp would shift the search window.
    # The 10 second offset is kept from the original code, which marks it as mandatory.
    end_time = pd.Timestamp.now(tz='UTC') - pd.to_timedelta(10, unit='s')
    if days > 0:
        end_time -= pd.to_timedelta(days, unit='days')

    count = 100

    tweets = tweepy.Paginator(
        client.search_recent_tweets,
        query=query,
        max_results=count,
        end_time=end_time,
        tweet_fields=['created_at','public_metrics']
    ).flatten(limit=sizes[size])

    df = pd.DataFrame(data=list(tweets))

    if df.empty:
        # Nothing matched: there is no 'text' column to process.
        return df

    if 'withheld' in df: #  remove withheld content
        # .copy() so the column assignments below do not write into a slice of the original frame
        df = df[df['withheld'].isnull()].copy()

    df['text'] = df['text'].astype(str)
    df['text_tokenized'] = [preprocess(t) for t in df['text'].values]
    df['brand'] = brand

    # Expand the per-tweet metrics mapping once, keeping the index aligned with df.
    metrics = pd.DataFrame(df['public_metrics'].tolist(), index=df.index)
    df[['retweets','likes']] = metrics[['retweet_count','like_count']]

    # errors='ignore': 'withheld' and 'edit_history_tweet_ids' are not always present.
    df = df.drop(
        columns=['edit_history_tweet_ids', 'public_metrics', 'withheld'],
        errors='ignore'
    )

    if dummy:
        df = dummy_sentiment(df, labels)

    if save:
        out_dir = Path('datasets')
        out_dir.mkdir(parents=True, exist_ok=True)
        # The raw text is intentionally left out of the saved file.
        df_save = df.drop(columns=['text'])
        df_save.to_csv(out_dir / f'tweets_{brand}_{size}.csv', sep=',', index=False)

    return df

In [29]:
# Example run: fetch tweets for the "sony" entity that are at least two days old,
# attach placeholder sentiment scores and save the result to CSV.
df = fetch_tt_data(
    'sony',
    query='entity:"sony" -is:retweet lang:en -from:sony',
    days=2,
    save=True,
    dummy=True,
    labels=labels,
)

df.head()

Unnamed: 0,created_at,id,text,text_tokenized,brand,retweets,likes,sentiment,negative,neutral,positive
0,2022-11-21 00:37:53+00:00,1594490245780836352,@crisperstorm I think it's more of a call back...,I think it's more of a call back to when infin...,sony,0,0,positive,0.093658,0.243139,0.663203
1,2022-11-21 00:37:51+00:00,1594490239476764678,I finally got around to watching #Morbius aaaa...,I finally got around to watching #Morbius aaaa...,sony,0,0,positive,0.253376,0.188635,0.557989
2,2022-11-21 00:36:37+00:00,1594489927366021120,@BirkinWill @psychonauts8 @MarcinIsHere I'd gu...,I'd guess PS has a mid-gen refresh first. It'l...,sony,1,3,neutral,0.003396,0.689539,0.307065
3,2022-11-21 00:36:00+00:00,1594489771643895808,@ElijahYak @Robert0726Rolfe @xMBGx I don’t giv...,I don’t give a SHIT about the praises Sony for...,sony,0,0,negative,0.919803,0.007504,0.072694
4,2022-11-21 00:34:26+00:00,1594489375819268104,"@getFANDOM WE dont want nor need this, WE want...","WE dont want nor need this, WE want Marvel Stu...",sony,0,1,negative,0.382888,0.349801,0.267311


In [17]:
import os

from sqlalchemy import create_engine

# The connection URL (which contains the database password) is read from the
# environment instead of being hardcoded in the notebook, e.g.
#   export FEEDBAQ_DB_URL='postgresql://<user>:<password>@localhost:5432/feedbaq'
# NOTE(review): the password that was previously committed here should be rotated.
database_url = os.environ.get('FEEDBAQ_DB_URL')
if not database_url:
    raise RuntimeError(
        "Set the FEEDBAQ_DB_URL environment variable to the database connection URL"
    )

engine = create_engine(database_url)

def export_to_db(data, engine, table_name="tweets"):
    """Append ``data`` to the ``table_name`` table through ``engine``.

    The raw ``text`` column is dropped, ``id`` is stored as ``tweet_id`` and
    ``text_tokenized`` is stored as ``text``. The DataFrame index is not
    written. ``data`` itself is left unmodified and nothing is returned.
    """
    db_ready = (
        data
        .drop(columns=['text'])
        .rename(columns={'id': 'tweet_id', 'text_tokenized': 'text'})
    )

    db_ready.to_sql(table_name, engine, index=False, if_exists='append')

In [None]:
# Append the fetched tweets to the default "tweets" table.
export_to_db(data=df, engine=engine)

### Sentiment Scoring

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
from scipy.special import softmax

In [None]:
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer_sentiment = AutoTokenizer.from_pretrained(MODEL)

# pre trained model
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Save the tokenizer next to the model. save_pretrained(MODEL) creates a local
# directory named like the hub id; if it held only the model files, a later
# AutoTokenizer.from_pretrained(MODEL) would resolve to that directory and fail
# to find the tokenizer, breaking Restart & Run All.
model.save_pretrained(MODEL)
tokenizer_sentiment.save_pretrained(MODEL)

In [None]:
import torch

text = [preprocess(t) for t in list(df.text.values)]
encoded_input = tokenizer_sentiment(text, return_tensors='pt', truncation=True, padding=True)

# Inference only: disable autograd so no computation graph is kept for the batch.
with torch.no_grad():
    output = model(**encoded_input)

In [None]:
# Turn the model logits into per-row probabilities.
scores = output[0].detach().numpy()
scores = softmax(scores,axis=1)

# Per-row label indices ordered from most to least likely. The reversal must be
# on the label axis; reversing the first axis would flip the row order instead
# and pair every tweet with another tweet's ranking.
ranking = np.argsort(scores, axis=1)[:, ::-1]

df['sentiment']=[labels[x] for x in np.argmax(scores,axis=1)]
# Column names assume the label order negative, neutral, positive in `labels`.
df['negative']=scores[:,0]
df['neutral']=scores[:,1]
df['positive']=scores[:,2]


# for printing
# for t in range(scores.shape[0]):
#     print(f'text: {df.text.iloc[t]}')
#     for i in range(scores[0].shape[0]):
#         l = labels[ranking[t][i]]
#         s = scores[t][ranking[t][i]]
#         print(f"{i+1}) {l} {np.round(float(s), 4)}")

### Emotion Scoring

In [None]:
task='emotion'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}" # load once

tokenizer_emotion = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
# NOTE: this rebinds the global `labels` (and `task`) used by the sentiment cells above.
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# pre trained model
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Save the tokenizer next to the model. save_pretrained(MODEL) creates a local
# directory named like the hub id; if it held only the model files, a later
# AutoTokenizer.from_pretrained(MODEL) would resolve to that directory and fail
# to find the tokenizer, breaking Restart & Run All.
model.save_pretrained(MODEL)
tokenizer_emotion.save_pretrained(MODEL)

In [None]:
import torch

text = [preprocess(t) for t in list(df.text.values)]
# truncation=True matches the sentiment cell and keeps over-long inputs within the model's maximum length.
encoded_input = tokenizer_emotion(text, return_tensors='pt', truncation=True, padding=True)

# Inference only: disable autograd so no computation graph is kept for the batch.
with torch.no_grad():
    output = model(**encoded_input)

In [None]:
# Turn the model logits into per-row probabilities.
scores = output[0].detach().numpy()
scores = softmax(scores,axis=1)

# Per-row label indices ordered from most to least likely. The reversal must be
# on the label axis; reversing the first axis would flip the row order instead
# and make the printout below show another tweet's ranking.
ranking = np.argsort(scores, axis=1)[:, ::-1]

df['emotion']=[labels[x] for x in np.argmax(scores,axis=1)]
# Column names assume the label order anger, joy, optimism, sadness in `labels`.
df['anger']=scores[:,0]
df['joy']=scores[:,1]
df['optimism']=scores[:,2]
df['sadness']=scores[:,3]


# for printing: show the ranked labels of (at most) the first three tweets
for t in range(min(3, scores.shape[0])):
    print(f'text: {df.text.iloc[t]}')
    for i in range(scores.shape[1]):
        l = labels[ranking[t][i]]
        s = scores[t][ranking[t][i]]
        print(f"{i+1}) {l} {np.round(float(s), 4)}")

##### 

In [None]:
# Write the scored frame to a semicolon-separated CSV, without the index.
df.to_csv(
    'tweets.csv',
    sep=';',
    quotechar='"',
    index=False,
)