In [1]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import re
import numpy as np
import hvplot.pandas
from transformers import pipeline

sentiment_pipeline = pipeline(model='cardiffnlp/twitter-roberta-base-sentiment')


  from .autonotebook import tqdm as notebook_tqdm
Downloading: 100%|█| 970/


In [2]:
def tweets(n_tweets, search_term, start_date, end_date):
    """
    get a dataframe of tweets by search term
    
    ref: https://betterprogramming.pub/how-to-scrape-tweets-with-snscrape-90124ed006af
    """
    # Creating list to append tweet data to
    tweets_list2 = []

    # Using TwitterSearchScraper to scrape data and append tweets to list
    for i,tweet in enumerate(sntwitter.TwitterSearchScraper(f'{search_term} since:{start_date} until:{end_date}').get_items()):
        if i>n_tweets:
            break
        tweets_list2.append([tweet.date, tweet.id, tweet.content])

    # Creating a dataframe from the tweets list above
    tweets_df2 = pd.DataFrame(tweets_list2, columns=['Datetime', 'Tweet Id', 'Text'])
    return tweets_df2

# Get tweets

In [3]:
df_camille df_camille = tweets(10000, 'camille vasquez', '2022-05-18', '2022-05-20')
df_johnny = tweets(10000, 'johnny depp', '2022-05-18', '2022-05-20')
df_amber = tweets(10000, 'amber heard', '2022-05-18', '2022-05-20')
df_shannon = tweets(10000, 'shannon curry', '2022-05-01', '2022-05-20')= pd.read_csv('df_camille_2day.csv', index_col=0)


In [4]:
len(df_camille), len(df_johnny), len(df_amber), len(df_shannon)

(8772, 10001, 10001, 1985)

In [5]:
df_camille.head()

Unnamed: 0,Datetime,Tweet Id,Text
0,2022-05-19 23:59:34+00:00,1527438848392847362,What IS the truth about the relationship betwe...
1,2022-05-19 23:58:47+00:00,1527438648945303566,Camille Vasquez ENRAGED by Elaine during Amber...
2,2022-05-19 23:58:14+00:00,1527438512995565568,ginisa ni camille vásquez si amber heard eh HW...
3,2022-05-19 23:56:21+00:00,1527438037256396807,Camille Vasquez: 5 Things To Know About Johnny...
4,2022-05-19 23:55:49+00:00,1527437905450262528,"@RkFutbol He stays at PSG, he is gonna be so h..."


# Let's start with one dataframe

In [6]:
df = df_camille

In [9]:
# DATA CLEANING 
# Make everything lower case
df = df.assign(Text=lambda x: x['Text'].str.lower())

In [11]:
# Remove the rows that mentioned the other three people.
df = df.query('~Text.str.contains("amber|heard|johnny|depp|shannon|curry")')

In [16]:
# Keep the rows that mention name 
df = df.query('Text.str.contains("camille|vasquez")')

In [26]:
# Remove all the URLs
df = df.assign(Text=lambda x:x['Text'].apply(lambda s: re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', s)))

In [36]:
# SENTIMENT ANALYSIS 
df = df.assign(sentiment=lambda x: x['Text'].apply(lambda s: sentiment_pipeline(s)))

In [38]:
# Get the sentiment label
df = df.assign(sentiment_label=lambda x: x['sentiment'].apply(lambda s: s[0]['label']))

In [40]:
# Get the sentiment score
df = df.assign(sentiment_score=lambda x: x['sentiment'].apply(lambda s: s[0]['score']))

In [42]:
# Recode the sentiment labels
df = df.assign(
    sentiment_label=np.where(
        df['sentiment_label']=='LABEL_0', 'NEGATIVE', np.where(
            df['sentiment_label']=='LABEL_2', 'POSITIVE', 'NEUTRAL'
        )
    )
)

# Data processing for all four data frames

In [48]:
df_dict = {
    'camille vasquez': df_camille,
    'johnny depp': df_johnny,
    'amber heard': df_amber,
    'shannon curry': df_shannon
}

In [49]:
remove_rows = {
    'camille vasquez': 'amber|heard|johnny|depp|shannon|curry',
    'johnny depp': 'amber|heard|camille|vasquez|shannon|curry',
    'amber heard': 'johnny|depp|camille|vasquez|shannon|curry',
    'shannon curry': 'johnny|depp|camille|vasquez|amber|heard',
}

In [50]:
keep_rows = {
    'camille vasquez': 'camille|vasquez',
    'johnny depp': 'johnny|depp',
    'amber heard': 'amber|heard',
    'shannon curry': 'shannon|curry',
}

In [51]:
# DATA CLEANING 
for key, df in df_dict.items():
    print(key)
    df_dict[key] = (
        df
        # Make everything lower case
        .assign(Text=lambda x: x['Text'].str.lower())
        # Remove the rows that mentioned the other three people.
        .query(f'~Text.str.contains("{remove_rows[key]}")')
        # Keep the rows that mention name 
        .query(f'Text.str.contains("{keep_rows[key]}")')
        # Remove all the URLs
        .assign(Text=lambda x:x['Text'].apply(lambda s: re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', s)))
    )
    print("# Tweets: ", len(df_dict[key]))

camille vasquez
# Tweets:  2517
johnny depp
# Tweets:  4356
amber heard
# Tweets:  4011
shannon curry
# Tweets:  712


In [52]:
# SENTIMENT ANALYSIS
for key, df in df_dict.items():
    df_dict[key] = (
        df_dict[key]
        # Apply the pre-trained sentiment model
        .assign(sentiment=lambda x: x['Text'].apply(lambda s: sentiment_pipeline(s)))
        # Get the sentiment label
        .assign(sentiment_label=lambda x: x['sentiment'].apply(lambda s: s[0]['label']))
        # Get the sentiment score
        .assign(sentiment_score=lambda x: x['sentiment'].apply(lambda s: s[0]['score']))
    )
    # Recode the sentiment labels
    df_dict[key] = df_dict[key].assign(
        sentiment_label=np.where(
            df_dict[key]['sentiment_label']=='LABEL_0', 'NEGATIVE', np.where(
                df_dict[key]['sentiment_label']=='LABEL_2', 'POSITIVE', 'NEUTRAL'
            )
        )
    )


In [61]:
dfc = pd.concat([
    df_dict['johnny depp'].query('sentiment_score>0.8')['sentiment_label'].value_counts(),
    df_dict['amber heard'].query('sentiment_score>0.8')['sentiment_label'].value_counts(),
    df_dict['camille vasquez'].query('sentiment_score>0.8')['sentiment_label'].value_counts(),
    df_dict['shannon curry'].query('sentiment_score>0.8')['sentiment_label'].value_counts()
], axis=1)
dfc

Unnamed: 0,sentiment_label,sentiment_label.1,sentiment_label.2,sentiment_label.3
NEUTRAL,908,668,448,164
NEGATIVE,577,897,130,62
POSITIVE,351,93,664,144


In [62]:
dfc.columns = ['Johnny Depp', 'Amber Heard', 'Camille Vasquez', 'Shannon Curry']

In [63]:
dfc

Unnamed: 0,Johnny Depp,Amber Heard,Camille Vasquez,Shannon Curry
NEUTRAL,908,668,448,164
NEGATIVE,577,897,130,62
POSITIVE,351,93,664,144


In [65]:
dfp = dfc/dfc.sum()

In [74]:
dfp.T

Unnamed: 0,NEUTRAL,NEGATIVE,POSITIVE
Johnny Depp,0.494553,0.31427,0.191176
Amber Heard,0.402895,0.541013,0.056092
Camille Vasquez,0.360709,0.10467,0.534622
Shannon Curry,0.443243,0.167568,0.389189


In [87]:
dfp.T.hvplot.bar(
    rot=90,
    title='Ratio of Tweets by Sentiment',
    color=["#ffcc5c", "#ff6f69", "#88d8b0"]
)

# N-gram analysis

In [78]:
df = df_dict['camille vasquez']

In [79]:
from nltk.corpus import stopwords
stoplist = stopwords.words('english')
from sklearn.feature_extraction.text import CountVectorizer

c_vec = CountVectorizer(stop_words=stoplist, ngram_range=(2,3))
# matrix of ngrams
ngrams = c_vec.fit_transform(df['Text'])
# count frequency of ngrams
count_values = ngrams.toarray().sum(axis=0)
# list of ngrams
vocab = c_vec.vocabulary_
df_ngram = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()], reverse=True)
            ).rename(columns={0: 'frequency', 1:'bigram/trigram'})


In [81]:
df_ngram.head(10)

Unnamed: 0,frequency,bigram/trigram
0,2100,camille vasquez
1,104,camille vásquez
2,57,love camille vasquez
3,57,love camille
4,53,cross examination
5,36,like camille
6,33,want camille
7,31,want camille vasquez
8,31,gt gt
9,30,like camille vasquez


# Topic modeling 

In [83]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import make_pipeline

tfidf_vectorizer = TfidfVectorizer(stop_words=stoplist, ngram_range=(2,3))
nmf = NMF(n_components=5)
pipe = make_pipeline(tfidf_vectorizer, nmf)
pipe.fit(df['Text'])
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += ", ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()
print_top_words(nmf, tfidf_vectorizer.get_feature_names(), n_top_words=5)

Topic #0: camille vasquez, crush camille vasquez, crush camille, camille vasquez badass, vasquez badass
Topic #1: love camille vasquez, love camille, think love camille, think love, camille vasquez
Topic #2: camille vasquez queen, vasquez queen, camille vasquez, vasquez queen love, queen truly
Topic #3: want camille, want camille vasquez, vasquez grow, camille vasquez grow, grow want
Topic #4: vasquez tweet, camille vasquez tweet, vasquez tweet love, tweet love, camille vasquez





# Other 

In [86]:
# the pre-trained sentiment model can't categorize some tweets correctly. 
# For example, the model categorize "bad ass" as negative
sentiment_pipeline("bad ass")

[{'label': 'LABEL_0', 'score': 0.8163448572158813}]