# SNS SCRAPE

### Install SNS Scrape

In [215]:
!pip install snscrape



### Setup SNS Scrape for our Tweets

In [216]:
import snscrape.modules.twitter as sntwitter
from datetime import datetime,timedelta
from dateutil.relativedelta import relativedelta
import pandas as pd


def run_the_tweet(keywords,incident_date, n_months, level_fame, language):
    date_format = '%Y-%m-%d'
    dtObj = datetime.strptime(incident_date, date_format)
    
    if level_fame == 'Legendary':
        min_retweets = 10
    elif level_fame == 'Superstar':
        min_retweets = 10
    elif level_fame == 'Mainstream':
        min_retweets = 2
    else:
        min_retweets = 0
    
    begin_date = (dtObj - relativedelta(months=n_months)).date()
    since_date = (dtObj - relativedelta(months=n_months)).date()
    
    until_date = since_date + timedelta(days=1)
    final_date = (dtObj + relativedelta(months=n_months)).date()
    
    n_days = (final_date-since_date).days
    
    query = " OR ".join(keywords)
    if len(keywords) == 1:
        query = keywords[0]
    tweets_list= []
    for x in range(n_days): #no. of days
        for i,tweet in enumerate(sntwitter.TwitterSearchScraper(f'{query} since:{since_date.strftime(date_format)} until:{until_date.strftime(date_format)}  min_retweets:{min_retweets} lang:{language}').get_items()):
            if i > 50: #no. of output tweet
                break
            else:
                tweets_list.append([tweet.date, tweet.retweetCount, tweet.likeCount, tweet.content, tweet.user.username]) #append if statement satisfy
        if x < n_days : # no. of days you want to be return
            since_date = since_date + timedelta(days=1) #add another day
            until_date = until_date + timedelta(days=1) #add another day
        else:
            break
    tweets_df = pd.DataFrame(tweets_list, columns=['Datetime',"retweet","likes", 'Text', 'Username'])
    return tweets_df

### Clean Tweets

In [217]:
!pip install nltk



In [218]:
!pip install emot



In [219]:
!pip install emoji



In [220]:
pip install clean-text

Note: you may need to restart the kernel to use updated packages.


In [221]:
import re
import string
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import emoji
from cleantext import clean


import string
import re

from emot.emo_unicode import UNICODE_EMOJI

def clean_text(text):
    #changing to lowercase
    text = text.lower()
    
    # removing #´s 
    text = re.sub(r'#[A-Za-z0-9]+', '', text)
    text = re.sub(r'#', '', text)
    
    #remove RT
    text = re.sub(r'RT[\s]+', '', text)
    
    #remove links
    text = re.sub(r'https?:\/\/\S+', '', text)
    text = re.sub(r"www.\S+", "", text)
    
    #remove indentation
    text = re.sub(r'\n', '', text)

    #remove emojis
    text = clean(text, no_emoji=True)
    
    
    #tokenizing words
    tokens = word_tokenize(text)
    
    filtered_words = [w for w in tokens if w not in list(UNICODE_EMOJI.keys())]
    
    return text

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/stefantonto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/stefantonto/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


***
# SENTIMENT ANALYSIS

## INSTALL ROBERTA

In [222]:
!pip install transformers



In [223]:
!pip install torch



In [224]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [229]:
from transformers import pipeline

sentiment_task = pipeline("sentiment-analysis", model= model, tokenizer= tokenizer, top_k=3)

In [230]:
chris_brown_tweets = run_the_tweet(["chris brown"], "2022-08-07" ,1, "10", "en")

In [35]:
chris_brown_tweets['cleaned_tweet'] = chris_brown_tweets.Text.apply(clean_text)

In [36]:
chris_brown_tweets

Unnamed: 0,Datetime,retweet,likes,Text,Username,cleaned_tweet
0,2022-07-07 23:59:21+00:00,40,159,"Davido owe Shakespeare nothing, absolutely NOT...",30BG_CEO,"davido owe shakespeare nothing, absolutely not..."
1,2022-07-07 23:58:01+00:00,159,771,Davido and Chris brown is like Boli and ground...,Tobiloba_mide,davido and chris brown is like boli and ground...
2,2022-07-07 23:57:17+00:00,182,1045,Davido and Chris brown no Dey miss🔥🚨,malcom_xo,davido and chris brown no dey miss
3,2022-07-08 23:41:28+00:00,57,413,"Davido, Chris Brown &amp; Nobody Has To Know i...",lifeofolaa,"davido, chris brown & nobody has to know is tr..."
4,2022-07-08 23:38:02+00:00,97,776,Chris brown ft Davido Nobody has to know is 🙌🏻...,Amara_Layo,chris brown ft davido nobody has to know is ev...
...,...,...,...,...,...,...
181,2022-09-05 21:24:41+00:00,29,38,"Jay bahd be weed smoker, Philan be mmaa p3 kil...",kwesi_zitojnr,"jay bahd be weed smoker, philan be mmaa p3 kil..."
182,2022-09-05 20:51:42+00:00,16,95,|🚨\nExtra Spotify Update — Streams\n\n•Time N ...,RemaStats,|extra spotify update streamstime n affection ...
183,2022-09-06 22:23:42+00:00,26,104,I don't know if I am the only one but I feel “...,RabsonLee,"i don't know if i am the only one but i feel ""..."
184,2022-09-06 21:50:24+00:00,38,188,The Goat @davido and his fellow American Goat ...,30BG_CEO,the goat @davido and his fellow american goat ...


In [100]:
ts_tweets = run_the_tweet(["Taylor Swift"], "2022-07-31" ,1, "10", "en")

In [101]:
ts_tweets

Unnamed: 0,Datetime,retweet,likes,Text,Username
0,2022-06-30 23:47:21+00:00,49,246,Most streamed artists on Global Spotify (June ...,WORLDMUSICAWARD
1,2022-06-30 23:36:15+00:00,1065,12458,Taylor Swift is engaged to Joe Alwyn: report h...,nypost
2,2022-06-30 23:20:45+00:00,51,826,does it ever hit you that phoebe bridgers and ...,phoebesmoons
3,2022-06-30 22:58:03+00:00,1133,8686,me?????? obsessed?????? with????? Carolina by ...,klbegley15
4,2022-06-30 22:53:20+00:00,11,180,I miss watching updates about american weather...,icantunseeit
...,...,...,...,...,...
3157,2022-08-30 20:56:09+00:00,11,151,"📸 | @SabrinaAnnLynn, @taylorswift13, and Rosé ...",sabrinaupdatehq
3158,2022-08-30 20:54:36+00:00,29,210,"I'd rather listen to ""All Summer Long"" by Kid ...",NickAdamsinUSA
3159,2022-08-30 20:53:15+00:00,16,87,Swifties who openly use their account to mock ...,oneswiftlover
3160,2022-08-30 20:51:16+00:00,145,2161,‼️UPDATE: @taylorswift13's profile picture has...,TheSwiftSociety


In [102]:
ts_tweets['cleaned_tweet'] = ts_tweets.Text.apply(clean_text)

## SENTIMENT CLASSIFICATION

In [243]:
def get_sentiment_score_dictionary(df_cleanedtext):
    list_sentiments = []
    for tweet in df_cleanedtext.cleaned_tweet:
        sentiment_task_result = sentiment_task(tweet)[0]
        tweet_sentiment = {}
        for label in sentiment_task_result:
            if label['label'] == 'Neutral':
                tweet_sentiment['Neutral'] = label['score']
            elif label['label'] == 'Positive':
                tweet_sentiment['Positive'] = label['score']
            else:
                tweet_sentiment['Negative'] = label['score']
        list_sentiments.append(tweet_sentiment)
    return list_sentiments

def get_sentiment_score_classification(list_sentiments):
    sentiment_classifications = []
    for list_sentiment in list_sentiments:
        if (list_sentiment['Positive'] > list_sentiment['Negative']) and (list_sentiment['Positive'] > (list_sentiment['Neutral'] + list_sentiment['Negative'])):
            sentiment_classifications.append('Extremely Positive')
        elif (list_sentiment['Positive'] > list_sentiment['Negative']) and (list_sentiment['Positive'] < (list_sentiment['Neutral'] + list_sentiment['Negative'])):
            sentiment_classifications.append('Positive')
        elif (list_sentiment['Negative'] > list_sentiment['Positive']) and (list_sentiment['Negative'] > (list_sentiment['Neutral'] + list_sentiment['Positive'])):
            sentiment_classifications.append('Extremely Negative')
        elif (list_sentiment['Negative'] > list_sentiment['Positive']) and (list_sentiment['Negative'] < (list_sentiment['Neutral'] + list_sentiment['Negative'])):
            sentiment_classifications.append('Negative')
        else:
            sentiment_classifications.append('Neutral')
    return sentiment_classifications

### SENTIMENT CLASSIFICATION WITH TAYLOR SWIFT

In [104]:
classified_sentiment_ts = get_sentiment_score_classification(get_sentiment_score_dictionary(ts_tweets.cleaned_tweet))

In [105]:
pd.merge(ts_tweets, pd.DataFrame.from_dict(get_sentiment_score_dictionary(ts_tweets.cleaned_tweet)), left_index= True, right_index= True)

Unnamed: 0,Datetime,retweet,likes,Text,Username,cleaned_tweet,Neutral,Positive,Negative
0,2022-06-30 23:47:21+00:00,49,246,Most streamed artists on Global Spotify (June ...,WORLDMUSICAWARD,most streamed artists on global spotify (june ...,0.733290,0.256657,0.010054
1,2022-06-30 23:36:15+00:00,1065,12458,Taylor Swift is engaged to Joe Alwyn: report h...,nypost,taylor swift is engaged to joe alwyn: report,0.955908,0.024010,0.020082
2,2022-06-30 23:20:45+00:00,51,826,does it ever hit you that phoebe bridgers and ...,phoebesmoons,does it ever hit you that phoebe bridgers and ...,0.724014,0.260200,0.015786
3,2022-06-30 22:58:03+00:00,1133,8686,me?????? obsessed?????? with????? Carolina by ...,klbegley15,me?????? obsessed?????? with????? carolina by ...,0.040586,0.956709,0.002705
4,2022-06-30 22:53:20+00:00,11,180,I miss watching updates about american weather...,icantunseeit,i miss watching updates about american weather...,0.670205,0.159175,0.170620
...,...,...,...,...,...,...,...,...,...
3157,2022-08-30 20:56:09+00:00,11,151,"📸 | @SabrinaAnnLynn, @taylorswift13, and Rosé ...",sabrinaupdatehq,"| @sabrinaannlynn, @taylorswift13, and rose fr...",0.841150,0.155392,0.003458
3158,2022-08-30 20:54:36+00:00,29,210,"I'd rather listen to ""All Summer Long"" by Kid ...",NickAdamsinUSA,"i'd rather listen to ""all summer long"" by kid ...",0.280083,0.043955,0.675962
3159,2022-08-30 20:53:15+00:00,16,87,Swifties who openly use their account to mock ...,oneswiftlover,swifties who openly use their account to mock ...,0.167535,0.010010,0.822455
3160,2022-08-30 20:51:16+00:00,145,2161,‼️UPDATE: @taylorswift13's profile picture has...,TheSwiftSociety,update: @taylorswift13's profile picture has b...,0.893864,0.100866,0.005270


In [106]:
ts_df = pd.merge(ts_tweets, pd.DataFrame.from_dict(classified_sentiment_ts), left_index= True, right_index= True)

In [107]:
ts_df.rename(columns = {0:'sentiment_classification'}, inplace = True)

In [108]:
ts_df.sentiment_classification.value_counts()

Neutral     1700
Positive    1260
Negative     202
Name: sentiment_classification, dtype: int64

In [201]:
import os 

In [202]:
pwd

'/Users/stefantonto/code/srs366/musicians_v_cancellation/notebooks'

In [274]:
import pandas as pd
artists_df = pd.read_csv('../raw_data/artists.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../raw_data/artists.csv'

In [273]:
import numpy as np 

artists_df['NICKNAME'] = artists_df['NICKNAME'].apply(lambda x : x.split(", "))

AttributeError: 'list' object has no attribute 'split'

In [None]:
keywords,incident_date, n_months, level_fame, language

In [231]:
taylor_tweets_2 = run_the_tweet(artists_df.iloc[1]['NICKNAME'], artists_df.iloc[1]['DATE OF CANCELLATION'], 12, artists_df.iloc[1]['LEVEL FAME'], 'en')

taylor_tweets_2['cleaned_tweet'] = taylor_tweets_2.Text.apply(clean_text)

In [238]:
classified_sentiment_ts2 = get_sentiment_score_classification(get_sentiment_score_dictionary(taylor_tweets_2[['cleaned_tweet']]))

In [244]:
taylor_tweets_3 = run_the_tweet(artists_df.iloc[1]['NICKNAME'], artists_df.iloc[1]['DATE OF CANCELLATION'], 1, artists_df.iloc[1]['LEVEL FAME'], 'en')

taylor_tweets_3['cleaned_tweet'] = taylor_tweets_3.Text.apply(clean_text)

In [247]:
tt3_sentscores = get_sentiment_score_dictionary(taylor_tweets_3[['cleaned_tweet']])

In [250]:
taylor_scores_only = pd.merge(taylor_tweets_3, pd.DataFrame.from_dict(tt3_sentscores), left_index= True, right_index= True)

In [292]:
taylor_df = pd.merge(taylor_scores_only, pd.DataFrame.from_dict(get_sentiment_score_classification(tt3_sentscores)), right_index=True, left_index=True)

In [294]:
taylor_df.tail()

Unnamed: 0,Datetime,retweet,likes,Text,Username,cleaned_tweet,Positive,Neutral,Negative,0
3157,2022-08-31,74,588,Taylor Swift's best days on Spotify in 2022\n#...,SpotifySwiftie,taylor swift's best days on spotify in 2022 37...,0.952261,0.045677,0.002062,Extremely Positive
3158,2022-08-31,38,396,just got a work email that said “i hope you ar...,brittanygibsonn,"just got a work email that said ""i hope you ar...",0.684937,0.306684,0.008378,Extremely Positive
3159,2022-08-31,19,80,"New releases are coming from Taylor Swift, Joh...",TIME,"new releases are coming from taylor swift, joh...",0.572115,0.424348,0.003536,Extremely Positive
3160,2022-08-31,20,460,i think hannah waddingham should pull a taylor...,hadyoubigtime,i think hannah waddingham should pull a taylor...,0.13888,0.826264,0.034856,Positive
3161,2022-08-31,60,903,Taylor Swift has now surpassed 7 Billion strea...,spotify_swift,taylor swift has now surpassed 7 billion strea...,0.702198,0.29358,0.004222,Extremely Positive


In [254]:
taylor_df.rename(columns = {0:'classified_sentiment'}, inplace = True)

In [263]:
import datetime

In [293]:
taylor_df['Datetime'] = taylor_df['Datetime'].apply(lambda x: str(x)[0:10])

#taylor_df['Datetime'] = taylor_df['Datetime'].apply(lambda x: x[0:10])