# SNS SCRAPE

### Install SNS Scrape

In [1]:
!pip install snscrape



### Setup SNS Scrape for our Tweets

In [98]:
import snscrape.modules.twitter as sntwitter
from datetime import datetime,timedelta
from dateutil.relativedelta import relativedelta
import pandas as pd


def run_the_tweet(keywords,incident_date, n_months, min_retweets, language):
    date_format = '%Y-%m-%d'
    dtObj = datetime.strptime(incident_date, date_format)
    
    begin_date = (dtObj - relativedelta(months=n_months)).date()
    since_date = (dtObj - relativedelta(months=n_months)).date()
    
    until_date = since_date + timedelta(days=1)
    final_date = (dtObj + relativedelta(months=n_months)).date()
    
    n_days = (final_date-since_date).days
    
    query = " OR ".join(keywords)
    if len(keywords) == 1:
        query = keywords[0]
    tweets_list= []
    for x in range(n_days): #no. of days
        for i,tweet in enumerate(sntwitter.TwitterSearchScraper(f'{query} since:{since_date.strftime(date_format)} until:{until_date.strftime(date_format)}  min_retweets:{min_retweets} lang:{language}').get_items()):
            if i > 50: #no. of output tweet
                break
            else:
                tweets_list.append([tweet.date, tweet.retweetCount, tweet.likeCount, tweet.content, tweet.user.username]) #append if statement satisfy
        if x < n_days : # no. of days you want to be return
            since_date = since_date + timedelta(days=1) #add another day
            until_date = until_date + timedelta(days=1) #add another day
        else:
            break
    tweets_df = pd.DataFrame(tweets_list, columns=['Datetime',"retweet","likes", 'Text', 'Username'])
    return tweets_df

### Clean Tweets

In [3]:
!pip install nltk



In [4]:
!pip install emot



In [5]:
!pip install emoji



In [6]:
pip install clean-text

Note: you may need to restart the kernel to use updated packages.


In [99]:
import re
import string
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import emoji
from cleantext import clean


import string
import re

from emot.emo_unicode import UNICODE_EMOJI

def clean_text(text):
    #changing to lowercase
    text = text.lower()
    
    # removing #´s 
    text = re.sub(r'#[A-Za-z0-9]+', '', text)
    text = re.sub(r'#', '', text)
    
    #remove RT
    text = re.sub(r'RT[\s]+', '', text)
    
    #remove links
    text = re.sub(r'https?:\/\/\S+', '', text)
    text = re.sub(r"www.\S+", "", text)
    
    #remove indentation
    text = re.sub(r'\n', '', text)

    #remove emojis
    text = clean(text, no_emoji=True)
    
    
    #tokenizing words
    tokens = word_tokenize(text)
    
    filtered_words = [w for w in tokens if w not in list(UNICODE_EMOJI.keys())]
    
    return text

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/stefantonto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/stefantonto/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


***
# SENTIMENT ANALYSIS

In [9]:
!pip install transformers



In [10]:
!pip install torch

Collecting torch
  Downloading torch-1.12.1-cp38-none-macosx_10_9_x86_64.whl (137.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.8/137.8 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: torch
Successfully installed torch-1.12.1


In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

  from .autonotebook import tqdm as notebook_tqdm
Downloading pytorch_model.bin: 100%|█████████| 478M/478M [01:27<00:00, 5.76MB/s]
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [33]:
from transformers import pipeline

sentiment_task = pipeline("sentiment-analysis", model= model, tokenizer= tokenizer, top_k=3)

In [34]:
chris_brown_tweets = run_the_tweet(["chris brown"], "2022-08-07" ,1, "10", "en")

In [35]:
chris_brown_tweets['cleaned_tweet'] = chris_brown_tweets.Text.apply(clean_text)

In [36]:
chris_brown_tweets

Unnamed: 0,Datetime,retweet,likes,Text,Username,cleaned_tweet
0,2022-07-07 23:59:21+00:00,40,159,"Davido owe Shakespeare nothing, absolutely NOT...",30BG_CEO,"davido owe shakespeare nothing, absolutely not..."
1,2022-07-07 23:58:01+00:00,159,771,Davido and Chris brown is like Boli and ground...,Tobiloba_mide,davido and chris brown is like boli and ground...
2,2022-07-07 23:57:17+00:00,182,1045,Davido and Chris brown no Dey miss🔥🚨,malcom_xo,davido and chris brown no dey miss
3,2022-07-08 23:41:28+00:00,57,413,"Davido, Chris Brown &amp; Nobody Has To Know i...",lifeofolaa,"davido, chris brown & nobody has to know is tr..."
4,2022-07-08 23:38:02+00:00,97,776,Chris brown ft Davido Nobody has to know is 🙌🏻...,Amara_Layo,chris brown ft davido nobody has to know is ev...
...,...,...,...,...,...,...
181,2022-09-05 21:24:41+00:00,29,38,"Jay bahd be weed smoker, Philan be mmaa p3 kil...",kwesi_zitojnr,"jay bahd be weed smoker, philan be mmaa p3 kil..."
182,2022-09-05 20:51:42+00:00,16,95,|🚨\nExtra Spotify Update — Streams\n\n•Time N ...,RemaStats,|extra spotify update streamstime n affection ...
183,2022-09-06 22:23:42+00:00,26,104,I don't know if I am the only one but I feel “...,RabsonLee,"i don't know if i am the only one but i feel ""..."
184,2022-09-06 21:50:24+00:00,38,188,The Goat @davido and his fellow American Goat ...,30BG_CEO,the goat @davido and his fellow american goat ...


In [89]:
ts_tweets = run_the_tweet(["Taylor Swift"], "2022-07-31" ,1, "10", "en")

In [90]:
ts_tweets

Unnamed: 0,Datetime,retweet,likes,Text,Username
0,2022-06-30 23:47:21+00:00,49,245,Most streamed artists on Global Spotify (June ...,WORLDMUSICAWARD
1,2022-06-30 23:36:15+00:00,1065,12458,Taylor Swift is engaged to Joe Alwyn: report h...,nypost
2,2022-06-30 23:20:45+00:00,51,826,does it ever hit you that phoebe bridgers and ...,phoebesmoons
3,2022-07-01 23:56:00+00:00,24,167,@taylorswift13 We will forever be streaming Ta...,TheSwiftSociety
4,2022-07-01 23:51:09+00:00,51,404,#StrangerThings and taylor swift parallels htt...,W00DVALE
...,...,...,...,...,...
181,2022-08-29 23:57:14+00:00,31,257,🚨| @taylorswift13’s #MidnighTS announcement ha...,TayliviaTeam
182,2022-08-29 23:54:47+00:00,75,1506,"@leosneedy @PopBase @taylorswift13 Yes, and it...",mainpopgirI
183,2022-08-30 23:59:17+00:00,26,328,i’ve never been able to afford tickets to see ...,ghostofeste
184,2022-08-30 23:59:04+00:00,151,1789,dylan o’brien as taylor swifts “midnights” htt...,visionaryfxs


In [91]:
ts_tweets['cleaned_tweet'] = ts_tweets.Text.apply(clean_text)

In [92]:
def get_sentiment_score_dictionary(df_cleanedtext):
    list_sentiments = []
    for tweet in ts_tweets.cleaned_tweet:
        sentiment_task_result = sentiment_task(tweet)[0]
        tweet_sentiment = {}
        for label in sentiment_task_result:
            if label['label'] == 'Neutral':
                tweet_sentiment['Neutral'] = label['score']
            elif label['label'] == 'Positive':
                tweet_sentiment['Positive'] = label['score']
            else:
                tweet_sentiment['Negative'] = label['score']
        list_sentiments.append(tweet_sentiment)
    return list_sentiments

def get_sentiment_score_classification(list_sentiments):
    sentiment_classifications = []
    for list_sentiment in list_sentiments:
        if (list_sentiment['Positive'] > list_sentiment['Negative']) and (list_sentiment['Positive'] > list_sentiment['Neutral']):
            sentiment_classifications.append('Positive')
        elif (list_sentiment['Neutral'] > list_sentiment['Negative']) and (list_sentiment['Positive'] < list_sentiment['Neutral']):
            sentiment_classifications.append('Neutral')
        else:
            sentiment_classifications.append('Negative')
    return sentiment_classifications

In [93]:
classified_sentiment_ts = get_sentiment_score_classification(get_sentiment_score_dictionary(ts_tweets.cleaned_tweet))

In [94]:
pd.merge(ts_tweets, pd.DataFrame.from_dict(get_sentiment_score_dictionary(ts_tweets.cleaned_tweet)), left_index= True, right_index= True)

Unnamed: 0,Datetime,retweet,likes,Text,Username,cleaned_tweet,Neutral,Positive,Negative
0,2022-06-30 23:47:21+00:00,49,245,Most streamed artists on Global Spotify (June ...,WORLDMUSICAWARD,most streamed artists on global spotify (june ...,0.733290,0.256657,0.010054
1,2022-06-30 23:36:15+00:00,1065,12458,Taylor Swift is engaged to Joe Alwyn: report h...,nypost,taylor swift is engaged to joe alwyn: report,0.955908,0.024010,0.020082
2,2022-06-30 23:20:45+00:00,51,826,does it ever hit you that phoebe bridgers and ...,phoebesmoons,does it ever hit you that phoebe bridgers and ...,0.724014,0.260200,0.015786
3,2022-07-01 23:56:00+00:00,24,167,@taylorswift13 We will forever be streaming Ta...,TheSwiftSociety,@taylorswift13 we will forever be streaming ta...,0.145578,0.851489,0.002932
4,2022-07-01 23:51:09+00:00,51,404,#StrangerThings and taylor swift parallels htt...,W00DVALE,and taylor swift parallels,0.881341,0.101206,0.017453
...,...,...,...,...,...,...,...,...,...
181,2022-08-29 23:57:14+00:00,31,257,🚨| @taylorswift13’s #MidnighTS announcement ha...,TayliviaTeam,| @taylorswift13's announcement has crossed 1 ...,0.083631,0.914424,0.001945
182,2022-08-29 23:54:47+00:00,75,1506,"@leosneedy @PopBase @taylorswift13 Yes, and it...",mainpopgirI,"@leosneedy @popbase @taylorswift13 yes, and it...",0.798770,0.166449,0.034781
183,2022-08-30 23:59:17+00:00,26,328,i’ve never been able to afford tickets to see ...,ghostofeste,i've never been able to afford tickets to see ...,0.217211,0.749612,0.033177
184,2022-08-30 23:59:04+00:00,151,1789,dylan o’brien as taylor swifts “midnights” htt...,visionaryfxs,"dylan o'brien as taylor swifts ""midnights""",0.943628,0.047208,0.009165


In [95]:
ts_df = pd.merge(ts_tweets, pd.DataFrame.from_dict(classified_sentiment_ts), left_index= True, right_index= True)

In [96]:
ts_df.rename(columns = {0:'sentiment_classification'}, inplace = True)

In [97]:
ts_df.sentiment_classification.value_counts()

Neutral     114
Positive     61
Negative     11
Name: sentiment_classification, dtype: int64

In [67]:
import pandas as pd

df = pd.read_csv('../raw_data/artist_data.csv')

In [68]:
df['ARTIST NICKNAME'] = df['ARTIST NICKNAME'].apply(lambda x : x.split(", "))

In [64]:
df.head(1)

Unnamed: 0,ARTIST,DATE OF CANCELLATION,GENDER,RACE,GENRE,LEVEL FAME,REASON FOR CANCELLATION,TWITTER HANDLE,TWITTER FOLLOWERS,CHARTMETRIC ID,ARTIST NICKNAME,CANCELLED
0,Nicki Minaj,2021-09-13,Female,Black,Hip-hop,Superstar,Spreading COVID misinformation,@NICKIMINAJ,26000000,3442,[Nicki Minaj],cancelled


In [69]:
df['ARTIST NICKNAME'].apply(lambda x : x.append(df['TWITTER HANDLE'][df['ARTIST NICKNAME'] == x]));

ValueError: ('Lengths must match to compare', (44,), (1,))

In [66]:
df

Unnamed: 0,ARTIST,DATE OF CANCELLATION,GENDER,RACE,GENRE,LEVEL FAME,REASON FOR CANCELLATION,TWITTER HANDLE,TWITTER FOLLOWERS,CHARTMETRIC ID,ARTIST NICKNAME,CANCELLED
0,Nicki Minaj,2021-09-13,Female,Black,Hip-hop,Superstar,Spreading COVID misinformation,@NICKIMINAJ,26000000.0,3442,"[Nicki Minaj, [@NICKIMINAJ, @taylorswift13, @D...",cancelled
1,Taylor Swift,2022-08-01,Female,White,Pop,Superstar,No.1 Private jet usage,@taylorswift13,91000000.0,2762,"[Taylor Swift, [@NICKIMINAJ, @taylorswift13, @...",cancelled
2,DaBaby,2021-07-26,Male,Black,Hip-hop,Superstar,Homophobic remarks/insensitive HIV comments,@DaBabyDaBaby,4500000.0,398544,"[DaBaby´, [@NICKIMINAJ, @taylorswift13, @DaBab...",cancelled
3,Zayn Malik,2021-10-28,Male,South Asian,Pop,Superstar,Accused of physical assault,@zaynmalik,31000000.0,207124,"[Zayn, Zayn Malik, [@NICKIMINAJ, @taylorswift1...",cancelled
4,Doja Cat,2020-05-22,Female,Black,Hip-hop,Superstar,Accused of affiliating with white supremacists,@DojaCat,5400000.0,217671,"[Doja, Doja Cat, [@NICKIMINAJ, @taylorswift13,...",cancelled
5,Marilyn Manson,2021-02-01,Male,White,Rock,Superstar,Relationship abuse & domestic violence,@marilynmanson,1100000.0,2286,"[Marilyn Manson, [@NICKIMINAJ, @taylorswift13,...",cancelled
6,Travis Scott,2021-11-01,Male,Black,Hip-hop,Superstar,Astroworld festival crush,@trvisXX,11600000.0,4215,"[Travis Scott, [@NICKIMINAJ, @taylorswift13, @...",cancelled
7,Chris Brown,2022-01-28,Male,Black,Pop,Superstar,Rape & drug (Crime),@chrisbrown,32200000.0,2249,"[Chris Brown, [@NICKIMINAJ, @taylorswift13, @D...",cancelled
8,Lana Del Rey,2020-05-21,Female,White,Pop,Superstar,Accused of racism,,,3986,"[Lana, Lana Del Rey, [@NICKIMINAJ, @taylorswif...",cancelled
9,Justin Timberlake,2021-02-01,Male,White,Pop,Superstar,Britney Spears and Janet Jackson's confession,@jtimberlake,63000000.0,1450,"[Justin Timberlake, [@NICKIMINAJ, @taylorswift...",cancelled
