## Requirements

In [1]:
import os
from os import mkdir
from genericpath import exists
from tqdm import tqdm
from time import sleep

import pandas as pd
import numpy as np
import tweepy
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from TwitterApi.tweet_api import TweetCrawlerAcademic


In [2]:
pd.option_context('display.float_format',  float)


<pandas._config.config.option_context at 0x220902ea4f0>

In [3]:
# constant variables
START_TIME = "2019-01-01T00:00:00Z"
END_TIME = "2022-07-01T00:00:00Z"

## Crawl Twitter

In [4]:
def set_output_name(string):
    name = string.replace(" ", "_")
    return name.replace(":", "")

def edit_dict_key_names(dictionary, dicname):
    l = dict()
    for key in dictionary.keys():
        newkey = f'{dicname}_{key}'
        l[newkey] = dictionary[key]
    return l

def retrieve_dataframes(store_path, page):
    users = pd.read_csv(f'{store_path}/{page}/users.csv')
    tweets = pd.read_csv(f'{store_path}/{page}/tweets.csv')
    references = pd.read_csv(f'{store_path}/{page}/refrences.csv', index_col='id')
    return users, tweets, references

In [5]:
client = TweetCrawlerAcademic("API.JSON", "crawler.log")

In [6]:
query = "I apologize harassed is:quote -is:retweet lang:en"
filename = set_output_name(query)
print(filename)
store_path = f'data/{filename}'

I_apologize_harassed_isquote_-isretweet_langen


In [7]:
client.get_all_tweets(
    query=query,
    start_time=START_TIME,
    end_time=END_TIME,
    store_path=store_path
)

2it [00:19,  9.78s/it]


In [8]:
tw = pd.read_csv(f'{store_path}/page1/tweets.csv')
tw = tw[tw["public_metrics_reply_count"] >= 5]
tw.head()

Unnamed: 0,author_id,created_at,id,lang,public_metrics_like_count,public_metrics_quote_count,public_metrics_reply_count,public_metrics_retweet_count,quoted_id,replied_to_id,text
13,1400893210264707072,2022-06-04 23:55:39+00:00,1533236067746209792,en,201,0,5,33,,,"""I'm still gonna do this to people"" \n\nyou me..."
29,1271479172989218818,2022-04-15 19:24:12+00:00,1515048360687779841,en,253,3,6,20,1.515043e+18,,"All what Dream has to do: ""I'm sorry to Bitzel..."
38,1349196499708596225,2022-03-21 16:05:19+00:00,1505938617054556163,en,74,0,13,0,1.505934e+18,,want to apologize for putting this shit on you...
51,753984763115950081,2022-02-09 21:27:10+00:00,1491524097620070400,en,141,1,5,32,1.491503e+18,,It's very telling that this letter doesn't act...
98,149913262,2021-09-06 00:18:30+00:00,1434672318786179072,en,213,0,19,5,,,"On behalf of all Hawkeye fans, I apologize for..."


## Get Replies

In [23]:
def get_all_replies(client, path, conversation_id):
    conv_query = f'conversation_id:{conversation_id} -is:retweet'
    
    out = pd.DataFrame()
    for i, respond in tqdm(enumerate(tweepy.Paginator(client.client.search_all_tweets,
                                                      query=conv_query,
                                                      max_results=500,
                                                      start_time=START_TIME,
                                                      end_time=END_TIME))):
        if respond.data is None:
            print("respond.data is none")
            continue
        out = out.append(pd.DataFrame(client.__get_respond_dfs__(respond)['tweets']), ignore_index=True)
        sleep(1)
    if not out.empty:
        if len(out.index) > 0:
            directory = f'{path}/{conversation_id}'
            if not os.path.exists(directory):
                mkdir(directory)
            out.to_excel(f'{directory}/replies.xlsx', index=False)
            print(f'File saved at {directory}/replies.txt')
        else:
            print(f'{out} size is < 5')
    else:
        print(f'len(out.index) is {len(out.index)}')
    return out                     

In [10]:
get_all_replies(client, 'data', 1515048360687779841)

1it [00:01,  1.65s/it]


File saved at data/1515048360687779841/replies.txt


Unnamed: 0,id,text
0,1515069893657432065,@MLGflower @OfflinePepsi @RazaAkira Hello dear...
1,1515060182757249030,@RazaAkira Here’s the apology \n\nhttps://t.co...
2,1515055448768094211,@RazaAkira why would he have to apologize to b...
3,1515055009506992134,@MLGflower @BlkHwk0ps @OfflinePepsi @RazaAkira...
4,1515054753876951040,@OfflinePepsi @RazaAkira GrantCohn better + Ra...
5,1515054483528691725,@RazaAkira Why Bitzel?
6,1515053804357574660,@MLGflower @RazaAkira GrantCohn better + Ratio...
7,1515051527538241538,@RazaAkira 2 priv qrts LOL
8,1515050338474409984,@RazaAkira bitzel is harrassing dream and an a...
9,1515050226997862401,You can private quote tweet me all you want. W...


## Get Self-quote Tweets

In [11]:
# def get_sq_tweets_of_page(store_path, page):    

#     self_quote_tweets = pd.DataFrame()
#     _, tweets, references = retrieve_dataframes(store_path, page)
#     for t_id, q_id in tweets[['author_id', 'quoted_id']].values:
#         quoted_tweets = references.loc[q_id]
#         if len(quoted_tweets.index) == 0:
#             continue
#         quoted_tweet = quoted_tweets.iloc[0]
#         if (quoted_tweet.author_id == t_id):
#             dict1 = edit_dict_key_names(dict(tweets.iloc[tweet]), 'tweets')
#             dict2 = edit_dict_key_names(dict(quoted_tweet), 'references')
#             merged_dict = {**dict1, **dict2}
#             self_quote_tweets = self_quote_tweets.append(merged_dict, ignore_index=True)

#     print(self_quote_tweets.shape)
#     return self_quote_tweets



def get_sq_tweets_of_page(store_path, page):    

    # self_quote_tweets = pd.DataFrame()
    _, tweets, references = retrieve_dataframes(store_path, page)
    tweets.dropna(subset=['quoted_id'], inplace=True)
    # print(tweets)
    df = pd.merge(
        tweets[tweets.columns],
        references[references.columns],
        left_on='quoted_id',
        right_index=True,
        suffixes=('_original', '_reference')
    )
    # df["quoted_id"] = df["quoted_id"].astype(int)
    print(df["quoted_id"].dtype)
    return df, df[df['author_id_original'] == df['author_id_reference']].sort_values('id')

In [12]:
total, result = get_sq_tweets_of_page(store_path, 'page1')
result

float64


Unnamed: 0,author_id_original,created_at_original,id,lang_original,public_metrics_like_count_original,public_metrics_quote_count_original,public_metrics_reply_count_original,public_metrics_retweet_count_original,quoted_id,replied_to_id,text_original,author_id_reference,created_at_reference,lang_reference,public_metrics_like_count_reference,public_metrics_quote_count_reference,public_metrics_reply_count_reference,public_metrics_retweet_count_reference,text_reference
366,406132851,2019-05-24 10:26:22+00:00,1131869040006221824,en,11,2,5,0,1.131562e+18,,Hi just want to clarify this CC question. This...,406132851,2019-05-23 14:07:01+00:00,tl,15,1,3,0,tara momol sabay usap kung bakit di pa tayo na...
345,90906432,2019-08-05 03:53:59+00:00,1158224606131625984,en,1,0,0,0,1.158208e+18,,I apologize to @Rialisms @RonToye @marchimark ...,90906432,2019-08-05 02:49:56+00:00,en,3,1,1,0,Ok I’m sorry for ranting but here it goes &amp...
333,1011662484913246213,2019-09-24 17:14:17+00:00,1176545401576382464,en,0,0,0,0,1.17639e+18,,@ThomasSanders i've been harassed several time...,1011662484913246213,2019-09-24 06:55:04+00:00,en,0,1,0,0,@ThomasSanders can u please confirm or deny wh...
281,59731401,2020-06-01 09:57:17+00:00,1267394780272582657,en,103,0,5,19,1.267376e+18,,In the interest of transparency (because let's...,59731401,2020-06-01 08:41:44+00:00,en,825,9,16,284,Police officers start placing a black man into...
262,881192533056827392,2020-06-29 20:11:38+00:00,1277696249097883650,en,68,0,8,3,1.277695e+18,1.277695e+18,fuck it\n\n@MikeZSez is being harassed for bei...,881192533056827392,2020-06-29 20:07:56+00:00,en,34,1,2,4,@trisleonidas https://t.co/O1hV4pFs2T\n\nme: i...
220,343109379,2020-09-05 02:35:06+00:00,1302072735548481537,en,1,0,0,1,1.302071e+18,,Obviously I would’ve preferred a more ecologic...,343109379,2020-09-05 02:30:08+00:00,en,0,1,0,0,Then I got this answer to my tweet “Hello Hect...
200,1285787885942329346,2020-12-05 10:25:36+00:00,1335168439225298944,en,1,0,0,0,1.335168e+18,,“hey don’t say i like killing children that’s ...,1285787885942329346,2020-12-05 10:24:41+00:00,en,0,1,0,0,harassing someone for their religion and ethni...
33,1378503060893278212,2022-04-09 19:59:55+00:00,1512883025234718720,en,5,1,0,0,1.512652e+18,,If you're upset at me about the fey/raven situ...,1378503060893278212,2022-04-09 04:40:06+00:00,en,19,1,2,0,I suppose I should address the whole raven sit...


In [13]:
# _, t, r = retrieve_dataframes(store_path, 'page1')
# t = t[t["public_metrics_reply_count"] >= 5]
# t.sort_values('quoted_id')

In [24]:
from TwitterApi.tweet_api import fix_column


for page in tqdm(os.listdir(store_path)):
    _, self_quote_tweets = get_sq_tweets_of_page(store_path, page)
    
    sq_path = f'{store_path}/{page}'
    replies_path = f'{sq_path}/replies'
    if not os.path.exists(replies_path):
            mkdir(replies_path)

    self_quote_tweets = fix_column(self_quote_tweets)
    
    self_quote_tweets.to_excel(f'{sq_path}/self_quote_tweets.xlsx', index=False)
    print(f'Extracted sq tweets of {page}')

    conversation_ids = self_quote_tweets['quoted_id'].tolist()    
    for conv_id in conversation_ids:
        # print(int(conv_id))
        # continue
        reps = get_all_replies(client, replies_path, int(conv_id))
        print(f'Retrieved replies of {int(conv_id)} with shape {reps.shape}')
        sleep(2)

  0%|          | 0/2 [00:00<?, ?it/s]

float64
Extracted sq tweets of page1


1it [00:01,  1.54s/it]


File saved at data/I_apologize_harassed_isquote_-isretweet_langen/page1/replies/1131562176773275648/replies.txt
Retrieved replies of 1131562176773275648 with shape (5, 2)


1it [00:01,  1.45s/it]


File saved at data/I_apologize_harassed_isquote_-isretweet_langen/page1/replies/1158208486599200768/replies.txt
Retrieved replies of 1158208486599200768 with shape (5, 2)


1it [00:01,  1.35s/it]

respond.data is none
len(out.index) is 0
Retrieved replies of 1176389571543625728 with shape (0, 0)



1it [00:01,  1.62s/it]


File saved at data/I_apologize_harassed_isquote_-isretweet_langen/page1/replies/1267375766129438720/replies.txt
Retrieved replies of 1267375766129438720 with shape (46, 2)


1it [00:00,  2.83it/s]

respond.data is none
len(out.index) is 0
Retrieved replies of 1277695316716736512 with shape (0, 0)



1it [00:00,  2.98it/s]

respond.data is none
len(out.index) is 0
Retrieved replies of 1302071487084593152 with shape (0, 0)



1it [00:01,  1.15s/it]

respond.data is none
len(out.index) is 0
Retrieved replies of 1335168208119230464 with shape (0, 0)



1it [00:01,  1.41s/it]


File saved at data/I_apologize_harassed_isquote_-isretweet_langen/page1/replies/1512651545271103488/replies.txt
Retrieved replies of 1512651545271103488 with shape (13, 2)


100%|██████████| 2/2 [00:25<00:00, 12.84s/it]

int64
Extracted sq tweets of page2





In [16]:
t = 1.1315621767732756e+18
type(t)

float

## Get Other Tweets (in specific time)

In [17]:
# Constant variables
PREV_DAYS = 3
POST_DAYS = 3

In [18]:
def subtract_dates(date, number):
    return 

def get_prev_and_post_days(tweet):
    created_at = tweet["created_at"]
    date = str(created_at.split()[0])
    prev_time = f'{subtract_dates(date, 3)}T00:00:00Z'
    post_time = f'{subtract_dates(date, -3)}T00:00:00Z'
    return prev_time, post_time

In [19]:
for tweet in tweets:
    prev_day, post_day = get_prev_and_post_days(tweet)
    
    query = f"from:{tweet[]}"
    client.get_all_tweets(
        query=query,
        start_time=START_TIME,
        end_time=END_TIME,
        store_path=store_path
    )    



SyntaxError: invalid syntax (<fstring>, line 1)

## Sentiment Analysis

In [None]:
replies = pd.read_excel('data/I_apologize_harassed_isquote_-isretweet_langen/page1/1512651545271103488/replies.xlsx')
replies.shape

(13, 2)

In [None]:
pd.set_option('display.float_format',  '{:,.2f}'.format)

In [None]:
replies = replies.assign(sentiment=None)
sentences = replies['text'].tolist()
for i, sentence in enumerate(sentences):
    res = TextBlob(sentence)
    mood = res.sentiment.polarity
    replies.loc[replies['text'] == sentence, 'sentiment'] = mood

In [None]:
replies.to_csv('testtt.csv')

In [None]:
sid_obj = SentimentIntensityAnalyzer()