In [1]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from collections import Counter

In [2]:

def daily_aggregation(df_tw, influencers=('MrBeast', 'theestallion', 'elonmusk'), windows=(5, 10)):
    """Aggregate tweet-level sentiment and engagement data into one row per day.

    Parameters
    ----------
    df_tw : pandas.DataFrame
        One row per tweet. The columns read here are ``Date``, ``tw_neg``,
        ``tw_pos``, ``tw_comp``, ``tw_RelDiffPosNeg``, ``tw_comp_quintile``,
        ``tw_topic``, ``Username``, ``Like_Count``, ``Reply_Count``,
        ``Retweet_Count`` and ``Quote_Count``. The frame is not modified.
    influencers : iterable of str, optional
        Usernames for which a 0/1 column (named after the user) flags the
        days on which that user has at least one tweet.
    windows : iterable of int, optional
        Rolling window lengths, in rows (i.e. days present in the data), used
        for the ``<var>_last<window>_mean`` / ``_std`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per date with ``Date`` parsed by ``pd.to_datetime``. NaN values
        in columns whose name ends in ``std`` are replaced by 0; any row that
        still contains a NaN is dropped afterwards, which removes the leading
        days for which a rolling mean is not yet defined.
    """
    day = df_tw['Date']              # grouping key shared by every per-day statistic
    by_day = df_tw.groupby('Date')

    # Sum / mean / std per day for each sentiment score and engagement counter.
    # Keys are the output prefixes, values the source columns in df_tw.
    sources = {
        'tw_neg': 'tw_neg',
        'tw_pos': 'tw_pos',
        'tw_comp': 'tw_comp',
        'tw_RelDiffPosNeg': 'tw_RelDiffPosNeg',
        'tw_Like_Count': 'Like_Count',
        'tw_Reply_Count': 'Reply_Count',
        'tw_Retweet_Count': 'Retweet_Count',
        'tw_Quote_Count': 'Quote_Count',
    }
    named_aggs = {
        f'{prefix}_{stat}': (column, stat)
        for prefix, column in sources.items()
        for stat in ('sum', 'mean', 'std')
    }
    df_agg = by_day.agg(**named_aggs)

    # Every Series assigned below is indexed by Date, so pandas aligns it with
    # df_agg by label instead of relying on the row order of two group-bys.

    # Number of tweets per day falling into each compound-score quintile
    for quintile in (5, 4, 3, 2, 1):
        in_quintile = df_tw['tw_comp_quintile'].eq(quintile)
        df_agg[f'tw_comp_q{quintile}_count'] = in_quintile.groupby(day).sum()

    # Number of tweets per day: scored, positive (compound > 0), negative (compound < 0)
    df_agg['tw_N'] = by_day['tw_comp'].count()
    df_agg['tw_N_pos'] = df_tw['tw_comp'].gt(0).groupby(day).sum()
    df_agg['tw_N_neg'] = df_tw['tw_comp'].lt(0).groupby(day).sum()

    # Share of the day's tweets assigned to topic ids 0, 1 and 2
    # (reported as topic1, topic2 and topic3)
    for topic_id in range(3):
        on_topic = df_tw['tw_topic'].eq(topic_id)
        df_agg[f'tw_topic{topic_id + 1}_freq'] = on_topic.groupby(day).mean()

    # 0/1 flag per influencer: did this user tweet on that day?
    for influencer in influencers:
        tweeted = df_tw['Username'].eq(influencer)
        df_agg[influencer] = tweeted.groupby(day).any().astype('int64')

    # Turn the Date index into a regular, parsed column
    df_agg = df_agg.reset_index()
    df_agg['Date'] = pd.to_datetime(df_agg['Date'])

    # Rolling mean / std of every daily sum and of the tweet counts
    rolling_sources = [name for name in named_aggs if name.endswith('_sum')]
    rolling_sources += ['tw_N', 'tw_N_pos', 'tw_N_neg']
    for var in rolling_sources:
        for window in windows:
            rolled = df_agg[var].rolling(window)
            df_agg[f'{var}_last{window}_mean'] = rolled.mean()
            df_agg[f'{var}_last{window}_std'] = rolled.std()

    # A std is undefined (NaN) when only one observation is available; treat it as 0.
    # Assign the result back: a chained ``df[col].fillna(..., inplace=True)`` does not
    # update the frame under pandas Copy-on-Write.
    std_columns = [col for col in df_agg.columns if col.endswith('std')]
    df_agg[std_columns] = df_agg[std_columns].fillna(0)

    # Drop the rows that still contain missing values (e.g. incomplete rolling windows)
    return df_agg.dropna()

In [3]:
# Load the tweet-level input table
df_tw = pd.read_csv(filepath_or_buffer='df_tw_topic.csv')

# Collapse it to one row per day
df_agg = daily_aggregation(df_tw)

# Persist the daily table without the integer row index
df_agg.to_csv(path_or_buf='df_tw_agg.csv', index=False)

In [4]:
# Display the aggregated per-day frame returned by daily_aggregation
df_agg

Unnamed: 0,Date,tw_neg_sum,tw_neg_mean,tw_neg_std,tw_pos_sum,tw_pos_mean,tw_pos_std,tw_comp_sum,tw_comp_mean,tw_comp_std,...,tw_N_last10_mean,tw_N_last10_std,tw_N_pos_last5_mean,tw_N_pos_last5_std,tw_N_pos_last10_mean,tw_N_pos_last10_std,tw_N_neg_last5_mean,tw_N_neg_last5_std,tw_N_neg_last10_mean,tw_N_neg_last10_std
9,2020-04-10,2.912,0.080889,0.098560,4.720,0.131111,0.107877,5.2206,0.145017,0.523041,...,47.6,9.856752,32.0,6.670832,31.2,7.067924,16.8,3.563706,16.4,4.926121
10,2020-04-11,2.918,0.100621,0.093243,3.338,0.115103,0.102631,2.4125,0.083190,0.534318,...,45.0,11.045361,28.6,9.235800,28.6,7.026932,15.2,3.563706,16.4,4.926121
11,2020-04-12,3.263,0.125500,0.102902,2.268,0.087231,0.101019,0.2637,0.010142,0.543598,...,41.6,11.147496,23.4,9.235800,26.5,8.263037,14.0,2.915476,15.1,3.665151
12,2020-04-13,3.392,0.067840,0.082343,6.020,0.120400,0.104220,6.0295,0.120590,0.527863,...,42.1,11.425605,23.8,9.576012,27.0,8.379870,15.4,3.361547,15.1,3.665151
13,2020-04-14,1.435,0.038784,0.071122,5.517,0.149108,0.076945,15.4800,0.418378,0.371249,...,42.4,11.226952,23.0,8.396428,27.9,8.385835,12.6,5.029911,14.5,4.743416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,2021-03-10,22.571,0.077034,0.102690,41.792,0.142635,0.113259,56.8957,0.194183,0.490530,...,243.3,49.225219,173.4,46.247162,171.4,35.189013,69.0,17.507141,71.9,16.508920
344,2021-03-11,13.141,0.062280,0.089620,34.935,0.165569,0.119601,55.6715,0.263846,0.468868,...,233.2,43.603771,182.4,33.087762,166.0,33.025243,70.2,16.115210,67.2,12.576875
345,2021-03-12,19.078,0.070140,0.102356,38.506,0.141566,0.117332,51.4358,0.189102,0.483942,...,236.7,45.313844,192.4,23.125743,167.7,33.852786,75.8,13.773162,69.0,13.333333
346,2021-03-13,37.238,0.080777,0.112428,73.512,0.159462,0.129344,95.0547,0.206192,0.501920,...,258.2,84.381146,213.2,57.456070,181.4,55.887188,92.8,36.099861,76.8,29.835847


# Backup

In [5]:
# Number of days on which at least one tweet was posted by 'elonmusk'
df_tw['Username'].eq('elonmusk').groupby(df_tw['Date']).any().sum()

6

In [6]:
# Collect, for every day, the list of usernames that tweeted on it
df_tw.groupby('Date')['Username'].agg(list)

Date
2020-04-01    [APompliano, BTC_JackSparrow, yassineARK, Cred...
2020-04-02    [mjdsouza2, jamaljsr, RookieXBT, Schuldensuehn...
2020-04-03    [CamiRusso, BVBTC, LiquidityB, DTAPCAP, voiceo...
2020-04-04    [PsychedelicBart, HsakaTrades, LomahCrypto, al...
2020-04-05    [TylerDurden, rogerkver, CryptoCred, verysmall...
                                    ...                        
2021-03-10    [iamjosephyoung, sunchoi69, AlexSaundersAU, ia...
2021-03-11    [ericbolling, VentureCoinist, MMCrypto, hasufl...
2021-03-12    [DavidHundeyin, MrOdanz, TimDraper, markjeffre...
2021-03-13    [alifarhat79, loopringorg, 1MarkMoss, AltcoinD...
2021-03-14    [fillbeforeshill, BTC_Archive, Glimmerycoin, g...
Name: Username, Length: 348, dtype: object

In [7]:
# List the columns of the tweet-level frame; the frame read from
# 'df_tw_topic.csv' is named df_tw (no df_tw_raw is defined in this notebook)
df_tw.columns

In [None]:
# Ten users with the largest total retweet count
df_tw.groupby('Username')['Retweet_Count'].sum().sort_values(ascending=False).iloc[:10]

In [None]:
# Ten users with the largest average retweet count per tweet
df_tw.groupby('Username')['Retweet_Count'].mean().sort_values(ascending=False).iloc[:10]

In [None]:
# Ten users with the largest total reply count
df_tw.groupby('Username')['Reply_Count'].sum().sort_values(ascending=False).iloc[:10]