In [6]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [4]:
# Sentiment computation
def get_sentiment(df_raw, query):

    sa = SentimentIntensityAnalyzer()

    # Apply a selection
    df = df_raw.query(query).copy(deep=True)

    df['rd_sentiments'] = df.Title.apply(lambda x: sa.polarity_scores(x))
    df['rd_neg'] = df.rd_sentiments.apply(lambda x: x['neg'])
    df['rd_pos'] = df.rd_sentiments.apply(lambda x: x['pos'])
    df['rd_neu'] = df.rd_sentiments.apply(lambda x: x['neu'])
    df['rd_comp'] = df.rd_sentiments.apply(lambda x: x['compound'])

    # Apply selection
    df = df.query('rd_comp != 0')

    # Compute extra vars
    df['rd_RelDiffPosNeg'] = (df.rd_pos-df.rd_neg)/(df.rd_pos+df.rd_neg)
    df['rd_comp_quintile'] = pd.qcut(df['rd_comp'],q=[0, .2, .4, .6, .8, 1],labels=[1, 2, 3, 4, 5])
    
    return df

In [2]:
# Read the raw data frame
#df_rd_raw = pd.read_csv('df_rd_row.csv')

# Apply sentiment analysis and selection
df_rd = get_sentiment(df_rd_raw, 'Ups > 100')

# Store the dataframe
df_rd.to_csv('df_rd_sentiment.csv', index=False)

In [11]:
df_rd.to_csv('df_rd_sentiment.csv', index=False)

In [9]:
df_rd

Unnamed: 0,Author,Title,Selftext,Created Utc,Ups,Upvote Ratio,Num Comments,Total Awards,Date,rd_sentiments,rd_neg,rd_pos,rd_neu,rd_comp,rd_RelDiffPosNeg,rd_comp_quintile
2,dazaraf,The US dollar has lost 23% of its purchasing p...,,1.616084e+09,1549,0.98,150,7,2021-03-18,"{'neg': 0.046, 'neu': 0.908, 'pos': 0.045, 'co...",0.046,0.045,0.908,-0.0164,-0.010989,2
3,ConalR,"Help please, 20BCH stuck in limbo on local.bit...",I'll start by sending my regards to the team a...,1.616102e+09,116,0.94,60,0,2021-03-18,"{'neg': 0.167, 'neu': 0.417, 'pos': 0.417, 'co...",0.167,0.417,0.417,0.4588,0.428082,4
5,Sanchik_Ponchik,"First JP Morgan talks shit about BTC, and then...",,1.616084e+09,4555,0.97,271,18,2021-03-18,"{'neg': 0.216, 'neu': 0.699, 'pos': 0.084, 'co...",0.216,0.084,0.699,-0.6081,-0.440000,1
6,atrueretard,Satoshi Forest - some rich bitcoiner bought 8 ...,,1.615949e+09,3581,0.98,232,35,2021-03-17,"{'neg': 0.0, 'neu': 0.769, 'pos': 0.231, 'comp...",0.000,0.231,0.769,0.5574,1.000000,4
7,vondoe666,Morgan Stanley becomes the first big U.S. bank...,,1.615994e+09,3861,0.97,352,11,2021-03-17,"{'neg': 0.0, 'neu': 0.865, 'pos': 0.135, 'comp...",0.000,0.135,0.865,0.3612,1.000000,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1983,adoptablockchain,Stores accepting Bitcoin. Some accept other cr...,,1.584881e+09,1394,0.93,226,1,2020-03-22,"{'neg': 0.0, 'neu': 0.451, 'pos': 0.549, 'comp...",0.000,0.549,0.451,0.8188,1.000000,5
1984,scotty321,How USA government works: $1 trillion bailout ...,,1.584887e+09,335,0.95,51,0,2020-03-22,"{'neg': 0.154, 'neu': 0.79, 'pos': 0.057, 'com...",0.154,0.057,0.790,-0.4199,-0.459716,1
1991,333929,Reject the anti-encrytpion bill. They are tryi...,,1.584665e+09,976,0.99,70,1,2020-03-20,"{'neg': 0.252, 'neu': 0.652, 'pos': 0.096, 'co...",0.252,0.096,0.652,-0.6723,-0.448276,1
1993,265,Reject the anti-encrytpion bill. They are tryi...,,1.584647e+09,375,0.98,13,0,2020-03-19,"{'neg': 0.252, 'neu': 0.652, 'pos': 0.096, 'co...",0.252,0.096,0.652,-0.6723,-0.448276,1


In [5]:
df_rd = get_sentiment(df_rd_raw, 'Ups > 100')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rd_RelDiffPosNeg'] = (df.rd_pos-df.rd_neg)/(df.rd_pos+df.rd_neg)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rd_comp_quintile'] = pd.qcut(df['rd_comp'],q=[0, .2, .4, .6, .8, 1],labels=[1, 2, 3, 4, 5])


In [None]:
def daily_aggregation(df_rd):
    # Compute sentiment variables aggregated over a day
    df_agg = df_rd.groupby('Date').agg(\
                                        rd_neg_sum = ('rd_neg', 'sum'),\
                                        rd_neg_mean = ('rd_neg', 'mean'),\
                                        rd_neg_std = ('rd_neg', 'std'),\
                                        rd_pos_sum = ('rd_pos', 'sum'),\
                                        rd_pos_mean = ('rd_pos', 'mean'),\
                                        rd_pos_std = ('rd_pos', 'std'),\
                                        rd_comp_sum = ('rd_comp', 'sum'),\
                                        rd_comp_mean = ('rd_comp', 'mean'),\
                                        rd_comp_std = ('rd_comp', 'std'),\
                                        rd_RelDiffPosNeg_sum = ('rd_RelDiffPosNeg', 'sum'),\
                                        rd_RelDiffPosNeg_mean = ('rd_RelDiffPosNeg', 'mean'),\
                                        rd_RelDiffPosNeg_std = ('rd_RelDiffPosNeg', 'std'),\
                                        rd_Ups_sum = ('Ups', 'sum'),\
                                        rd_Ups_mean = ('Ups', 'mean'),\
                                        rd_Ups_std = ('Ups', 'std'),\
                                        rd_Upvote_Ratio_sum = ('Upvote Ratio', 'sum'),\
                                        rd_Upvote_Ratio_mean = ('Upvote Ratio', 'mean'),\
                                        rd_Upvote_Ratio_std = ('Upvote Ratio', 'std'),\
                                        rd_Num_Comments_sum = ('Num Comments', 'sum'),\
                                        rd_Num_Comments_mean = ('Num Comments', 'mean'),\
                                        rd_Num_Comments_std = ('Num Comments', 'std'),\
                                        rd_Total_Awards_sum = ('Total Awards', 'sum'),\
                                        rd_Total_Awards_mean = ('Total Awards', 'mean'),\
                                        rd_Total_Awards_std = ('Total Awards', 'std'),\
                                      )

    # Compute the number of positive (compound>0) and negative (compound<0) rdeets per day

    df_agg['rd_comp_q5_count'] = df_rd.groupby('Date')['rd_comp_quintile'].apply(lambda x: (x==5).sum()).reset_index(name='rd_comp_q5_count')['rd_comp_q5_count'].to_list()
    df_agg['rd_comp_q4_count'] = df_rd.groupby('Date')['rd_comp_quintile'].apply(lambda x: (x==4).sum()).reset_index(name='rd_comp_q4_count')['rd_comp_q4_count'].to_list()
    df_agg['rd_comp_q3_count'] = df_rd.groupby('Date')['rd_comp_quintile'].apply(lambda x: (x==3).sum()).reset_index(name='rd_comp_q3_count')['rd_comp_q3_count'].to_list()
    df_agg['rd_comp_q2_count'] = df_rd.groupby('Date')['rd_comp_quintile'].apply(lambda x: (x==2).sum()).reset_index(name='rd_comp_q2_count')['rd_comp_q2_count'].to_list()
    df_agg['rd_comp_q1_count'] = df_rd.groupby('Date')['rd_comp_quintile'].apply(lambda x: (x==1).sum()).reset_index(name='rd_comp_q1_count')['rd_comp_q1_count'].to_list()

    # Reset index and reformat date
    df_agg = df_agg.reset_index()
    df_agg.Date = pd.to_datetime(df_agg.Date)

    # Compute the number of positive (compound>0) and negative (compound<0) rdeets per day
    df_agg['rd_N'] = df_rd.groupby('Date')['rd_comp'].apply(lambda x: x.count()).reset_index(name='rd_N')['rd_N'].to_list()
    df_agg['rd_N_pos'] = df_rd.groupby('Date')['rd_comp'].apply(lambda x: (x>0).sum()).reset_index(name='rd_N_pos')['rd_N_pos'].to_list()
    df_agg['rd_N_neg'] = df_rd.groupby('Date')['rd_comp'].apply(lambda x: (x<0).sum()).reset_index(name='rd_N_neg')['rd_N_neg'].to_list()

    # Add influencers
    for influencer in ['AlphaGrayWolf', 'Secret_Operative', 'npjprods']:
        df_agg[influencer] = df_rd.groupby('Date')['Author'].apply(lambda x: int(influencer in list(x))).reset_index(name=influencer)[influencer].to_list()

    # Fill missing values
    for col in [c for c in df_agg.columns if c.endswith('std')]:
        df_agg[col].fillna(0, inplace=True)

    return df_agg

In [None]:
df_agg = daily_aggregation(df_rd)

In [None]:
df_agg

In [None]:
df_agg.npjprods.sum()

In [None]:
# Store the dataframe
df_agg.to_csv('df_rd_agg.csv', index=False)

# Some analytics

In [None]:
df_rd.groupby('Date').Author.apply(lambda x: int('AlphaGrayWolf' in x))

In [None]:
df_rd.columns

In [None]:
AlphaGrayWolf
Secret_Operative
Random1DollarTip

In [None]:
df_rd.groupby('Author').Ups.agg('mean').sort_values(ascending=False).head(50)

In [None]:
df_rd.groupby('Author')['Total Awards'].agg('sum').sort_values(ascending=False).head(50)

In [None]:
df_rd_raw.query('Author == "AlphaGrayWolf"')

In [None]:
df_rd_raw[df_rd_raw.Author == '[deleted]']

In [None]:
ax = df_rd_raw['Upvote Ratio'].hist(alpha=0.3)
ax.set_yscale('log')

In [None]:
ax = df_rd_raw.Ups.hist(bins=100, alpha=0.3)
ax.set_yscale('log')

In [None]:
ax = df_rd_raw.query('Ups > 100').Ups.hist(bins=100, alpha=0.3)
ax.set_yscale('log')

In [None]:
ax = df_rd.rd_comp.hist(bins=100, alpha=0.3)
ax.set_yscale('log')

In [None]:
df_rd.query('rd_comp == 0').shape[0]/df_rd.shape[0]

In [None]:
df_rd.query('rd_comp != 0').shape[0]

In [None]:
ax = df_rd['rd_comp'].hist(bins=100, alpha=0.3)
ax.set_yscale('log')