# Loading Data

In [1]:
import pandas as pd
import glob
import pandas as pd
import ast
from tqdm import tqdm

df = pd.read_csv('/Users/sismetanin/Downloads/concated_twitter_v3.csv.gz', encoding='UTF-8', 
                 lineterminator='\n', compression='gzip')

In [2]:
df = df.drop_duplicates(subset=['id_str'])
df = df.dropna(subset=['user_name', 'label'])

# Classifying Gender

In [4]:
twitter_names = [user_name.lower().strip() for user_name in tqdm(df.drop_duplicates(subset=['user_id_str'])['user_name'])]

100%|██████████| 2815374/2815374 [00:01<00:00, 1804334.58it/s]


In [5]:
from joblib import dump, load

clf_gender = load('VkGenderLogit.joblib') 

In [6]:
import numpy as np

chunks = [name.lower() for name in tqdm(df['user_name'])]
chunks = list(set(chunks))
chunks = np.array_split(chunks, 200)

100%|██████████| 17515729/17515729 [00:11<00:00, 1462871.80it/s]


In [7]:
name_to_gender = {}

for chunk in tqdm(chunks):
    genders = clf_gender.predict(chunk)
    for i in range(len(genders)):
        name_to_gender[chunk[i]]=genders[i]

100%|██████████| 200/200 [02:50<00:00,  1.17it/s]


In [8]:
def get_gender(name):
    name = name.lower()
    if name_to_gender[name]==1:
        return 'F'
    else:
        return 'M'

df['gender'] = [get_gender(name) for name in tqdm(df['user_name'])]

100%|██████████| 17515729/17515729 [00:33<00:00, 518625.41it/s]


In [10]:
print('Total tweets', len(df))
print('Total users', len(df['user_id_str'].unique()))
print('Tweets per user', len(df)/len(df['user_id_str'].unique()))

Total tweets 17515729
Total users 2815374
Tweets per user 6.221457255767795


In [11]:
df.to_csv('tweets-ru-oswb-labeled.csv.gz', compression='gzip')

# Preprocessing Data

In [12]:
# mapping sentiment labels to interpretable classes
mapper = {
    'LABEL_0': 'negative', 
    'LABEL_1': 'neutral', 
    'LABEL_2': 'positive', 
    'LABEL_3': 'skip', 
    'LABEL_4': 'speech'
}

df['sentiment'] = [mapper[label] for label in df['label']]

In [15]:
df['date'] = pd.to_datetime(df['created_at'])
dates = df['date'].dt.to_pydatetime()

In [16]:
df['ymd'] = [date.strftime('%Y-%m-%d') for date in tqdm(dates)]

100%|██████████| 17515729/17515729 [01:18<00:00, 221805.37it/s]


In [17]:
df['ym'] = [date.strftime('%Y-%m') for date in tqdm(dates)]

100%|██████████| 17515729/17515729 [01:13<00:00, 239806.41it/s]


# Constructing Indices

In [24]:
import pandas as pd

df_vciom = pd.read_excel('schastie.xls', index_col=0, skiprows=1)

def get_vciom_months():
    return sorted([date.strftime('%Y-%m') for date in df_vciom.iloc[5][1:].keys()])

def get_vciom_happiness_index(months):
    vciom_index = df_vciom.iloc[5][1:].values
    vciom_index_months = [date.strftime('%Y-%m') for date in df_vciom.iloc[5][1:].keys()]
    mapping = {}
    
    
    for i in range(len(vciom_index_months)):
        mapping[vciom_index_months[i]]=vciom_index[i]
    return [mapping[month] for month in months]


def get_vciom_positive_affect_index(months):
    vciom_index = (df_vciom.iloc[0][1:].values+df_vciom.iloc[1][1:].values)/(df_vciom.iloc[0][1:].values
                                                                             +df_vciom.iloc[1][1:].values
                                                                             +df_vciom.iloc[2][1:].values
                                                                             +df_vciom.iloc[3][1:].values
                                                                             +df_vciom.iloc[4][1:].values)
    
    
    vciom_index_months = [date.strftime('%Y-%m') for date in df_vciom.iloc[5][1:].keys()]
    mapping = {}
    for i in range(len(vciom_index_months)):
        mapping[vciom_index_months[i]]=vciom_index[i]
    return [mapping[month] for month in months]

def get_vciom_negative_affect_index(months):
    vciom_index = (df_vciom.iloc[2][1:].values+df_vciom.iloc[3][1:].values)/(df_vciom.iloc[0][1:].values
                                                                             +df_vciom.iloc[1][1:].values
                                                                             +df_vciom.iloc[2][1:].values
                                                                             +df_vciom.iloc[3][1:].values
                                                                             +df_vciom.iloc[4][1:].values)
    
    vciom_index_months = [date.strftime('%Y-%m') for date in df_vciom.iloc[5][1:].keys()]
    mapping = {}
    for i in range(len(vciom_index_months)):
        mapping[vciom_index_months[i]]=vciom_index[i]
    return [mapping[month] for month in months]

In [85]:
overlapping_months = [
    '2014-04',
    '2015-11',
    '2016-04',
    '2016-11',
    '2017-07',
    '2018-03',
    '2018-07',
    '2019-11',
    '2020-04',
    '2020-05',
    '2020-06',
    '2020-07',
    '2020-08',
    '2020-09',
    '2020-10',
    '2020-11',
    '2020-12',
    '2021-03',
    '2021-04',
    '2021-05'
]

In [86]:
vciom_index = get_vciom_happiness_index(overlapping_months)
vciom_pa_index = get_vciom_positive_affect_index(overlapping_months)
vciom_na_index = get_vciom_negative_affect_index(overlapping_months)

In [87]:
from collections import defaultdict, Counter
from datetime import date
from calendar import monthrange
import numpy as np

def aggregated_sentiment(x):
    ordered = Counter(x).most_common()
    if len(ordered) == 1:
        return ordered[0][0]
    elif ordered[0][1]>ordered[1][1]:
        return ordered[0][0]
    return np.nan

    
def aggregate_sentiment_per_user(temp_df, gender=None):
    if gender is not None:
        temp_df = temp_df[temp_df['gender']==gender]
    temp_df = temp_df[temp_df['truncated']==False]
    result = temp_df.groupby(['user_id_str'])['sentiment'].agg([aggregated_sentiment])
    print('before agg', len(temp_df), 'after agg', len(result))
    user_to_sentiment = {}
    for i in range(len(result)):
        user_to_sentiment[result.index[i]] = result['aggregated_sentiment'].values[i]         
    temp_df['sentiment'] = [user_to_sentiment[userId] for userId in temp_df['user_id_str'].values]
    print('before drop', len(temp_df))
    temp_df = temp_df.dropna(subset=['sentiment'])
    print('after sentiment drop', len(temp_df))
    temp_df = temp_df.drop_duplicates(subset=['user_id_str'])
    print('after duplicates drop', len(temp_df))
    return temp_df

positive_count_m = []
negative_count_m = []
neutral_count_m = []
overall_count_m = []
speech_count_m = []
skip_count_m = []

positive_count_f = []
negative_count_f = []
neutral_count_f = []
overall_count_f = []
speech_count_f = []
skip_count_f = []

tweets_total = []
users = []
for ymd in tqdm(overlapping_months):
    temp = df[df['ym']==ymd]
    tweets_total.append(len(temp))
    users.append(df['user_id_str'].unique())
    temp = aggregate_sentiment_per_user(temp, gender='M')
    positive_count_m.append(len(temp[temp['sentiment']=='positive']))
    negative_count_m.append(len(temp[temp['sentiment']=='negative']))
    neutral_count_m.append(len(temp[temp['sentiment']=='neutral']))
    speech_count_m.append(len(temp[temp['sentiment']=='speech']))
    skip_count_m.append(len(temp[temp['sentiment']=='skip']))
    overall_count_m.append(len(temp))
    
    temp = df[df['ym']==ymd]
    temp = aggregate_sentiment_per_user(temp, gender='F')
    positive_count_f.append(len(temp[temp['sentiment']=='positive']))
    negative_count_f.append(len(temp[temp['sentiment']=='negative']))
    neutral_count_f.append(len(temp[temp['sentiment']=='neutral']))
    speech_count_f.append(len(temp[temp['sentiment']=='speech']))
    skip_count_f.append(len(temp[temp['sentiment']=='skip']))
    overall_count_f.append(len(temp))

  0%|          | 0/20 [00:00<?, ?it/s]

before agg 1196653 after agg 377281
before drop 1196653
after sentiment drop 1100659
after duplicates drop 343159
before agg 831308 after agg 292331
before drop 831308
after sentiment drop 742831
after duplicates drop 258791


  5%|▌         | 1/20 [00:21<06:47, 21.47s/it]

before agg 478242 after agg 156978
before drop 478242
after sentiment drop 439714
after duplicates drop 141923
before agg 293327 after agg 120183
before drop 293327
after sentiment drop 263212
after duplicates drop 108120


 10%|█         | 2/20 [00:29<05:15, 17.53s/it]

before agg 583026 after agg 171068
before drop 583026
after sentiment drop 538079
after duplicates drop 154205
before agg 328482 after agg 133062
before drop 328482
after sentiment drop 291658
after duplicates drop 118554


 15%|█▌        | 3/20 [00:38<04:14, 14.98s/it]

before agg 514474 after agg 172084
before drop 514474
after sentiment drop 468050
after duplicates drop 154482
before agg 312215 after agg 122226
before drop 312215
after sentiment drop 277446
after duplicates drop 108687


 20%|██        | 4/20 [00:47<03:29, 13.12s/it]

before agg 427100 after agg 127531
before drop 427100
after sentiment drop 392817
after duplicates drop 114654
before agg 173087 after agg 81553


 25%|██▌       | 5/20 [00:54<02:49, 11.31s/it]

before drop 173087
after sentiment drop 149403
after duplicates drop 72386
before agg 246532 after agg 104791
before drop 246532
after sentiment drop 218834
after duplicates drop 94145
before agg 127113 after agg 62707


 30%|███       | 6/20 [01:00<02:16,  9.73s/it]

before drop 127113
after sentiment drop 109777
after duplicates drop 55909
before agg 148616 after agg 73711
before drop 148616
after sentiment drop 128398
after duplicates drop 65934
before agg 84178 after agg 43350
before drop 84178
after sentiment drop 71925


 35%|███▌      | 7/20 [01:05<01:48,  8.31s/it]

after duplicates drop 38620
before agg 148695 after agg 67769
before drop 148695
after sentiment drop 126537
after duplicates drop 59389
before agg 79997 after agg 37254
before drop 79997
after sentiment drop 67139
after duplicates drop 32389


 40%|████      | 8/20 [01:10<01:27,  7.29s/it]

before agg 261239 after agg 94822
before drop 261239
after sentiment drop 226934
after duplicates drop 82906
before agg 145790 after agg 52410


 45%|████▌     | 9/20 [01:16<01:15,  6.88s/it]

before drop 145790
after sentiment drop 125958
after duplicates drop 45406
before agg 243310 after agg 90372
before drop 243310
after sentiment drop 210075
after duplicates drop 78779
before agg 138962 after agg 50304


 50%|█████     | 10/20 [01:22<01:05,  6.59s/it]

before drop 138962
after sentiment drop 118892
after duplicates drop 43347
before agg 268467 after agg 94596
before drop 268467
after sentiment drop 231902
after duplicates drop 82003
before agg 152775 after agg 52234


 55%|█████▌    | 11/20 [01:28<00:57,  6.37s/it]

before drop 152775
after sentiment drop 130765
after duplicates drop 44815
before agg 244178 after agg 89160
before drop 244178
after sentiment drop 209767
after duplicates drop 77170
before agg 143463 after agg 50119


 60%|██████    | 12/20 [01:34<00:49,  6.21s/it]

before drop 143463
after sentiment drop 122622
after duplicates drop 42944
before agg 295541 after agg 100690
before drop 295541
after sentiment drop 254934
after duplicates drop 86891
before agg 175344 after agg 56638


 65%|██████▌   | 13/20 [01:40<00:43,  6.24s/it]

before drop 175344
after sentiment drop 149865
after duplicates drop 48330
before agg 277026 after agg 97373
before drop 277026
after sentiment drop 237695
after duplicates drop 83907
before agg 166517 after agg 54716


 70%|███████   | 14/20 [01:47<00:37,  6.33s/it]

before drop 166517
after sentiment drop 142154
after duplicates drop 46611
before agg 292623 after agg 100148
before drop 292623
after sentiment drop 252475
after duplicates drop 86489
before agg 176835 after agg 56859


 75%|███████▌  | 15/20 [01:53<00:31,  6.39s/it]

before drop 176835
after sentiment drop 151255
after duplicates drop 48484
before agg 276318 after agg 95362
before drop 276318
after sentiment drop 238686
after duplicates drop 82530
before agg 168228 after agg 54173


 80%|████████  | 16/20 [01:59<00:25,  6.36s/it]

before drop 168228
after sentiment drop 144189
after duplicates drop 46248
before agg 303179 after agg 99920
before drop 303179
after sentiment drop 263028
after duplicates drop 86365
before agg 183198 after agg 56361


 85%|████████▌ | 17/20 [02:06<00:19,  6.42s/it]

before drop 183198
after sentiment drop 157610
after duplicates drop 47946
before agg 267055 after agg 93999
before drop 267055
after sentiment drop 229622
after duplicates drop 81157
before agg 163052 after agg 53430


 90%|█████████ | 18/20 [02:12<00:12,  6.36s/it]

before drop 163052
after sentiment drop 139196
after duplicates drop 45303
before agg 105658 after agg 52762
before drop 105658
after sentiment drop 88167
after duplicates drop 46043
before agg 63799 after agg 30507
before drop 63799
after sentiment drop 52041
after duplicates drop 26179


 95%|█████████▌| 19/20 [02:17<00:05,  5.89s/it]

before agg 237559 after agg 85837
before drop 237559
after sentiment drop 203367
after duplicates drop 74094
before agg 145842 after agg 48698


100%|██████████| 20/20 [02:23<00:00,  7.19s/it]

before drop 145842
after sentiment drop 123122
after duplicates drop 41103





In [93]:
positive_count = []
negative_count = []
neutral_count = []
overall_count = []
speech_count = []
skip_count = []
users_count = []
user_ids = []

for ymd in tqdm(overlapping_months):
    temp = df[df['ym']==ymd]
    temp = temp[temp['truncated']==False]
    temp = temp.dropna(subset=['sentiment'])
    positive_count.append(len(temp[temp['sentiment']=='positive']))
    negative_count.append(len(temp[temp['sentiment']=='negative']))
    neutral_count.append(len(temp[temp['sentiment']=='neutral']))
    speech_count.append(len(temp[temp['sentiment']=='speech']))
    skip_count.append(len(temp[temp['sentiment']=='skip']))
    overall_count.append(len(temp))
    user_ids.extend(temp['user_id_str'].unique())
    users_count.append(temp['user_id_str'].unique())

100%|██████████| 20/20 [00:32<00:00,  1.61s/it]


In [110]:
print('Total tweets', sum(overall_count))
print('Unique users', len(set(user_ids)))
print('Tweets per user', sum(overall_count)/len(set(user_ids)))

Total tweets 10869003
Unique users 1955827
Tweets per user 5.557241514714748


In [96]:
positive_count = np.array(positive_count)
negative_count = np.array(negative_count)
neutral_count = np.array(neutral_count)
overall_count = np.array(overall_count)
speech_acount = np.array(speech_count)
skip_count = np.array(skip_count)

In [97]:
positive_count_m = np.array(positive_count_m)
negative_count_m = np.array(negative_count_m)
neutral_count_m = np.array(neutral_count_m)
overall_count_m = np.array(overall_count_m)
speech_count_m = np.array(speech_count_m)
skip_count_m = np.array(skip_count_m)

In [98]:
positive_count_f = np.array(positive_count_f)
negative_count_f = np.array(negative_count_f)
neutral_count_f = np.array(neutral_count_f)
overall_count_f = np.array(overall_count_f)
speech_count_f = np.array(speech_count_f)
skip_count_f = np.array(skip_count_f)

In [115]:

from scipy.stats.stats import pearsonr, spearmanr    

m = {

    'P-N/All plain': (positive_count-negative_count)/(overall_count),
    'P-N/All w': (positive_count_m-negative_count_m)/overall_count_m*0.47+(positive_count_f-negative_count_f)/overall_count_f*0.53,
    
    'P-N/Neu plain':  (positive_count-negative_count)/(neutral_count),
    'P-N/Neu w': (positive_count_m-negative_count_m)/(neutral_count_m)*0.47+(positive_count_f-negative_count_f)/(neutral_count_f)*0.53,
     
    'P/All plain':  (positive_count)/(overall_count),
    'P/All w': positive_count_m/overall_count_m*0.47+positive_count_f/overall_count_f*0.53,
    
    'P/Neutral plain':  (positive_count)/(neutral_count),
    'P/Neutral w': positive_count_m/neutral_count_m*0.47+positive_count_f/neutral_count_f*0.53,
     
    'N/Neutral plain':  (negative_count)/(neutral_count),
    'N/Neutral w': negative_count_m/neutral_count_m*0.47+negative_count_f/neutral_count_f*0.53,
    
    'N/All plain':  (negative_count)/(overall_count),
    'N/All w': negative_count_m/overall_count_m*0.47+negative_count_f/overall_count_f*0.53,
    
    'VCIOM Net': np.array(vciom_index)/100,
    'VCIOM PA': np.array(vciom_pa_index),
    'VCIOM NA': np.array(vciom_na_index)
}

def get_corr_func(method):
    if  method=='spearman':
        return spearmanr
    elif method=='pearson':
        return pearsonr
    elif method=='kendall':
        return stats.kendalltau
    
def get_corr(m, method='spearman'):
    print(i)
    df_corr = pd.DataFrame(m)
    df_corr = pd.DataFrame({c: df_corr[c].astype(float).diff() for c in df_corr.columns}) #[2:-2]
    print(len(df_corr))
    rho = df_corr.corr(method=method)
    pval = df_corr.corr(method=lambda x, y: get_corr_func(method)(x, y)[1]) - np.eye(*rho.shape)
    p = pval.applymap(lambda x: ''.join(['*' for t in [0.01,0.05,0.1] if x<=t]))
    return rho.round(4).astype(str) + p
t = get_corr(m, 'spearman')[['VCIOM Net', 'VCIOM PA', 'VCIOM NA']]
print(t)
print(t.to_latex())

8
20
                  VCIOM Net   VCIOM PA    VCIOM NA
P-N/All plain        0.0854     0.1809      -0.115
P-N/All w           -0.1109     0.0321      0.1841
P-N/Neu plain        0.0458     0.1622     -0.0602
P-N/Neu w           -0.0933     0.0624      0.1664
P/All plain          0.0167    -0.0241     -0.1027
P/All w             0.469**   0.5177**     -0.2602
P/Neutral plain      0.0458    -0.0062     -0.1168
P/Neutral w        0.5332**    0.548**     -0.3292
N/Neutral plain     -0.1117    -0.1372       0.092
N/Neutral w           0.366     0.3012     -0.2496
N/All plain         -0.0211    -0.1497      0.0053
N/All w              0.3476     0.2602      -0.262
VCIOM Net            1.0***  0.7945***  -0.9055***
VCIOM PA          0.7945***     1.0***    -0.569**
VCIOM NA         -0.9055***   -0.569**      1.0***
\begin{tabular}{llll}
\toprule
{} &   VCIOM Net &   VCIOM PA &    VCIOM NA \\
\midrule
P-N/All plain   &      0.0854 &     0.1809 &      -0.115 \\
P-N/All w       &     -0.1109 & 