In [1]:
import pandas as pd
import numpy as np

import functions as f

In [2]:
import importlib
importlib.reload(f)

<module 'functions' from 'functions.py'>

## Merge all panel datasets and anonymize

In [3]:
dtypes = {'userid':'str'}

panel_bios = pd.read_csv(f'data/panel_bios_2021_matched.tsv',
                         sep = "\t",
                        index_col=0,
                        dtype = dtypes)

In [4]:
attention_df = pd.read_csv(f'data/raw/attention_panel.tsv',
                         sep = "\t",
                         dtype = dtypes)

attention_df = attention_df.loc[~(attention_df.retweet_sum == "retweet_sum")]
attention_df = attention_df.astype({"retweet_sum":np.float64, 
                "likes_sum":np.float64,
                "retweet_avg":np.float64,
                "likes_avg":np.float64,
                "followers":np.float64,
                "followees":np.float64})

In [5]:
handcoded_bios = pd.read_csv(f'data/handcoded_data_2023_info_matched.tsv', 
                             sep = "\t",
                             index_col=0,
                             dtype = dtypes)

In [6]:
panel_bios = panel_bios.merge(handcoded_bios, on = "userid", how = "outer")

In [7]:
panel_bios = attention_df.drop(["handle", "display_name", "n_tweets"], axis = 1).merge(panel_bios, 
                                                                                       on = "userid", 
                                                                                       how="right")

In [8]:
# bin age
bins = np.arange(15, 80, 10)
labels = ['<26', '26-35', '36-45', '46-55', '56-65', '75+']
panel_bios['age_group'] = pd.cut(panel_bios['age'], bins=bins, labels=labels, right=False)

In [9]:
# drop extra columns
cols_drop = ['age', 'retweet_sum', 'likes_sum', 'followees', 'userid', 'handle', 'display_name', 'bio', 'state', 'county', 'race', 'party_score', 'tokens']
panel_bios = panel_bios.drop(axis = 1, columns = cols_drop)
panel_bios.rename(columns={'age_group':'age'}, inplace=True)

In [10]:
def add_noise(col, factor):
    scale = np.std(col)
    
    col = col + col.apply(lambda x: np.random.normal(scale = scale/factor))
    return(col)

In [11]:
panel_bios["retweet_avg"] = add_noise(panel_bios.retweet_avg)
panel_bios["likes_avg"] = add_noise(panel_bios.likes_avg)
panel_bios["followers"] = add_noise(panel_bios.followers)
panel_bios["n_tweets"] = add_noise(panel_bios.n_tweets)

In [12]:
# inspect noise-added data
panel_bios.head()

Unnamed: 0,retweet_avg,likes_avg,followers,voter_file_sex,n_tweets,theythem_pronouns,mixed_pronouns,hehim_pronouns,sheher_pronouns,anypronoun,...,Coder3_gender,Coder4_gender,code_count_gender,Coder1_trans,Coder2_trans,Coder3_trans,Coder4_trans,code_count_trans,n_coders,age
0,0.825361,6.610801,1833.557117,Female,867.470874,False,True,False,False,True,...,,,,,,,,,,46-55
1,0.185935,3.476215,2098.731172,Male,277.77148,False,False,False,False,False,...,,,,,,,,,,46-55
2,0.076869,2.663392,1718.645342,Male,20.272148,False,False,False,False,False,...,,,,,,,,,,36-45
3,-0.028336,1.963529,782.03264,Male,452.695085,False,False,False,False,False,...,,,,,,,,,,56-65
4,0.146673,0.280796,137.274636,Female,8.543976,False,False,False,False,False,...,,,,,,,,,,75+


Testing whether the stats turn out to be similar to previously calculated stats: everything is close, except for medians, where adding normal noise to positive values makes them get all higher, and the order of what categories have higher/lower medians does change a bit



In [13]:
f.calc_stats("pronoun_gender", panel_bios)

Unnamed: 0_level_0,mean_retweet_avg,median_retweet_avg,mean_likes_avg,median_likes_avg,mean_followers,median_followers,mean_ntweets,median_ntweets,n_users
pronoun_gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Female,0.72,0.12,7.78,2.24,1581.95,426.03,937.35,229.72,16058
Male,0.61,0.11,6.45,2.11,1877.93,405.45,1314.61,347.88,8450
Mixed_pro,0.57,0.1,5.98,2.15,1090.72,331.56,1252.07,341.82,1297
Non-binary,0.65,0.1,6.33,2.06,1408.27,287.65,1313.37,308.03,864
,0.41,0.05,3.58,0.87,1146.58,243.5,391.12,39.15,555314


In [14]:
f.calc_stats("code_gender", panel_bios)

Unnamed: 0_level_0,mean_retweet_avg,median_retweet_avg,mean_likes_avg,median_likes_avg,mean_followers,median_followers,mean_ntweets,median_ntweets,n_users
code_gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Female,0.42,0.09,3.97,1.53,1536.6,392.44,792.72,109.43,2162
Male,0.57,0.09,4.37,1.4,1668.08,351.64,892.7,114.99,2126
Mixed,0.44,0.07,4.82,1.51,883.41,277.56,812.13,119.48,396
Non-binary,0.41,0.06,3.94,2.03,675.82,415.91,1701.53,915.16,15
Not sure,0.98,0.09,10.49,1.49,1439.38,237.77,1203.49,140.92,123
,0.42,0.05,3.75,0.91,1166.98,248.92,419.3,41.31,577161


In [15]:
panel_bios.columns

Index(['retweet_avg', 'likes_avg', 'followers', 'voter_file_sex', 'n_tweets',
       'theythem_pronouns', 'mixed_pronouns', 'hehim_pronouns',
       'sheher_pronouns', 'anypronoun', 'pronouns', 'gendered_words',
       'pronoun_gender', 'word_gender', 'pronoun_word_gender', 'code_gender',
       'code_trans', 'pronoun_gender_2023', 'word_gender_2023',
       'pronoun_word_gender_2023', 'Coder1_gender', 'Coder2_gender',
       'Coder3_gender', 'Coder4_gender', 'code_count_gender', 'Coder1_trans',
       'Coder2_trans', 'Coder3_trans', 'Coder4_trans', 'code_count_trans',
       'n_coders', 'age'],
      dtype='object')

In [16]:
panel_bios.to_csv(f'data/panel_bios_anonymized.tsv', sep = "\t")

## Anonymize decahose data

In [17]:
decahose_bios = pd.read_csv(f'data/decahose_bios_2021_downsample_matched.tsv',
                            sep = "\t",
                           index_col=0)


In [18]:
cols_drop = ['userid', 'handle', 'display_name', 'bio', 'tokens']
decahose_bios = decahose_bios.drop(axis = 1, columns = cols_drop)

In [19]:
decahose_bios.head()

Unnamed: 0,theythem_pronouns,mixed_pronouns,hehim_pronouns,sheher_pronouns,anypronoun,pronouns,gendered_words,pronoun_gender,word_gender,pronoun_word_gender
0,False,False,False,False,False,[],,,,
1,False,False,False,False,False,[],,,,
2,False,False,False,False,False,[],,,,
3,False,False,False,False,False,[],,,,
4,False,False,False,False,False,[],,,,


In [20]:
decahose_bios.columns

Index(['theythem_pronouns', 'mixed_pronouns', 'hehim_pronouns',
       'sheher_pronouns', 'anypronoun', 'pronouns', 'gendered_words',
       'pronoun_gender', 'word_gender', 'pronoun_word_gender'],
      dtype='object')

In [21]:
decahose_bios.to_csv(f'data/decahose_bios_anonymized.tsv', sep = "\t")