In [43]:
import pandas as pd
import random
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

In [44]:
def turn_gender_to_nums(x):
    if x == 'male':
        return 1
    elif x =='female':
        return 0

In [45]:
def turn_age_to_nums(x):
    if x == '<=18':
        return 1
    elif x == '19-29':
        return 2
    elif x == '30-39':
        return 3
    elif x == '>=40':
        return 4

In [46]:
def turn_org_to_nums(x):
    if x == 'non-org':
        return 0
    elif x == 'is-org':
        return 1

In [47]:
def fix_has_description(x):
    if x == 9:
        return 0
    else:
        return x

In [48]:
def assigning_random_unsure_age(x):
    if x == 5:
        return random.randint(1,4)
    else:
        return x

In [49]:
def assigning_random_unsure_gender(x):
    if x == 2:
        return random.randint(0,1)
    else:
        return x

In [50]:
def output_results(country_code):
    if country_code == "US":
        df = pd.read_csv('/home/manuto/Downloads/merged_demographics (1).csv')
        df.columns = ['id', 'name', 'screen_name', 'description', 'lang', 'img_path',
       'gender_predicted', 'age_predicted', 'org_predicted', 'gender_true',
       'age_true', 'org_true', 'has_picture', 'default_picture', 'has_description',
       'real_name', 'real_picture', 'bot', 'last_active']
    elif country_code == 'MX':
        df = pd.read_excel('/home/manuto/Downloads/data_MX_users/demographics_labelling_MX_AM.xlsx')
    
    df['gender_predicted'] = df['gender_predicted'].apply(turn_gender_to_nums)
    df['age_predicted'] = df['age_predicted'].apply(turn_age_to_nums)
    df['org_predicted'] = df['org_predicted'].apply(turn_org_to_nums)
    df['has_description'] = df['has_description'].apply(fix_has_description)
    #Display share of bots and discard bots
    print(f"Share of bots: {df['bot'].value_counts(dropna=False, normalize=True)[1]}")
    df = df.loc[df['bot'] != 1].reset_index(drop=True)
    # Org
    print('**** ORG *****')
    print(f"Share of organizations (non-bot users): {df['org_true'].value_counts(dropna=False, normalize=True)[1]}")
    print('Precision/Recall/F1:')
    print(precision_recall_fscore_support(df['org_true'], df['org_predicted'],average='macro'))
    print('Accuracy')
    print(accuracy_score(df['org_true'], df['org_predicted']))
    # Real name/picture
    df1 = df.loc[df['org_true'] == 0].reset_index(drop=True)
    print(f"Share of real name: {df1['real_name'].value_counts(dropna=False, normalize=True)[1]}")
    print(f"Share of real picture: {df1['real_picture'].value_counts(dropna=False, normalize=True)[1]}")
    # Gender
    print('**** GENDER ****')
    print(f"Share of unsure: {df1['gender_true'].value_counts(dropna=False, normalize=True)[2]}")
    print('Precision/Recall/F1 with unsure as extra class:')
    print(precision_recall_fscore_support(df1['gender_true'], df1['gender_predicted'], average='macro'))
    print('Accuracy with unsure as extra class')
    print(accuracy_score(df1['gender_true'], df1['gender_predicted']))
    df2 = df1.loc[df1['gender_true'] != 2].reset_index(drop=True)
    df1['gender_true'] = df1['gender_true'].apply(assigning_random_unsure_gender)
    print('Precision/Recall/F1 with unsure assigned randomly between male and female:')
    print(precision_recall_fscore_support(df1['gender_true'], df1['gender_predicted'], average='macro'))
    print('Accuracy with unsure assigned randomly between male and female:')
    print(accuracy_score(df1['gender_true'], df1['gender_predicted']))
    print('Precision/Recall/F1 without unsure:')
    print(precision_recall_fscore_support(df2['gender_true'], df2['gender_predicted'], average='macro'))
    print('Accuracy without unsure:')
    print(accuracy_score(df2['gender_true'], df2['gender_predicted']))

    # Age
    print('**** AGE ****')
    print(f"Share of unsure: {df1['age_true'].value_counts(dropna=False, normalize=True)[2]}")
    print('Precision/Recall/F1 with unsure as extra class:')
    print(precision_recall_fscore_support(df1['age_true'], df1['age_predicted'], average='macro')) 
    print('Accuracy with unsure as extra class:')
    print(accuracy_score(df1['age_true'], df1['age_predicted'])) 

    df2 = df1.loc[df1['age_true'] != 5].reset_index(drop=True)
    df1['age_true'] = df1['age_true'].apply(assigning_random_unsure_age)
    print('Precision/Recall/F1 with unsure assigned randomly between all age classes:')
    print(precision_recall_fscore_support(df1['age_true'], df1['age_predicted'], average='macro'))
    print('Accuracy with unsure assigned randomly between all age classes:')
    print(accuracy_score(df1['age_true'], df1['age_predicted']))
    print('Precision/Recall/F1 without unsure:')
    print(precision_recall_fscore_support(df2['age_true'], df2['age_predicted'], average='macro'))
    print('Accuracy without unsure:')
    print(accuracy_score(df2['age_true'], df2['age_predicted']))


In [40]:
output_results('US')

Share of bots: 0.044
**** ORG *****
Share of organizations (non-bot users): 0.15481171548117154
Precision/Recall/F1:
(0.949626168224299, 0.8218490767995719, 0.8702698511166254, None)
Accuracy
0.9414225941422594
Share of real name: 0.7821782178217822
Share of real picture: 0.7326732673267327
**** GENDER ****
Share of unsure: 0.034653465346534656
Precision/Recall/F1 with unsure as extra class:
(0.6227785959744723, 0.6473024638912489, 0.6342682304946455, None)
Accuracy with unsure as extra class
0.9356435643564357
Precision/Recall/F1 with unsure assigned randomly between male and female:
(0.959548355424644, 0.9617243760481404, 0.9602557796360058, None)
Accuracy with unsure assigned randomly between male and female:
0.9603960396039604
Precision/Recall/F1 without unsure:
(0.9679717180244829, 0.9709536958368734, 0.9690476190476189, None)
Accuracy without unsure:
0.9692307692307692
**** AGE ****
Share of unsure: 0.31683168316831684
Precision/Recall/F1 with unsure as extra class:
(0.4517670783

In [42]:
output_results('MX')

Share of bots: 0.12
**** ORG *****
Share of organizations (non-bot users): 0.03409090909090909
Precision/Recall/F1:
(0.8274509803921568, 0.8274509803921568, 0.8274509803921568, None)
Accuracy
0.9772727272727273
Share of real name: 0.7058823529411765
Share of real picture: 0.7647058823529411
**** GENDER ****
Share of unsure: 0.058823529411764705
Precision/Recall/F1 with unsure as extra class:
(0.6119733924611973, 0.65, 0.630217519106408, None)
Accuracy with unsure as extra class
0.9176470588235294
Precision/Recall/F1 with unsure assigned randomly between male and female:
(0.9650776053215078, 0.964562569213732, 0.9646863315330287, None)
Accuracy with unsure assigned randomly between male and female:
0.9647058823529412
Precision/Recall/F1 without unsure:
(0.9761904761904762, 0.975, 0.9749843652282677, None)
Accuracy without unsure:
0.975
**** AGE ****
Share of unsure: 0.36470588235294116
Precision/Recall/F1 with unsure as extra class:
(0.28199966968082907, 0.3952601201771031, 0.3173154006

In [72]:
df = pd.read_csv('/home/manuto/Downloads/merged_demographics (1).csv')
df.columns = ['id', 'name', 'screen_name', 'description', 'lang', 'img_path',
       'gender_predicted', 'age_predicted', 'org_predicted', 'gender_true',
       'age_true', 'org_true', 'has_picture', 'default_picture', 'has_description',
       'real_name', 'real_picture', 'bot', 'last_active']
df = df.loc[df['bot'] != 1]
df = df.loc[df['org_true'] == 0]
df['real_name'].value_counts(dropna=False, normalize=True)

1.0    0.782178
0.0    0.217822
Name: real_name, dtype: float64

In [81]:
df_gender_unsure = df.loc[df['gender_true']==2]
df_gender_unsure['real_name'].value_counts(normalize=True, dropna=False)

0.0    1.0
Name: real_name, dtype: float64

In [82]:
df_gender_unsure['real_picture'].value_counts(normalize=True, dropna=False)

0.0    1.0
Name: real_picture, dtype: float64

In [83]:
df_age_unsure = df.loc[df['age_true']==5]
df_age_unsure['real_name'].value_counts(normalize=True)

1.0    0.714286
0.0    0.285714
Name: real_name, dtype: float64

In [75]:
df_age_unsure['real_picture'].value_counts(normalize=True)

0.0    0.880952
1.0    0.119048
Name: real_picture, dtype: float64

In [69]:
df.shape

(202, 19)

In [70]:
df_real_name = df.loc[df['real_name']==0].reset_index(drop=True)
df_both = df_real_name[df_real_name['real_picture']==0]
df_both.shape

(18, 19)

# US

In [8]:
df = pd.read_csv('/home/manuto/Downloads/merged_demographics (1).csv')
df.columns

Index(['user_id', 'name', 'screen_name', 'description', 'lang', 'img_path',
       'gender', 'age', 'org', 'gender_true', 'age_true', 'organization_true ',
       'has_picture', 'default_picture', 'has_description', 'real_name',
       'real_picture', 'bot', 'last_active'],
      dtype='object')

In [9]:
df = pd.read_excel('/home/manuto/Downloads/data_MX_users/demographics_labelling_MX_AM.xlsx', index=False)
df.columns

Index(['id', 'name', 'screen_name', 'description', 'lang', 'img_path',
       'gender_predicted', 'age_predicted', 'org_predicted', 'gender_true',
       'age_true', 'org_true', 'default_picture', 'has_description',
       'real_name', 'real_picture', 'bot', 'last_active'],
      dtype='object')

In [97]:
df['gender'] = df['gender'].apply(turn_gender_to_nums)
df['age'] = df['age'].apply(turn_age_to_nums)
df['org'] = df['org'].apply(turn_org_to_nums)
df['has_description'] = df['has_description'].apply(fix_has_description)

In [98]:
df['last_active'].value_counts(dropna=False, normalize=True)

2020         0.404
2018         0.216
2019         0.120
2013         0.072
suspended    0.032
2016         0.032
2012         0.028
2017         0.024
not_exist    0.024
2014         0.020
2015         0.020
2011         0.004
2010         0.004
Name: last_active, dtype: float64

In [10]:
df['bot'].value_counts(dropna=False, normalize=True)[1]

0.12

In [100]:
#discard bots
df = df.loc[df['bot'] != 1]

In [101]:
df['organization_true '].value_counts(dropna=False, normalize=True)

0    0.845188
1    0.154812
Name: organization_true , dtype: float64

In [102]:
precision_recall_fscore_support(df['organization_true '], df['org'],average='macro')

(0.949626168224299, 0.8218490767995719, 0.8702698511166254, None)

In [103]:
df1 = df.loc[df['organization_true '] == 0]
df1.shape

(202, 19)

In [104]:
df1['real_name'].value_counts(dropna=False,normalize=True)

1.0    0.782178
0.0    0.217822
Name: real_name, dtype: float64

In [105]:
df1['real_picture'].value_counts(dropna=False, normalize=True)

1.0    0.732673
0.0    0.267327
Name: real_picture, dtype: float64

### Gender

In [106]:
df1['gender_true'].value_counts(dropna=False, normalize=True)

0    0.529703
1    0.435644
2    0.034653
Name: gender_true, dtype: float64

In [107]:
df2 = df1.loc[df1['gender_true']!=2]
df2['gender_true'].shape

(195,)

In [109]:
precision_recall_fscore_support(df1['gender_true'], df1['gender'], average='micro')

(0.9356435643564357, 0.9356435643564357, 0.9356435643564357, None)

In [54]:
precision_recall_fscore_support(df2['gender_true'], df2['gender'], average='macro')

(0.9679717180244829, 0.9709536958368734, 0.9690476190476189, None)

In [55]:
df1['age_true'].value_counts(dropna=False)

2    64
5    42
3    38
1    34
4    24
Name: age_true, dtype: int64

### Age

In [56]:
df1['age_true'].value_counts(dropna=False, normalize=True)

2    0.316832
5    0.207921
3    0.188119
1    0.168317
4    0.118812
Name: age_true, dtype: float64

In [None]:
df3 = df1.loc[df1['age_true']!=5]
df3.shape

In [50]:
precision_recall_fscore_support(df3['age_true'], df3['age'], average='macro')

(0.7194525482785095, 0.7291021671826625, 0.7235945400348953, None)

In [14]:
df['default_picture'].value_counts(normalize=True)

0    0.956
1    0.044
Name: default_picture, dtype: float64

In [15]:
df['has_description'].value_counts(normalize=True)

1    0.744
0    0.256
Name: has_description, dtype: float64

In [16]:
df['gender_true'].value_counts(dropna=False, normalize=True)

2    0.368
0    0.328
1    0.304
Name: gender_true, dtype: float64

In [17]:
df['age_true'].value_counts(dropna=False, normalize=True)

5    0.404
2    0.244
1    0.132
3    0.116
4    0.104
Name: age_true, dtype: float64

In [36]:
df['organization_true '].value_counts(normalize=True, dropna=False)

0    0.904
1    0.096
Name: organization_true , dtype: float64

In [35]:
df['org'].value_counts(dropna=False)

0    225
1     25
Name: org, dtype: int64

In [39]:
df1 = df
print(df1.shape)
df1 = df1.loc[df1['gender_true'] != 2]
print(df1.shape)
df1 = df1.loc[df1['age_true'] != 5]
df1 = df1.reset_index(drop=True)
print(df1.shape)

(250, 15)
(250, 15)
(250, 15)


In [40]:
print('Precision, Recall, F1')
print('***Gender***')
precision_recall_fscore_support(df1['gender_true'], df1['gender'], average='macro')

Precision, Recall, F1
***Gender***


(0.6892177589852009, 0.689801426643532, 0.6879201075475322, None)

In [41]:
precision_recall_fscore_support(df1['age_true'], df1['age'], average='macro')

(0.5977213696713503, 0.6103625541125541, 0.6001684775911158, None)

In [42]:
precision_recall_fscore_support(df1['organization_true '], df1['org'], average='macro')

(0.9133333333333333, 0.9286504424778761, 0.9208108964206525, None)

In [45]:
df['age_true'] = df['age_true'].apply(assigning_random_unsure_age)
df['gender_true'] = df['gender_true'].apply(assigning_random_unsure_gender)

In [46]:
df.shape

(250, 15)

In [47]:
precision_recall_fscore_support(df['gender_true'], df['gender'], average='macro')

(0.6892177589852009, 0.689801426643532, 0.6879201075475322, None)

In [48]:
precision_recall_fscore_support(df['age_true'], df['age'], average='macro')

(0.5977213696713503, 0.6103625541125541, 0.6001684775911158, None)

In [50]:
df.loc[(df['organization_true '] == df['org'])].shape[0]

243

In [51]:
df1.loc[(df1['organization_true '] == df1['org'])].shape[0] 

243

In [52]:
df1.shape

(250, 15)

In [53]:
df1.head()

Unnamed: 0,user_id,name,screen_name,description,lang,img_path,gender,age,org,gender_true,age_true,organization_true,has_picture,default_picture,has_description
0,10000742.0,pwnerast,pwnerast,Bastard-coated bastard with bastard filling.,en,/scratch/spf248/twitter/data/classification/US...,1,1,0,1,3,0,1,0,1
1,10021742.0,John V. Smith,JohnVSmith,Editor at Moody’s Analytics,en,/scratch/spf248/twitter/data/classification/US...,1,4,0,1,4,0,1,0,1
2,10030272.0,Sarah,SarahInMI,Princess of almost everything.,en,/scratch/spf248/twitter/data/classification/US...,0,3,0,1,3,0,1,0,1
3,100030291.0,لessica,Jessiphor,Verified Beliebers My huge inspiration follow...,en,/scratch/spf248/twitter/data/classification/US...,0,2,0,1,2,0,1,0,1
4,100058276.0,Food Lover,FoodPornDiary,I take photos of food on my phone and post it....,en,/scratch/spf248/twitter/data/classification/US...,0,2,0,0,2,0,1,0,1


# MX

In [82]:
df = pd.read_excel('/home/manuto/Downloads/data_MX_users/demographics_labelling_MX_AM.xlsx', index=False)
df.head()

Unnamed: 0.1,Unnamed: 0,id,name,screen_name,description,lang,img_path,gender_predicted,age_predicted,org_predicted,gender_true,age_true,org_true,default_picture,has_description,real_name,real_picture,bot,last_active
0,0,1039988180714512387,Edgar Millán,edgarmusical,Cantante y músico https://t.co/PPPQQrURCc h...,es,/scratch/spf248/twitter/data/classification/MX...,male,30-39,non-org,1.0,4.0,0.0,1.0,1,1.0,1.0,,2020
1,1,1025551358,Alejandra Acosta,aleeacostarios,LTS | Gerente Compras FGA MX,es,/scratch/spf248/twitter/data/classification/MX...,female,19-29,non-org,0.0,3.0,0.0,1.0,1,1.0,1.0,,2020
2,2,1007265070961721344,Yajayra Isabel Rios Tafolla.,YajayraIsabel,,es,/scratch/spf248/twitter/data/classification/MX...,female,19-29,non-org,0.0,5.0,0.0,1.0,0,1.0,0.0,,2018
3,3,1005151027,Mycke Acevedo,donpuri49,,es,/scratch/spf248/twitter/data/classification/MX...,male,19-29,non-org,1.0,2.0,0.0,1.0,0,1.0,1.0,,2016
4,4,1018154064,DJ FUNNY DEAD,DJFUNNYDEAD,,es,/scratch/spf248/twitter/data/classification/MX...,male,30-39,non-org,2.0,2.0,0.0,1.0,0,0.0,0.0,,2012


In [83]:
#discard bots
df = df.loc[df['bot'] != 1].reset_index(drop=True)

In [84]:
df['gender_predicted'] = df['gender_predicted'].apply(turn_gender_to_nums)
df['age_predicted'] = df['age_predicted'].apply(turn_age_to_nums)
df['org_predicted'] = df['org_predicted'].apply(turn_org_to_nums)
df['has_description'] = df['has_description'].apply(fix_has_description)

In [85]:
df['last_active'].value_counts(dropna=False)

2020                 31
2019                 15
2018                 15
2013                 13
2014                  3
2012                  3
2016                  3
Account Protected     2
2015                  2
deleted               1
Name: last_active, dtype: int64

In [86]:
df.shape

(88, 19)

In [87]:
df['bot'].value_counts(dropna=False)

NaN    88
Name: bot, dtype: int64

In [15]:
#discard bots
df = df.loc[df['bot'] != 1].reset_index(drop=True)

## Org

In [89]:
df['org_true'].value_counts(dropna=False, normalize=True)

0.0    0.965909
1.0    0.034091
Name: org_true, dtype: float64

In [None]:
precision_recall_fscore_support(df['org_true'], df['org_predicted'],average='macro')

## Gender

In [19]:
df1 = df.loc[df['org_true'] == 0].reset_index(drop=True)
df1.shape

(85, 19)

In [90]:
df1['gender_true'].value_counts(dropna=False, normalize=True)

0    0.529703
1    0.435644
2    0.034653
Name: gender_true, dtype: float64

In [91]:
df1['age_true'].value_counts(dropna=False, normalize=True)

2    0.316832
5    0.207921
3    0.188119
1    0.168317
4    0.118812
Name: age_true, dtype: float64

In [21]:
df1['real_name'].value_counts(dropna=False, normalize=True)

1.0    0.705882
0.0    0.294118
Name: real_name, dtype: float64

In [22]:
df1['real_picture'].value_counts(dropna=False, normalize=True)

1.0    0.764706
0.0    0.235294
Name: real_picture, dtype: float64

In [25]:
df1.shape

(85, 19)

In [23]:
df2 = df1.loc[df1['gender_true']!=2].reset_index(drop=True)
df2['gender_true'].shape

(80,)

In [27]:
precision_recall_fscore_support(df1['gender_true'], df1['gender_predicted'], average='macro')

  _warn_prf(average, modifier, msg_start, len(result))


(0.6119733924611973, 0.65, 0.630217519106408, None)

In [28]:
precision_recall_fscore_support(df2['gender_true'], df2['gender_predicted'], average='macro')

(0.9761904761904762, 0.975, 0.9749843652282677, None)

## Age

In [26]:
df3 = df1.loc[df1['age_true']!=5].reset_index(drop=True)
df3.shape

(69, 19)

In [29]:
precision_recall_fscore_support(df1['age_true'], df1['age_predicted'], average='macro')

(0.28199966968082907, 0.3952601201771031, 0.31731540064873404, None)