*Combine profile data, botometer and m3 demographics into one users features dataframe*

In [1]:
import os
import json
import sys
import pandas as pd
import pickle

In [2]:
# Total ids to collect
ids1 = list(pd.read_csv('data/retweeters_users.csv').user_id.values.astype('str'))
ids2 = list(pd.read_csv('data/retweeters_users_cc.csv').user_id.values.astype('str'))
ids = set(ids1 + ids2)

## Botometer

In [3]:
files = os.listdir('botometer/data')
files.remove('collected_botometer_ids.csv')
files.remove('error_botometer_ids.csv')
files.remove('nonexistent_botometer_ids.csv')

In [4]:
nonexistent = list(pd.read_csv('botometer/data/nonexistent_botometer_ids.csv', header=None)[0].values.astype('str'))
errors = list(pd.read_csv('botometer/data/error_botometer_ids.csv', header=None)[0].values.astype('str'))

botometers = []
for file in files:
    with open('botometer/data/{}'.format(file)) as json_data:
        botometers += json_data.readlines()[:]

In [5]:
def process_botometer(data):
    cap_english, cap_universal = [],[]
    raw_astroturf, raw_fake_follower, raw_financial, raw_other, raw_overall, raw_self_declared, raw_spammer = [],[],[],[],[],[],[]
    user_id, id_str, screen_name = [],[], []
    for dat in data:
        try:
            botom = json.loads(dat)
        except:
            continue
        if 'raw_scores' in botom:
            cap_english.append(botom['cap']['english'])
            cap_universal.append(botom['cap']['universal'])
            raw_astroturf.append(botom['raw_scores']['universal']['astroturf'])
            raw_fake_follower.append(botom['raw_scores']['universal']['fake_follower'])
            raw_financial.append(botom['raw_scores']['universal']['financial'])
            raw_other.append(botom['raw_scores']['universal']['other'])
            raw_overall.append(botom['raw_scores']['universal']['overall'])
            raw_self_declared.append(botom['raw_scores']['universal']['self_declared'])
            raw_spammer.append(botom['raw_scores']['universal']['spammer'])

            user_id.append(str(botom['user']['user_data']['id_str']))
            id_str.append("id_" + botom['user']['user_data']['id_str'])
            screen_name.append(botom['user']['user_data']['screen_name'])
                
    return pd.DataFrame({
        "user_id":user_id, 'id': id_str, "screen_name":screen_name, "cap_english":cap_english, "cap_universal":cap_universal,
        "astroturf":raw_astroturf, "fake_follower":raw_fake_follower, "financial":raw_financial, "other":raw_other,
        "overall":raw_overall, "self_declared":raw_self_declared, "spammer":raw_spammer
    })

In [6]:
botometer_df = process_botometer(botometers).drop_duplicates('user_id')

In [7]:
print('ids to collect: ', len(ids))
print('n botometer collected: ', len(botometer_df))
print('nonexistent user ids: ', len(nonexistent))
print('other errors user ids: ', len(errors))

ids to collect:  946835
n botometer collected:  959255
nonexistent user ids:  26498
other errors user ids:  37937


## Profiles

In [8]:
files = os.listdir('data/profiles/processed')
files.remove('.DS_Store')
profiles = []
for f in files:
    with open('data/profiles/processed/{}'.format(f)) as json_data:
        print(f)
        profiles += json_data.readlines()

profiles_clean_1.json
profiles_clean_0.json
clean_retweeters_profiles_2043.json
profiles_clean_7.json
profiles_clean_6.json
profiles_clean_5.json
clean_retweeters_profiles_2041.json
profiles_clean_8.json
retweeters_profiles_2041.json_clean.json
profiles_clean_4.json
profiles_0.json
profiles_clean_3.json
profiles_clean_2.json


In [9]:
def process_profiles(data):
    ids, screen_name = [], []
    followers_count, friends_count, statuses_count, favourites_count = [], [], [], []
    location, geo, created_at = [], [], []
    protected, verified = [], []
    lang = []
    
    for d in data:
        try:
            prof = json.loads(d)
        except:
            continue
        if "errors" in prof:
            continue
        ids.append(str(prof['id']))
        screen_name.append(prof['screen_name'])
        followers_count.append(prof['followers_count'])
        friends_count.append(prof['friends_count'])
        statuses_count.append(prof['statuses_count'])
        favourites_count.append(prof['favourites_count'])
        location.append(prof['location'])
        created_at.append(prof['created_at'])
        protected.append(prof['protected'])
        verified.append(prof['verified'])
        lang.append(prof['lang'])
        
    df = pd.DataFrame({"user_id": ids, "screen_name":screen_name, "followers_count":followers_count,\
                      "friends_count":friends_count, 'statuses_count': statuses_count, 'favourites_count': favourites_count, \
                      "location": location, "created_at": created_at, "protected": protected, "verified": verified,\
                      "lang": lang})
    
    return df

In [10]:
profiles_df = process_profiles(profiles).drop_duplicates('user_id'); len(profiles_df)

1215173

## M3 demographics

In [11]:
m3files = os.listdir('m3/processed')

In [12]:
def process_m3(data):
    ids = []
    male = []
    female = []
    age_18 = []
    age_19_29 =[]
    age_30_39 = []
    age_40 = []
    non_org = []
    is_org = []
    
    for key, value in data.items():
        ids.append(str(key))
        male.append(value['gender']['male'])
        female.append(value['gender']['female'])
        age_18.append(value['age']['<=18'])
        age_19_29.append(value['age']['19-29'])
        age_30_39.append(value['age']['30-39'])
        age_40.append(value['age']['>=40'])
        non_org.append(value['org']['non-org'])
        is_org.append(value['org']['is-org'])
    
    df = pd.DataFrame({"user_id": ids, "male":male, "female":female,"age_18":age_18, "age_19_29":age_19_29,\
                      "age_30_39":age_30_39, "age_40":age_40, "non_org":non_org, "is_org":is_org})
    
    return df

In [13]:
m3_dfs = []
for m3file in m3files:
    f = pickle.load( open( "m3/processed/"+m3file, "rb" ) )
    m3_dfs.append(process_m3(f))
m3_df = pd.concat(m3_dfs).drop_duplicates('user_id'); len(m3_df)

1215173

In [14]:
all_df = botometer_df.merge(profiles_df.drop('screen_name',axis=1), on = 'user_id', how='left'); len(all_df)
all_df = all_df.merge(m3_df, on = 'user_id', how='left'); len(all_df)

959255

In [15]:
all_df.columns

Index(['user_id', 'id', 'screen_name', 'cap_english', 'cap_universal',
       'astroturf', 'fake_follower', 'financial', 'other', 'overall',
       'self_declared', 'spammer', 'followers_count', 'friends_count',
       'statuses_count', 'favourites_count', 'location', 'created_at',
       'protected', 'verified', 'lang', 'male', 'female', 'age_18',
       'age_19_29', 'age_30_39', 'age_40', 'non_org', 'is_org'],
      dtype='object')

In [16]:
all_df.to_csv('all_users_features.csv', index=False)