## Notebook: initialization.ipynb
This notebook is used for scraping data and storing it in a directory for use in our classifier.

In [3]:
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize

reduced_columns = ['identification', 'followers_count','friends_count','favourites_count','verified',
           'default_profile_image', 'statuses_count','listed_count']

Load data, normalize json, combine with bot identification

Dataset are from [OSoMe botometer](https://botometer.osome.iu.edu/bot-repository/datasets.html)

| Dataset               | Bots  | Humans | Notes                  |
|-----------------------|-------|--------|------------------------|
| cresci-rtbust-2019    | 353   | 339    |                        |
| midterm-2018          | 42445 | 8092   |                        |
| gilani-2017           | 1089  | 1413   |                        |
| pronbots-2019         | 17881 | 0      | Spam bots              |
| vendor-purchased-2019 | 1086  | 0      | Fake follower accounts |

In [4]:
with open('../datasets/cresci-rtbust-2019_tweets.json') as jsfile:
    cresci_rtbust_2019_data = json.load(jsfile)
cresci_rtbust_2019_data = json_normalize(cresci_rtbust_2019_data)
cresci_rtbust_2019_data.rename(columns= {'created_at': 'probe_timestamp', 'user.id': 'user-id'}, inplace=True)
cresci_rtbust_2019_data.columns = cresci_rtbust_2019_data.columns.str.replace(r'^user\.','')

cresci_rtbust_2019_identification = pd.read_csv('../datasets/cresci-rtbust-2019.tsv', sep='\t')
cresci_rtbust_2019_identification.columns = ['user-id','identification']
cresci_rtbust_2019 = pd.merge(cresci_rtbust_2019_identification, cresci_rtbust_2019_data, how='inner', on='user-id')
cresci_rtbust_2019_bots = cresci_rtbust_2019[(cresci_rtbust_2019.identification=='bot')]
cresci_rtbust_2019_humans = cresci_rtbust_2019[(cresci_rtbust_2019.identification=='human')]

cresci_rtbust_2019.columns

Index(['user-id', 'identification', 'probe_timestamp', 'id_str', 'name',
       'screen_name', 'location', 'description', 'url',
       'entities.description.urls', 'protected', 'followers_count',
       'friends_count', 'listed_count', 'created_at', 'favourites_count',
       'utc_offset', 'time_zone', 'geo_enabled', 'verified', 'statuses_count',
       'lang', 'contributors_enabled', 'is_translator',
       'is_translation_enabled', 'profile_background_color',
       'profile_background_image_url', 'profile_background_image_url_https',
       'profile_background_tile', 'profile_image_url',
       'profile_image_url_https', 'profile_banner_url', 'profile_link_color',
       'profile_sidebar_border_color', 'profile_sidebar_fill_color',
       'profile_text_color', 'profile_use_background_image',
       'has_extended_profile', 'default_profile', 'default_profile_image',
       'following', 'follow_request_sent', 'notifications', 'translator_type',
       'entities.url.urls'],
      dtyp

In [5]:
cresci_rtbust_2019_red = cresci_rtbust_2019[reduced_columns]
cresci_rtbust_2019_red

Unnamed: 0,identification,followers_count,friends_count,favourites_count,verified,default_profile_image,statuses_count,listed_count
0,bot,289,401,213,False,False,3210,1
1,human,216,214,2266,False,False,2455,0
2,human,185,492,10584,False,False,4936,1
3,bot,9,26,140,False,False,2581,0
4,human,378,1176,1462,False,False,2282,1
...,...,...,...,...,...,...,...,...
687,human,498,699,1330,False,False,11277,9
688,bot,1041,1380,2301,False,False,28522,4
689,human,2989,1535,17223,False,False,13815,12
690,human,221,318,19043,False,False,5972,0


In [6]:
midterm_2018_data = pd.read_json('../datasets/midterm-2018_processed_user_objects.json')
midterm_2018_identification = pd.read_csv('../datasets/midterm-2018.tsv', sep='\t')
midterm_2018_identification.columns = ['user_id','identification']
midterm_2018 = pd.merge(midterm_2018_data, midterm_2018_identification, how='inner', on='user_id')
midterm_2018_bots = midterm_2018[(midterm_2018.identification=='bot')]
midterm_2018_humans = midterm_2018[(midterm_2018.identification=='human')]

midterm_2018.columns

Index(['probe_timestamp', 'user_id', 'screen_name', 'name', 'description',
       'user_created_at', 'url', 'lang', 'protected', 'verified',
       'geo_enabled', 'profile_use_background_image', 'default_profile',
       'followers_count', 'friends_count', 'listed_count', 'favourites_count',
       'statuses_count', 'tid', 'identification'],
      dtype='object')

In [7]:
# This is not actually accurate, but it may be close enough to be helpful
midterm_2018['default_profile_image'] = midterm_2018['default_profile']
midterm_2018_red = midterm_2018[reduced_columns]
midterm_2018

Unnamed: 0,probe_timestamp,user_id,screen_name,name,description,user_created_at,url,lang,protected,verified,...,profile_use_background_image,default_profile,followers_count,friends_count,listed_count,favourites_count,statuses_count,tid,identification,default_profile_image
0,Tue Nov 06 20:35:08 2018,4107317134,danitheduck21,Dani🏳️‍🌈,Dani 💜 She/Her 💜 Randomness all over. Expect l...,2015-11-03 21:16:13,,en,0.0,False,...,False,False,481,870,26,6542,67025,1059907055421509632,human,False
1,Tue Nov 06 17:57:51 2018,4858296837,ncaraballoPR,Natalie Caraballo,"Things I don’t get tired of: Politics, Amy Win...",2016-01-28 20:03:51,,en,0.0,False,...,False,False,202,712,5,1515,158,1059867472810180609,human,False
2,Tue Nov 06 20:35:23 2018,232631847,drmendezmd,Wilson,"Latin american100%! Let fight for our country,...",2010-12-31 18:55:05,,en,0.0,False,...,True,True,278,342,4,4780,4029,1059907117094711296,human,True
3,Tue Nov 06 19:23:19 2018,16700555,ScottNevins,Scott Nevins,TV Personality & Host | Political/News Contrib...,2008-10-11 21:39:34,http://www.ScottNevins.com,en,0.0,True,...,True,False,29546,384,402,143163,53427,1059888980957650944,human,False
4,Tue Nov 06 20:35:24 2018,334443152,lild1206,D,,2011-07-13 03:13:52,,en,0.0,False,...,True,True,95,668,1,1178,1315,1059907122408898562,human,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50532,Sat Dec 29 07:59:42 2018,1078921538739544064,Wilfredsteve1,Wilfredsteve,I'm a Widow..God bless America!\nWe rise by he...,2018-12-29 07:51:55,,en,0.0,False,...,True,True,0,4,0,3,5,1078923499413098496,bot,True
50533,Sat Dec 29 08:54:38 2018,1078923723019874304,VinceGi92042414,Vince Gill,Download Vince’s New Album -umgn.us/d2mlbh,2018-12-29 08:00:35,http://VinceGill.com,en,0.0,False,...,True,True,0,0,0,0,280,1078937323868434432,bot,True
50534,Sat Dec 29 09:02:18 2018,1078937076878503936,KenFish39349255,Ken Fisher,"Self-Made Muilt-Billionaire, Global investor, ...",2018-12-29 08:53:39,http://ken-fisher-investments.com,en,0.0,False,...,True,True,0,0,0,0,1,1078939252958875648,bot,True
50535,Sat Dec 29 10:48:30 2018,1078963374015291392,aniston_a,Stella.A.Aniston,lady aniston's twitter.\ncryptosystem investor...,2018-12-29 10:38:09,,en,0.0,False,...,True,True,1,9,0,2,4,1078965978162216961,bot,True


In [8]:
with open('../datasets/gilani-2017_tweets.json') as jsfile:
    gilani_2017_data = json.load(jsfile)
gilani_2017_data = json_normalize(gilani_2017_data)
gilani_2017_data.rename(columns= {'created_at': 'probe_timestamp', 'user.id': 'user-id'}, inplace=True)
gilani_2017_data.columns = gilani_2017_data.columns.str.replace(r'^user\.','')

gilani_2017_identification = pd.read_csv('../datasets/gilani-2017.tsv', sep='\t')
gilani_2017_identification.columns = ['user-id','identification']
gilani_2017 = pd.merge(gilani_2017_identification, gilani_2017_data, how='inner', on='user-id')
gilani_2017_bots = gilani_2017[(gilani_2017.identification=='bot')]
gilani_2017_humans = gilani_2017[(gilani_2017.identification=='human')]

gilani_2017.columns

Index(['user-id', 'identification', 'probe_timestamp', 'id_str', 'name',
       'screen_name', 'location', 'description', 'url',
       'entities.description.urls', 'protected', 'followers_count',
       'friends_count', 'listed_count', 'created_at', 'favourites_count',
       'utc_offset', 'time_zone', 'geo_enabled', 'verified', 'statuses_count',
       'lang', 'contributors_enabled', 'is_translator',
       'is_translation_enabled', 'profile_background_color',
       'profile_background_image_url', 'profile_background_image_url_https',
       'profile_background_tile', 'profile_image_url',
       'profile_image_url_https', 'profile_banner_url', 'profile_link_color',
       'profile_sidebar_border_color', 'profile_sidebar_fill_color',
       'profile_text_color', 'profile_use_background_image',
       'has_extended_profile', 'default_profile', 'default_profile_image',
       'following', 'follow_request_sent', 'notifications', 'translator_type',
       'entities.url.urls'],
      dtyp

In [10]:
gilani_2017_red = gilani_2017[reduced_columns]
gilani_2017_red

Unnamed: 0,identification,followers_count,friends_count,favourites_count,verified,default_profile_image,statuses_count,listed_count
0,bot,26680,299,27618,False,False,113696,59
1,bot,135867,132385,630,False,False,185803,122
2,bot,27559,33,2873,False,False,13037,2
3,bot,82290,83899,13437,False,False,74352,50
4,bot,30415,2487,0,False,False,542,37
...,...,...,...,...,...,...,...,...
2497,human,978678,892,25,True,False,8616,588
2498,human,5,0,0,False,False,1,0
2499,human,918901,55697,284,False,False,55510,3073
2500,human,800196,25037,2680,True,False,27963,1111


In [11]:
with open('../datasets/pronbots-2019_tweets.json') as jsfile:
    pronbots_2019_data = json.load(jsfile)
pronbots_2019_data = json_normalize(pronbots_2019_data)
pronbots_2019_data.rename(columns= {'created_at': 'probe_timestamp', 'user.id': 'user-id'}, inplace=True)
pronbots_2019_data.columns = pronbots_2019_data.columns.str.replace(r'^user\.','')

pronbots_2019_identification = pd.read_csv('../datasets/pronbots-2019.tsv', sep='\t')
pronbots_2019_identification.columns = ['user-id','identification']
pronbots_2019 = pd.merge(pronbots_2019_identification, pronbots_2019_data, how='inner', on='user-id')
pronbots_2019_bots = pronbots_2019[(pronbots_2019.identification=='bot')]
pronbots_2019_humans = pronbots_2019[(pronbots_2019.identification=='human')]

pronbots_2019.columns

Index(['user-id', 'identification', 'probe_timestamp', 'follow_request_sent',
       'has_extended_profile', 'profile_use_background_image',
       'default_profile_image', 'profile_background_image_url_https',
       'verified', 'translator_type', 'profile_text_color',
       'profile_image_url_https', 'profile_sidebar_fill_color',
       'entities.description.urls', 'followers_count',
       'profile_sidebar_border_color', 'id_str', 'profile_background_color',
       'listed_count', 'is_translation_enabled', 'utc_offset',
       'statuses_count', 'description', 'friends_count', 'location',
       'profile_link_color', 'profile_image_url', 'following', 'geo_enabled',
       'profile_banner_url', 'profile_background_image_url', 'screen_name',
       'lang', 'profile_background_tile', 'favourites_count', 'name',
       'notifications', 'url', 'created_at', 'contributors_enabled',
       'time_zone', 'protected', 'default_profile', 'is_translator',
       'entities.url.urls'],
      dtyp

In [13]:
pronbots_2019_red = pronbots_2019[reduced_columns]
pronbots_2019_red

Unnamed: 0,identification,followers_count,friends_count,favourites_count,verified,default_profile_image,statuses_count,listed_count
0,bot,8,0,165,False,False,44,0
1,bot,55,0,221,False,False,37,0
2,bot,21,0,300,False,False,49,0
3,bot,42,0,487,False,False,94,0
4,bot,5,0,70,False,False,9,0
...,...,...,...,...,...,...,...,...
17876,bot,8,0,49,False,False,8,0
17877,bot,9,73,56,False,False,11,0
17878,bot,24,0,176,False,False,37,0
17879,bot,34,0,225,False,False,88,0


In [14]:
with open('../datasets/vendor-purchased-2019_tweets.json') as jsfile:
    vendor_purchased_2019_data = json.load(jsfile)
vendor_purchased_2019_data = json_normalize(vendor_purchased_2019_data)
vendor_purchased_2019_data.rename(columns= {'created_at': 'probe_timestamp', 'user.id': 'user-id'}, inplace=True)
vendor_purchased_2019_data.columns = vendor_purchased_2019_data.columns.str.replace(r'^user\.','')

vendor_purchased_2019_identification = pd.read_csv('../datasets/vendor-purchased-2019.tsv', sep='\t')
vendor_purchased_2019_identification.columns = ['user-id','identification']
vendor_purchased_2019 = pd.merge(vendor_purchased_2019_identification, vendor_purchased_2019_data, how='inner', on='user-id')
vendor_purchased_2019_bots = vendor_purchased_2019[(vendor_purchased_2019.identification=='bot')]
vendor_purchased_2019_humans = vendor_purchased_2019[(vendor_purchased_2019.identification=='human')]

vendor_purchased_2019.columns

Index(['user-id', 'identification', 'probe_timestamp', 'follow_request_sent',
       'has_extended_profile', 'profile_use_background_image',
       'profile_background_image_url_https', 'verified', 'translator_type',
       'profile_text_color', 'profile_image_url_https',
       'profile_sidebar_fill_color', 'is_translator',
       'entities.description.urls', 'followers_count',
       'profile_sidebar_border_color', 'id_str', 'default_profile_image',
       'listed_count', 'is_translation_enabled', 'utc_offset',
       'statuses_count', 'description', 'friends_count', 'location',
       'profile_link_color', 'profile_image_url', 'notifications',
       'geo_enabled', 'profile_background_color',
       'profile_background_image_url', 'screen_name', 'lang',
       'profile_background_tile', 'favourites_count', 'name', 'url',
       'created_at', 'contributors_enabled', 'time_zone', 'protected',
       'default_profile', 'following', 'profile_banner_url',
       'entities.url.urls'],
   

In [16]:
vendor_purchased_2019_red = vendor_purchased_2019[reduced_columns]
vendor_purchased_2019_red

Unnamed: 0,identification,followers_count,friends_count,favourites_count,verified,default_profile_image,statuses_count,listed_count
0,bot,100,1011,225,False,False,93,0
1,bot,27,1641,22,False,False,66,0
2,bot,12,795,112,False,False,93,0
3,bot,69191,92197,3310,False,False,6038,96
4,bot,2081,3915,15,False,False,215,2
...,...,...,...,...,...,...,...,...
1081,bot,16,2050,10,False,True,9,0
1082,bot,48,483,153,False,False,270,1
1083,bot,341964,193488,13,False,False,6028,667
1084,bot,1,580,75,False,True,83,0


In [22]:
all_datasets = pd.concat([cresci_rtbust_2019_red, midterm_2018_red, gilani_2017_red, pronbots_2019_red, vendor_purchased_2019_red])
all_datasets

Unnamed: 0,identification,followers_count,friends_count,favourites_count,verified,default_profile_image,statuses_count,listed_count
0,bot,289,401,213,False,False,3210,1
1,human,216,214,2266,False,False,2455,0
2,human,185,492,10584,False,False,4936,1
3,bot,9,26,140,False,False,2581,0
4,human,378,1176,1462,False,False,2282,1
...,...,...,...,...,...,...,...,...
1081,bot,16,2050,10,False,True,9,0
1082,bot,48,483,153,False,False,270,1
1083,bot,341964,193488,13,False,False,6028,667
1084,bot,1,580,75,False,True,83,0


In [24]:
cresci_rtbust_2019_red.to_csv('../datasets/cresci-rtbust-2019.csv')
midterm_2018_red.to_csv('../datasets/midterm-2018.csv')
gilani_2017_red.to_csv('../datasets/gilani-2017-red.csv')
pronbots_2019_red.to_csv('../datasets/pronbots-2019.csv')
vendor_purchased_2019_red.to_csv('../datasets/vendor-purchased-2019.csv')
all_datasets.to_csv('../datasets/all-datasets.csv')