In [328]:
import pandas as pd
import numpy as np

| Feature                   | Data Type | Description                                                                   |
|---------------------------|-----------|-------------------------------------------------------------------------------|
| has_name                  | boolean   | Checks if account has a name                                                  |
| has_image                 | boolean   | Checks if account uses a custom (non-default) profile image                   |
| has_address               | boolean   | Checks if account has address                                                 |
| has_bio                   | boolean   | Checks if account has a bio                                                   |
| profile_has_url           | boolean   | Checks if account has a URL in the profile description                        |
| present_in_list           | boolean   | Checks if account is a part of a list for another account                     |
| follower_count            | integer   | The number of accounts following the considered account                       |
| tweet_count               | integer   | The number of tweets posted by the account                                    |
| friends_to_follower_ratio | float     | The ratio of friends to followers; set to 100000 if followers are 0           |
| Label                     | boolean   | Tells if the account is real or not                                           |


In [329]:
# Base location of the dataset; it is used as a folder prefix for the CSV files
# even though the name ends in ".csv".
path = 'TFP.csv'

# Read the per-account table and preview its first rows.
users_csv = f'{path}/users.csv'
df = pd.read_csv(users_csv)
df.head()

Unnamed: 0,id,name,screen_name,statuses_count,followers_count,friends_count,favourites_count,listed_count,created_at,url,...,profile_sidebar_fill_color,profile_background_image_url,profile_background_color,profile_link_color,utc_offset,protected,verified,description,updated,dataset
0,24503,Roberto Bonanzinga,Bonanzinga,4340,5055,1466,267,256,Mon Nov 27 06:55:12 +0000 2006,http://t.co/32VHs0bxbY,...,99CC33,http://a0.twimg.com/profile_background_images/...,352726,D02B55,-28800.0,,,Partner at Balderton Capital (formerly Benchma...,2015-02-14 11:32:57,TFP
1,22903,effeffe,effeffe,164,132,194,12,4,Sun Nov 26 15:19:32 +0000 2006,http://t.co/CX7EHdk9DJ,...,EFEFEF,http://a0.twimg.com/images/themes/theme14/bg.gif,131516,009999,3600.0,,,L'uomo ha creato dio a sua immagine e somiglia...,2015-02-14 11:32:57,TFP
2,382393,Ciro Cattuto,ciro,1070,1154,832,871,92,Sun Dec 31 02:03:17 +0000 2006,http://t.co/26dNjSYe5Q,...,DAECF4,http://a0.twimg.com/images/themes/theme2/bg.gif,C6E2EE,1F98C7,3600.0,,,Research Director at ISI Foundation. Data scie...,2015-02-14 11:32:57,TFP
3,286543,Alessio Bragadini,abragad,6892,930,535,478,28,Wed Dec 27 14:55:17 +0000 2006,http://t.co/xoOdZi9jic,...,E0FF92,http://a0.twimg.com/profile_background_images/...,9AE4E8,0000FF,3600.0,,,Web and social media developer from Italy,2015-02-14 11:32:57,TFP
4,438023,fullcaffeine,fullcaffeine,2885,173,444,41,2,Tue Jan 02 09:01:50 +0000 2007,http://www.fullcaffeine.com,...,EFEFEF,http://a0.twimg.com/images/themes/theme14/bg.gif,131516,009999,3600.0,,,,2015-02-14 11:32:57,TFP


* [x] id
* [x] tweet_count
* [x] has_name
* [x] has_image
* [x] has_address
* [x] has_bio
* [x] profile_has_url
* [x] present_in_list
* [x] follower_count
* [x] friends_to_follower_ratio
* [x] Label

In [330]:
# Pull the raw columns needed by later cells out of the users frame as plain lists.

# id, friends_count, followers_count
id_list, friends_count_list, followers_count_list = (
    df[column].to_list() for column in ('id', 'friends_count', 'followers_count')
)

# Every account loaded from this dataset receives the same label value, 1.
labels = [1 for _ in id_list]

In [331]:
# has_name
#
# 1 when the account has both a non-blank `name` and a non-blank `screen_name`, else 0.
# The check is done per row: taking len() of the whole column would only give the
# number of rows and mark every account as named.
# NOTE(review): values that read_csv parses as missing are treated as "no name" here.
name_present = df['name'].fillna('').astype(str).str.strip().ne('')
screen_name_present = df['screen_name'].fillna('').astype(str).str.strip().ne('')

df['has_name'] = (name_present & screen_name_present).astype(int)
has_name_list = df['has_name'].to_list()

In [332]:
# has_image

# Derived from the `default_profile_image` column: a value equal to 1.0 maps to 0,
# every other value (including a missing one) maps to 1.
has_image = [
    0 if default_flag == 1.0 else 1
    for default_flag in df['default_profile_image'].to_list()
]

In [333]:
# has_address

# 1 where `location` holds a value, 0 where it is null.
has_address_list = df['location'].notnull().astype(int).to_list()

In [334]:
# has_bio

# 1 where `description` holds a value, 0 where it is null.
has_bio_list = df['description'].notnull().astype(int).to_list()

In [335]:
# profile_has_url

# 1 where `url` holds a value, 0 where it is null.
profile_has_url = df['url'].notnull().astype(int).to_list()

In [336]:
# present_in_list
#
# 1 when `listed_count` is greater than zero, otherwise 0.
# Missing or non-positive counts are mapped to 0 as well, so the feature is always
# a 0/1 flag rather than a mix of flags and raw values.
present_in_list = df['listed_count'].gt(0).astype(int).to_list()

In [337]:
# friends_to_followers_ratio_list

# friends / followers for each account. Accounts with zero followers get the fixed
# stand-in value 100000 (an upper cap in place of an infinite ratio).
friends_to_followers_ratio_list = [
    100000 if followers == 0 else friends / followers
    for friends, followers in zip(friends_count_list, followers_count_list)
]

In [338]:
# tweet_count

import collections

# Load the tweets into their own variable: rebinding `df` here would leave the users
# frame unavailable, and re-running any earlier feature cell would then fail.
tweets_df = pd.read_csv(path + '/tweets.csv')

# Number of rows in tweets.csv for each user_id.
tweet_count = collections.Counter(tweets_df['user_id'].to_list())

# One count per account, in the same order as id_list. Looking up a key the Counter
# has never seen returns 0, so accounts without tweets need no special case.
tweet_count_list = [tweet_count[user] for user in id_list]


In [339]:
# convert all lists to columns of a new dataframe

import pandas as pd

# Start the output frame with the account ids as its only column; the remaining
# features are attached in the cells that follow.
df2 = pd.DataFrame(dict(user_id=id_list))
print(df2)

        user_id
0         24503
1         22903
2        382393
3        286543
4        438023
..          ...
464  1010894497
465    39588706
466    13972932
467    90626911
468   321257315

[469 rows x 1 columns]


In [340]:
# Attach the per-account tweet counts as the `tweets_count` column.
df2 = df2.assign(tweets_count=tweet_count_list)

In [341]:
# Preview the first five rows of the frame built so far.
df2.head(n=5)

Unnamed: 0,user_id,tweets_count
0,24503,3531
1,22903,164
2,382393,1069
3,286543,3404
4,438023,2884


In [342]:
# Attach the remaining feature lists as columns; the keyword order below is the
# resulting column order.
df2 = df2.assign(
    followers_count=followers_count_list,
    friends_count=friends_count_list,
    has_name=has_name_list,
    has_image=has_image,
    has_address=has_address_list,
    has_bio=has_bio_list,
    profile_has_url=profile_has_url,
    present_in_list=present_in_list,
    friends_to_followers_ratio=friends_to_followers_ratio_list,
    label=labels,
)

In [343]:
# Inspect the first 100 rows of the finished feature table.
df2.head(n=100)

Unnamed: 0,user_id,tweets_count,followers_count,friends_count,has_name,has_image,has_address,has_bio,profile_has_url,present_in_list,friends_to_followers_ratio,label
0,24503,3531,5055,1466,1,1,1,1,1,1,0.290010,1
1,22903,164,132,194,1,1,1,1,1,1,1.469697,1
2,382393,1069,1154,832,1,1,1,1,1,1,0.720971,1
3,286543,3404,930,535,1,1,1,1,1,1,0.575269,1
4,438023,2884,173,444,1,1,1,0,1,1,2.566474,1
...,...,...,...,...,...,...,...,...,...,...,...,...
95,50650272,272,95,133,1,1,1,1,0,1,1.400000,1
96,51065893,1288,75,281,1,1,0,0,0,0,3.746667,1
97,51169895,3247,675,373,1,1,1,1,1,1,0.552593,1
98,51484684,2021,143,371,1,1,1,1,1,1,2.594406,1


In [344]:
# Write the feature table to disk. The row index is written as the first (unnamed)
# column, which is the to_csv default made explicit here.
df2.to_csv('TFP-formatted.csv', index=True)