In [23]:
import pandas as pd
import numpy as np

|                           | Data Type | Description                                                                   |
|---------------------------|-----------|-------------------------------------------------------------------------------|
| has_name                  | boolean   | Checks if account has a name                                                  |
| has_image                 | boolean   | Checks if account has a profile image (derived from profile_image_url)        |
| has_address               | boolean   | Checks if account has address                                                 |
| has_bio                   | boolean   | Checks if account has a bio                                                   |
| profile_has_url           | boolean   | Checks if account has a URL in the profile description                        |
| present_in_list           | boolean   | Checks if account is a part of a list for another account                     |
| follower_count            | integer   | The number of accounts following the considered account                       |
| tweet_count               | integer   | The number of tweets posted by the account                                    |
| friends_to_follower_ratio | float     | The ratio of friends to followers; set to 100,000 when followers is 0         |
| Label                     | boolean   | Tells if the account is real or not                                           |


In [24]:
# Location of the dataset files; 'E13.csv' is used as a directory prefix
# (users.csv is read from inside it here, tweets.csv later on).
path = 'E13.csv'

# Read the per-account table and preview its first rows.
df = pd.read_csv(f'{path}/users.csv')
df.head()

Unnamed: 0,id,name,screen_name,statuses_count,followers_count,friends_count,favourites_count,listed_count,created_at,url,...,profile_sidebar_fill_color,profile_background_image_url,profile_background_color,profile_link_color,utc_offset,protected,verified,description,updated,dataset
0,3610511,Davide Dellacasa,braddd,20370,5470,2385,145,52,Fri Apr 06 10:58:22 +0000 2007,http://braddd.tumblr.com,...,FFF7CC,http://a0.twimg.com/profile_background_images/...,BADFCD,FF0000,3600.0,,,Founder of http://www.screenweek.it & http://w...,2015-02-14 10:54:49,E13
1,5656162,Simone Economo,eKoeS,3131,506,381,9,40,Mon Apr 30 15:08:42 +0000 2007,http://www.lineheight.net/,...,DDEEF6,http://a0.twimg.com/images/themes/theme1/bg.png,C0DEED,0084B4,3600.0,,,BSc degree (cum laude) in Computer Engineering...,2015-02-14 10:54:49,E13
2,5682702,tacone,tacone_,4024,264,87,323,16,Tue May 01 11:53:40 +0000 2007,http://t.co/LKrl1dZE,...,000000,http://a0.twimg.com/profile_background_images/...,1A1B1F,2FC2EF,3600.0,,,Cogito ergo bestemmio.,2015-02-14 10:54:49,E13
3,6067292,alesaura,alesstar,40586,640,622,1118,32,Tue May 15 16:55:16 +0000 2007,http://alesstar.wordpress.com/,...,95E8EC,http://a0.twimg.com/images/themes/theme4/bg.gif,0099B9,0099B9,3600.0,,,"Se la vita ti dà sarde, scapocciale!",2015-02-14 10:54:49,E13
4,6015122,Angelo,PerDiletto,2016,62,64,13,0,Sun May 13 19:52:00 +0000 2007,http://www.flickr.com/per_diletto,...,F6F6F6,http://a0.twimg.com/images/themes/theme18/bg.gif,ACDED6,038543,3600.0,,,Je me souviens,2015-02-14 10:54:49,E13


* [x] id
* [x] tweet_count
* [x] has_name
* [x] has_image
* [x] has_address
* [x] has_bio
* [x] profile_has_url
* [x] present_in_list
* [x] follower_count
* [x] friends_to_follower_ratio
* [x] Label

In [25]:
# Pull the raw columns needed by the later feature cells out of the users
# frame as plain Python lists.

# id, friends_count, followers_count
id_list = df['id'].to_list()
friends_count_list = df['friends_count'].to_list()
followers_count_list = df['followers_count'].to_list()

# Every account in this frame receives label 1. The list is sized from the
# frame itself (not a hard-coded 1481) so it always matches the row count.
labels = [1] * len(df)

In [26]:
# has_name
# Row-wise flag: 1 when both `name` and `screen_name` are present and not
# blank for that account, 0 otherwise. The check must be done per row;
# len(df['name']) is the number of rows, so testing it would mark every
# account as named regardless of its own values.
name_present = df['name'].notna() & df['name'].astype(str).str.strip().ne('')
screen_name_present = (
    df['screen_name'].notna() & df['screen_name'].astype(str).str.strip().ne('')
)
df['has_name'] = (name_present & screen_name_present).astype(int)
has_name_list = df['has_name'].to_list()

In [27]:
# has_image
# 1 when the account has a non-null profile_image_url, 0 otherwise.
# Computed from the column itself rather than assuming "no nulls, 1481 rows",
# so the flag stays correct and correctly sized for any users.csv.
has_image = df['profile_image_url'].notnull().astype(int).to_list()

In [28]:
# has_address
# 1 when `location` is present for the account, 0 when it is null.
has_address_list = df['location'].notnull().astype(int).to_list()

In [29]:
# has_bio
# 1 when `description` is present for the account, 0 when it is null.
has_bio_list = df['description'].notnull().astype(int).to_list()

In [30]:
# profile_has_url
# 1 when the account's `url` field is present, 0 when it is null.
profile_has_url = df['url'].notnull().astype(int).to_list()

In [31]:
# present_in_list
# Collapse any positive listed_count to 1; values that are not greater than
# zero are passed through unchanged.
present_in_list = [
    1 if listed > 0 else listed
    for listed in df['listed_count'].to_list()
]

In [32]:
# friends_to_followers_ratio_list
# friends / followers for each account. An account with zero followers has an
# undefined ratio, so it is capped at a large constant standing in for
# infinity.
RATIO_WHEN_NO_FOLLOWERS = 100000

# Built by pairing the two count lists directly, so the result always has one
# entry per account instead of relying on a pre-sized list of 1481 zeros.
friends_to_followers_ratio_list = [
    RATIO_WHEN_NO_FOLLOWERS if followers == 0 else friends / followers
    for friends, followers in zip(friends_count_list, followers_count_list)
]

In [33]:
# tweet_count

import collections
import pandas as pd

# Load the tweets into their own frame so the users frame held in `df` is not
# overwritten; the feature cells that read `df` stay re-runnable.
tweets_df = pd.read_csv(path + '/tweets.csv')

# Map each user_id to the number of rows it has in tweets.csv.
# Example: Counter([1, 1, 1, 1, 2, 2, 3]) -> Counter({1: 4, 2: 2, 3: 1}).
# Looking up an id that never occurs returns 0 rather than raising.
# Note: keys()/values() follow first-occurrence order in tweets.csv, not the
# row order of users.csv, so counts must be looked up by id.
tweet_count = collections.Counter(tweets_df['user_id'].to_list())

In [44]:
# Start the output frame with a single `user_id` column holding the account
# ids; the remaining feature lists are attached as columns afterwards.

import pandas as pd

df2 = pd.DataFrame(id_list, columns=['user_id'])
print (df2)

         user_id
0        3610511
1        5656162
2        5682702
3        6067292
4        6015122
...          ...
1476  1127280169
1477  1156344000
1478  1169114810
1479  1212975186
1480  1213937306

[1481 rows x 1 columns]


In [45]:
# Look up each account's tweet total by its id in the `tweet_count` Counter.
# Assigning the Counter's values() directly would follow tweets.csv order
# (and could differ in length), so counts would land on the wrong users.
# Accounts with no rows in tweets.csv get 0.
df2['tweets_count'] = [tweet_count[user_id] for user_id in df2['user_id']]

In [46]:
# Preview the first five rows to check the tweets_count column.
df2.head(n=5)

Unnamed: 0,user_id,tweets_count
0,3610511,3057
1,5656162,3128
2,5682702,3158
3,6067292,2012
4,6015122,3098


In [47]:
# Attach every prepared feature list to the output frame, one column each,
# in the order the columns should appear.
feature_columns = {
    'followers_count': followers_count_list,
    'friends_count': friends_count_list,
    'has_name': has_name_list,
    'has_image': has_image,
    'has_address': has_address_list,
    'has_bio': has_bio_list,
    'profile_has_url': profile_has_url,
    'present_in_list': present_in_list,
    'friends_to_followers_ratio': friends_to_followers_ratio_list,
    'label': labels,
}
for column_name, column_values in feature_columns.items():
    df2[column_name] = column_values

In [48]:
# Preview the first five rows of the assembled feature table.
df2.head(n=5)

Unnamed: 0,user_id,tweets_count,followers_count,friends_count,has_name,has_image,has_address,has_bio,profile_has_url,present_in_list,friends_to_followers_ratio,label
0,3610511,3057,5470,2385,1,1,1,1,1,1,0.436015,1
1,5656162,3128,506,381,1,1,1,1,1,1,0.752964,1
2,5682702,3158,264,87,1,1,1,1,1,1,0.329545,1
3,6067292,2012,640,622,1,1,0,1,1,1,0.971875,1
4,6015122,3098,62,64,1,1,1,1,1,0,1.032258,1


In [49]:
# Write the feature table to disk. The row index is written as the first
# (unnamed) column, which is pandas' default behaviour made explicit here.
df2.to_csv('E13-formatted.csv', index=True)