In [1]:
# Load data

import pandas as pd
import numpy as np

columns = ['favourites_count', 'profile_use_background_image', 'lang', 'followers_count',
          'protected', 'geo_enabled', 'verified', 'statuses_count', 'friends_count', 'numberoftweets',
          'percentoftweetsinwork', 'percentoftweetsinweekend', 'percentoftweetsinday', 'percentoftweetsinnight',
          'identified_as_person']
raw_data = pd.read_csv('users.csv', nrows=20000,  usecols=columns)

print(raw_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 15 columns):
favourites_count                20000 non-null int64
profile_use_background_image    20000 non-null object
lang                            20000 non-null object
followers_count                 20000 non-null int64
protected                       20000 non-null object
geo_enabled                     20000 non-null object
verified                        20000 non-null object
statuses_count                  20000 non-null int64
friends_count                   20000 non-null int64
numberoftweets                  19592 non-null float64
percentoftweetsinwork           17235 non-null float64
percentoftweetsinweekend        17235 non-null float64
percentoftweetsinday            17235 non-null float64
percentoftweetsinnight          17235 non-null float64
identified_as_person            20000 non-null object
dtypes: float64(5), int64(4), object(6)
memory usage: 2.3+ MB
None


In [2]:
# Replace locale
raw_data['lang'] = raw_data['lang'].str.replace(r'zh.*', 'zh')
raw_data['lang'] = raw_data['lang'].str.replace(r'en.*', 'en')
raw_data['lang'] = raw_data['lang'].str.replace(r'es.*', 'es')
print(raw_data['lang'].unique())

m = {'t': True, 'f': False}
raw_data['profile_use_background_image'] = raw_data['profile_use_background_image'].map(m)
raw_data['protected'] = raw_data['protected'].map(m)
raw_data['geo_enabled'] = raw_data['geo_enabled'].map(m)
raw_data['verified'] = raw_data['verified'].map(m)
raw_data['identified_as_person'] = raw_data['identified_as_person'].map(m)

for f in ['favourites_count', 'followers_count', 'friends_count', 'numberoftweets',
          'percentoftweetsinwork', 'percentoftweetsinweekend', 'percentoftweetsinday',
          'percentoftweetsinnight']:
    raw_data = raw_data[np.isfinite(raw_data[f])]
# print(raw_data.numberoftweets[raw_data.numberoftweets.isnull()])

print(raw_data.info())

['es' 'en' 'uk' 'pl' 'tr' 'ar' 'de' 'ko' 'cs' 'ca' 'ru' 'zh' 'pt' 'fr' 'no'
 'id' 'ro' 'ja' 'sv' 'it' 'da' 'fi' 'nl' 'hu' 'th' 'sk' 'fil' 'bg' 'vi'
 'el' 'bn' 'he' 'hr' 'sr' 'nb' 'hi' 'fa' 'lv' 'ta' 'sl']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 17235 entries, 0 to 19985
Data columns (total 15 columns):
favourites_count                17235 non-null int64
profile_use_background_image    17235 non-null bool
lang                            17235 non-null object
followers_count                 17235 non-null int64
protected                       17235 non-null bool
geo_enabled                     17235 non-null bool
verified                        17235 non-null bool
statuses_count                  17235 non-null int64
friends_count                   17235 non-null int64
numberoftweets                  17235 non-null float64
percentoftweetsinwork           17235 non-null float64
percentoftweetsinweekend        17235 non-null float64
percentoftweetsinday            17235 non-null 

In [3]:
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import scale
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

quant = 0.95
fields = ['favourites_count', 'followers_count', 'statuses_count', 'friends_count', 'numberoftweets']

# Remove top 5% of samples from following columns
for field in fields:
    q = raw_data[field].quantile(quant)
    raw_data = raw_data[(raw_data[field] < q)] 
    



In [5]:
# One hot encoding
ml_data = pd.get_dummies(raw_data, columns=['lang']).values.astype(float)
# 14 col is label

train_set, test_set = train_test_split(ml_data, train_size = 0.8)
print(ml_data.shape)
idx = [i for i in range(np.shape(ml_data)[1]) if i not in [13]]
train_x = train_set[:, idx]   # all but 13th column
train_y = train_set[:, 13]

test_x = test_set[:, idx]
test_y = test_set[:, 13]


train_x = scale(train_x)
test_x = scale(test_x)

train_y = np.ravel(train_y)
test_y = np.ravel(test_y)

print("Person/Not person ratio in training samples: %s" % (len(train_y[train_y==1])/len(train_y[train_y==0])))

(13334, 53)
Person/Not person ratio in training samples: 0.7020903143449817


In [6]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense

np.random.seed(1337)

Using TensorFlow backend.
  return f(*args, **kwds)


In [7]:
model = Sequential()
model.add(Dense(100, input_dim=train_x.shape[1], activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(train_x, train_y, epochs=15, batch_size=100)

scores = model.evaluate(test_x, test_y)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15

acc: 65.09%
