In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
users = pd.read_csv('../data/users_clean.csv')

In [3]:
users.head()

Unnamed: 0,id,date_account_created,year_created,month_created,timestamp_first_active,active_created_duration,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,signup_app,first_device_type,country_destination
0,820tgsjxq7,2011-05-25,2011,5,2009-05-23,732,Male,38.0,facebook,0,en,seo,google,Web,Mac Desktop,NDF
1,4ft3gnwmtx,2010-09-28,2010,9,2009-06-09,476,Female,56.0,basic,3,en,direct,direct,Web,Windows Desktop,US
2,bjjt8pjhuk,2011-12-05,2011,12,2009-10-31,765,Female,42.0,facebook,0,en,direct,direct,Web,Mac Desktop,other
3,lsw9q7uk0j,2010-01-02,2010,1,2010-01-02,0,Female,46.0,basic,0,en,other,craigslist,Web,Mac Desktop,US
4,0d01nltbrs,2010-01-03,2010,1,2010-01-03,0,Female,47.0,basic,0,en,direct,direct,Web,Mac Desktop,US


### Preprocessing the data

In [4]:
df = users[['year_created', 'month_created', 'gender', 'age', 'signup_method',
         'affiliate_channel', 'affiliate_provider', 'signup_app', 'first_device_type', 'country_destination']]

In [5]:
df.head()

Unnamed: 0,year_created,month_created,gender,age,signup_method,affiliate_channel,affiliate_provider,signup_app,first_device_type,country_destination
0,2011,5,Male,38.0,facebook,seo,google,Web,Mac Desktop,NDF
1,2010,9,Female,56.0,basic,direct,direct,Web,Windows Desktop,US
2,2011,12,Female,42.0,facebook,direct,direct,Web,Mac Desktop,other
3,2010,1,Female,46.0,basic,other,craigslist,Web,Mac Desktop,US
4,2010,1,Female,47.0,basic,direct,direct,Web,Mac Desktop,US


In [6]:
# Import preprocessing tools from sklearn
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler

X = df[['year_created', 'month_created', 'gender', 'age', 'signup_method',
         'affiliate_channel', 'affiliate_provider', 'signup_app', 'first_device_type']]

y = df['country_destination']

In [7]:
# Get dummy variables for all the categorical independent variables
X = pd.get_dummies(X, columns= ['year_created', 'month_created', 'gender', 'signup_method',
         'affiliate_channel', 'affiliate_provider', 'signup_app', 'first_device_type'])

In [8]:
X.head()

Unnamed: 0,age,year_created_2010,year_created_2011,year_created_2012,year_created_2013,year_created_2014,month_created_1,month_created_2,month_created_3,month_created_4,...,signup_app_iOS,first_device_type_Android Phone,first_device_type_Android Tablet,first_device_type_Desktop (Other),first_device_type_Mac Desktop,first_device_type_Other/Unknown,first_device_type_SmartPhone (Other),first_device_type_Windows Desktop,first_device_type_iPad,first_device_type_iPhone
0,38.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,56.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,42.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,46.0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,47.0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [9]:
# Normalize age so that it's within scale with the rest of the variables
scaler = MinMaxScaler()
X['age'] = scaler.fit_transform(X['age'])



In [10]:
X.head()

Unnamed: 0,age,year_created_2010,year_created_2011,year_created_2012,year_created_2013,year_created_2014,month_created_1,month_created_2,month_created_3,month_created_4,...,signup_app_iOS,first_device_type_Android Phone,first_device_type_Android Tablet,first_device_type_Desktop (Other),first_device_type_Mac Desktop,first_device_type_Other/Unknown,first_device_type_SmartPhone (Other),first_device_type_Windows Desktop,first_device_type_iPad,first_device_type_iPhone
0,0.377551,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0.561224,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.418367,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0.459184,1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0.469388,1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [11]:
# Transform the target variable using Label Encoder
le = LabelEncoder()
le.fit_transform(y)

y = le.fit_transform(y)
y

array([ 7, 10, 11, ..., 10, 10,  7])

In [12]:
# Import cross validation
from sklearn.model_selection import train_test_split

# Split data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state= 42)

### Model Fitting and Evaluation

In [13]:
# Import predictive models
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

# Import classification report
from sklearn.metrics import classification_report

#### Naive Bayes

In [14]:
clf = GaussianNB()
clf.fit(X_train, y_train)

GaussianNB(priors=None)

In [15]:
y_pred = clf.predict(X_test)

In [16]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.00      0.02      0.00       114
          1       0.01      0.01      0.01       250
          2       0.00      0.01      0.01       216
          3       0.00      0.00      0.00       395
          4       0.00      0.00      0.00       948
          5       0.00      0.00      0.00       451
          6       0.00      0.00      0.00       478
          7       0.80      0.00      0.01     15629
          8       0.00      0.01      0.00       159
          9       0.00      0.89      0.00        37
         10       0.00      0.00      0.00     11954
         11       0.00      0.00      0.00      1886

avg / total       0.38      0.00      0.00     32517



  'precision', 'predicted', average, warn_for)


#### K-Nearest Neighbors

In [17]:
clf = KNeighborsClassifier(n_neighbors= 5, weights= 'uniform')
clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [18]:
y_pred = clf.predict(X_test)

In [19]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       114
          1       0.01      0.00      0.01       250
          2       0.02      0.00      0.01       216
          3       0.02      0.01      0.01       395
          4       0.02      0.01      0.01       948
          5       0.00      0.00      0.00       451
          6       0.00      0.00      0.00       478
          7       0.53      0.70      0.60     15629
          8       0.00      0.00      0.00       159
          9       0.00      0.00      0.00        37
         10       0.43      0.40      0.41     11954
         11       0.07      0.01      0.02      1886

avg / total       0.42      0.48      0.44     32517



  'precision', 'predicted', average, warn_for)


#### Support Vector Machine

In [20]:
clf = SVC(kernel= 'linear')
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
y_pred = clf.predict(X_test)

In [22]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       114
          1       0.00      0.00      0.00       250
          2       0.00      0.00      0.00       216
          3       0.00      0.00      0.00       395
          4       0.00      0.00      0.00       948
          5       0.00      0.00      0.00       451
          6       0.00      0.00      0.00       478
          7       0.60      0.66      0.63     15629
          8       0.00      0.00      0.00       159
          9       0.00      0.00      0.00        37
         10       0.46      0.58      0.51     11954
         11       0.00      0.00      0.00      1886

avg / total       0.46      0.53      0.49     32517



  'precision', 'predicted', average, warn_for)


#### Decision Tree

In [23]:
clf = DecisionTreeClassifier(min_samples_split= 40)
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=40, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [24]:
y_pred = clf.predict(X_test)

In [25]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       114
          1       0.00      0.00      0.00       250
          2       0.00      0.00      0.00       216
          3       0.00      0.00      0.00       395
          4       0.06      0.01      0.01       948
          5       0.00      0.00      0.00       451
          6       0.00      0.00      0.00       478
          7       0.55      0.69      0.61     15629
          8       0.00      0.00      0.00       159
          9       0.00      0.00      0.00        37
         10       0.44      0.46      0.45     11954
         11       0.05      0.00      0.01      1886

avg / total       0.43      0.50      0.46     32517



  'precision', 'predicted', average, warn_for)


#### Random Forest

In [26]:
clf = RandomForestClassifier(max_depth= 2, random_state= 0)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [27]:
y_pred = clf.predict(X_test)

In [28]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       114
          1       0.00      0.00      0.00       250
          2       0.00      0.00      0.00       216
          3       0.00      0.00      0.00       395
          4       0.00      0.00      0.00       948
          5       0.00      0.00      0.00       451
          6       0.00      0.00      0.00       478
          7       0.48      1.00      0.65     15629
          8       0.00      0.00      0.00       159
          9       0.00      0.00      0.00        37
         10       0.80      0.00      0.00     11954
         11       0.00      0.00      0.00      1886

avg / total       0.53      0.48      0.31     32517



  'precision', 'predicted', average, warn_for)


#### AdaBoost

In [29]:
clf = AdaBoostClassifier(n_estimators= 100)
clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=100, random_state=None)

In [30]:
y_pred = clf.predict(X_test)

In [31]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.00      0.02      0.00       114
          1       0.00      0.00      0.00       250
          2       0.00      0.00      0.00       216
          3       0.00      0.00      0.00       395
          4       0.00      0.00      0.00       948
          5       0.00      0.00      0.00       451
          6       0.00      0.00      0.00       478
          7       0.57      0.68      0.62     15629
          8       0.00      0.00      0.00       159
          9       0.00      0.00      0.00        37
         10       0.48      0.49      0.48     11954
         11       0.00      0.00      0.00      1886

avg / total       0.45      0.51      0.48     32517



  'precision', 'predicted', average, warn_for)
