# Modeling Plan

by Semin Kim. Last Update: 2017/01/07.

(joint with Isaac Solomon from Brown University)

We use the combined user data for analysis. 
Since the input dimension is quite large (~350), we first apply a random forest and logistic regression.

We test models for
- use user_info.
- use actions vector. 
    - try action types, action details, and actions
    - also try tf-idf transformation  
- use actions_sec vector similarly to actions vector.
- the combination of above. 

We only present the best model, which is
- use user_info and actions without seconds and without tf-idf. 
- apply a random forest with 20 estimators and 10 tree depth, and 100 max features. 
- apply a logistic regression. 

We achieve around 79% of accuracy from 5-fold CV from either model,
which is a 10% increase in accuracy compared to the base accuracy 69%. 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv('train_user_session_merged.csv')

In [3]:
df.shape

(65136, 679)

In [4]:
user_info = ['gender', 'ageCat','signup_method', 'signup_flow',
       'language', 'affiliate_channel', 'affiliate_provider',
       'signup_app', 'first_device_type', 'first_browser']

target = 'country_destination'

actions = pd.read_csv('actions.csv', header=None, names=['action']).action.tolist()
actions_sec = [action+'_sec' for action in actions]

In [5]:
feature_columns = []
feature_columns += user_info
feature_columns += actions

X = df[feature_columns]
y = df[target]

In [6]:
X.shape

(65136, 339)

In [7]:
X.head()

Unnamed: 0,gender,ageCat,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,signup_app,first_device_type,first_browser,...,similar_listings_v2,confirmation,signup_weibo,acculynk_load_pin_pad,acculynk_bin_check_success,acculynk_session_obtained,acculynk_pin_pad_inactive,reactivate,airbrb,desks
0,0,1,0,0,5,2,3,2,3,10,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,5,5,7,2,6,10,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,5,2,3,2,6,6,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,5,2,3,2,7,18,...,0,0,0,0,0,0,0,0,0,0
4,1,3,1,0,5,7,7,2,7,18,...,0,0,0,0,0,0,0,0,0,0


# Base Accuracy

In [8]:
y_cnt = y.value_counts()
y_cnt_percent = y_cnt/y_cnt.sum()*100
y_cnt_percent

0    69.149165
1    30.850835
Name: country_destination, dtype: float64

# Random Forest

In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import time


def printFeatureImportance(clf, X,y):
    print('\n feature importance:')
    clf.fit(X, y)
    features = X.columns
    importances = clf.feature_importances_
    arg_sort = np.argsort(importances)[::-1]
    n = min(len(features), 10)
    for i in range(n):
        idx = arg_sort[i]
        print('%2d. %-*s %.2f %%' %(i+1,15, features[idx],importances[idx]*100))

def fitTree(X,y, estimator=RandomForestClassifier, param_grid=None, cv=2):
    t1 = time.time()
    
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)
    
    clf = estimator()
     
    if param_grid:
        gs = GridSearchCV(estimator=clf,
                    param_grid=param_grid,
                    scoring='accuracy',
                    cv=cv)
        gs = gs.fit(X_train, y_train)
        clf = gs.best_estimator_
        print('best param: ' + str(gs.best_params_))
        print('grid search time: %.2f sec.' %(time.time()-t1))
    
    train_scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=cv)
    test_scores = cross_val_score(estimator=clf, X=X_test, y=y_test, cv=cv)
    
    print('train mean accuracy: %.2f %% (std=%.2f %%)' 
              %(np.mean(train_scores)*100, np.std(train_scores)*100))
    print('test mean accuracy: %.2f %% (std=%.2f %%)' 
              %(np.mean(test_scores)*100, np.std(test_scores)*100))
    
    printFeatureImportance(clf,X,y)
    print('\n total time: %.2f sec.' %(time.time()-t1))
    return clf

In [10]:
# random forest without parameter tuning
fitTree(X,y)

train mean accuracy: 76.30 % (std=0.21 %)
test mean accuracy: 76.57 % (std=0.12 %)

 feature importance:
 1. ageCat          8.37 %
 2. gender          4.61 %
 3. show            3.89 %
 4. requested       3.25 %
 5. index           3.04 %
 6. first_browser   2.90 %
 7. missing         2.79 %
 8. personalize     2.62 %
 9. pending         2.47 %
10. ajax_refresh_subtotal 2.34 %

 total time: 5.84 sec.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [11]:
# random forest with parameter tuning
param_grid = [{'n_estimators':[20], 'max_features':[100], 'max_depth':[10]}]
fitTree(X,y, param_grid=param_grid, cv=5)

best param: {'max_depth': 10, 'max_features': 100, 'n_estimators': 20}
grid search time: 24.91 sec.
train mean accuracy: 79.63 % (std=0.35 %)
test mean accuracy: 79.08 % (std=0.18 %)

 feature importance:
 1. ageCat          25.31 %
 2. requested       14.39 %
 3. pending         9.61 %
 4. missing         7.27 %
 5. gender          6.38 %
 6. signup_method   5.67 %
 7. verify          4.18 %
 8. at_checkpoint   1.72 %
 9. manage_listing  1.45 %
10. complete_status 1.37 %

 total time: 62.14 sec.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=100, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=20, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

# Gradient Boosting Tree

In [13]:
# gradient boosting tree without parameter tuning
fitTree(X,y, estimator=GradientBoostingClassifier, cv=5)

train mean accuracy: 79.54 % (std=0.33 %)
test mean accuracy: 78.97 % (std=0.70 %)

 feature importance:
 1. ageCat          12.36 %
 2. pending         7.55 %
 3. signup_method   5.28 %
 4. missing         4.34 %
 5. manage_listing  3.34 %
 6. affiliate_channel 3.25 %
 7. at_checkpoint   2.74 %
 8. cancellation_policies 2.69 %
 9. other_hosting_reviews_first 2.65 %
10. gender          2.56 %

 total time: 262.70 sec.


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

# Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import OneHotEncoder

def fitLR(X,y, estimator=LogisticRegression, param_grid=None, cv=2):
    t1 = time.time()
    
    ohe = OneHotEncoder(categorical_features=[0,1,2,3,4,5,6,7,8,9])
    X_ohe = ohe.fit_transform(X).toarray()
    
    X_train, X_test, y_train, y_test = train_test_split(X_ohe,y, test_size=0.3)
    
    clf = Pipeline([ ('scl', StandardScaler()),
                    ('clf', estimator()) ])
    
    if param_grid:
        pipe_param_grid = []
        for param in param_grid:
            pipe_param = {}
            for key,value in param.items():
                pipe_param['clf__'+key] = value
            pipe_param_grid.append(pipe_param)
            
        gs = GridSearchCV(estimator=clf,
                    param_grid=pipe_param_grid,
                    scoring='accuracy',
                    cv=cv)
        gs = gs.fit(X_train, y_train)
        clf = gs.best_estimator_
        print('best param: ' + str(gs.best_params_))
        print('grid search time: %.2f sec.' %(time.time()-t1))
    
    train_scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=cv)
    test_scores = cross_val_score(estimator=clf, X=X_test, y=y_test, cv=cv)
    
    print('train mean accuracy: %.2f %% (std=%.2f %%)' 
              %(np.mean(train_scores)*100, np.std(train_scores)*100))
    print('test mean accuracy: %.2f %% (std=%.2f %%)' 
              %(np.mean(test_scores)*100, np.std(test_scores)*100))
    
    print('\n total time: %.2f sec.' %(time.time()-t1))
    return clf

In [21]:
# logistic regression without parameter tuning
fitLR(X, y, cv=5)

train mean accuracy: 78.65 % (std=0.19 %)
test mean accuracy: 78.75 % (std=0.47 %)

 total time: 175.60 sec.


Pipeline(steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

- We see that the logistic regression performly similarly to tree-based classifiers because our feature vectors have a very high dimension of 350, and easier to separate linearly.  