# Modeling Plan

by Semin Kim.

(joint with Isaac Solomon from Brown University)

We use the combined user data for analysis. 
Since the input dimension is quite large (~350), we first apply a random forest and logistic regression.

### Features
- use user_info.
- use actions vector. 
- try action types, action details, and actions
- also try tf-idf transformation  
- use actions_sec vector similarly to actions vector.
- the combination of above. 

### The best feature combination
(based on 5-fold CV accuracy)
- user_info
- actions
- without seconds
- without tf-idf 

### Model
- we use a logistic regression because we have high dimensional feature vectors (~350-dim).
- It would be (relatively) easier to separate linearly. 

### Advanced Models
- Once applying a logistic regression, we apply a random forest for comparision. 

Overall, we achieve around 79% of accuracy from 5-fold CV from either model,
which is a 10% increase in accuracy compared to the base accuracy 69%. 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv('train_user_session_merged.csv')

In [3]:
df.shape

(65136, 676)

# 1. Data Setup

In [4]:
from sklearn.preprocessing import LabelEncoder

#user_info = ['gender', 'ageCat','signup_device']
user_info = ['gender', 'ageCat','signup_method', 'signup_flow',
           'language', 'affiliate_channel', 'affiliate_provider',
           'signup_app', 'first_device_type', 'first_browser','signup_device']
    
    
for col in user_info:
    df[col] = LabelEncoder().fit_transform(df[col])

target = 'country_destination'

actions = pd.read_csv('actions.csv', header=None, names=['action']).action.tolist()
actions_sec = [action+'_sec' for action in actions]

feature_columns = []
feature_columns += user_info
feature_columns += actions

X = df[feature_columns]
y = df[target]

In [5]:
X.shape

(65136, 340)

In [7]:
X.head()

Unnamed: 0,gender,ageCat,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,signup_app,first_device_type,first_browser,...,similar_listings_v2,confirmation,signup_weibo,acculynk_load_pin_pad,acculynk_bin_check_success,acculynk_session_obtained,acculynk_pin_pad_inactive,reactivate,airbrb,desks
0,0,2,0,0,5,2,3,2,3,10,...,0,0,0,0,0,0,0,0,0,0
1,0,2,0,0,5,5,7,2,6,10,...,0,0,0,0,0,0,0,0,0,0
2,0,2,0,0,5,2,3,2,6,6,...,0,0,0,0,0,0,0,0,0,0
3,0,2,0,0,5,2,3,2,7,18,...,0,0,0,0,0,0,0,0,0,0
4,1,4,1,0,5,7,7,2,7,18,...,0,0,0,0,0,0,0,0,0,0


## Base Accuracy

In [8]:
y_cnt = y.value_counts()
y_cnt_percent = y_cnt/y_cnt.sum()*100
y_cnt_percent

NDF    69.149165
US     30.850835
Name: country_destination, dtype: float64

# 2. Modeling with a Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import time

def fitLR(X,y, estimator=LogisticRegression, categorical_features=[], param_grid=None, cv=2):
    t1 = time.time()
    
    X = X.copy()
    features = [col for idx, col in enumerate(X.columns) if idx not in categorical_features]
    X[features] = StandardScaler().fit_transform(X[features])
    
    X_ohe = X
    if categorical_features:
        ohe = OneHotEncoder(categorical_features=categorical_features)
        X_ohe = ohe.fit_transform(X).toarray()
    
    X_train, X_test, y_train, y_test = train_test_split(X_ohe,y, test_size=0.3)
    
    clf = estimator()
    
    if param_grid:
        gs = GridSearchCV(estimator=clf,
                    param_grid=param_grid,
                    scoring='accuracy',
                    cv=cv)
        gs = gs.fit(X_train, y_train)
        clf = gs.best_estimator_
        print('best param: ' + str(gs.best_params_))
        print('grid search time: %.2f sec.' %(time.time()-t1))
    
    train_scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=cv)
    test_scores = cross_val_score(estimator=clf, X=X_test, y=y_test, cv=cv)
    
    print('train mean accuracy: %.2f %% (std=%.2f %%)' 
              %(np.mean(train_scores)*100, np.std(train_scores)*100))
    print('test mean accuracy: %.2f %% (std=%.2f %%)' 
              %(np.mean(test_scores)*100, np.std(test_scores)*100))
    
    print('\n total time: %.2f sec.' %(time.time()-t1))
    return clf

In [10]:
fitLR(X, y,categorical_features=[0,1,2,3,4,5,6,7,8,9,10])

train mean accuracy: 78.68 % (std=0.18 %)
test mean accuracy: 78.44 % (std=0.08 %)

 total time: 60.54 sec.


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# 3. Advanced Algorithms

# Random Forest

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import time


def printFeatureImportance(clf, X,y):
    print('\n feature importance:')
    clf.fit(X, y)
    features = X.columns
    importances = clf.feature_importances_
    arg_sort = np.argsort(importances)[::-1]
    n = min(len(features), 10)
    for i in range(n):
        idx = arg_sort[i]
        print('%2d. %-*s %.2f %%' %(i+1,15, features[idx],importances[idx]*100))

def fitTree(X,y, estimator=RandomForestClassifier, param_grid=None, cv=2):
    t1 = time.time()
    
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)
    
    clf = estimator()
     
    if param_grid:
        gs = GridSearchCV(estimator=clf,
                    param_grid=param_grid,
                    scoring='accuracy',
                    cv=cv)
        gs = gs.fit(X_train, y_train)
        clf = gs.best_estimator_
        print('best param: ' + str(gs.best_params_))
        print('grid search time: %.2f sec.' %(time.time()-t1))
    
    train_scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=cv)
    test_scores = cross_val_score(estimator=clf, X=X_test, y=y_test, cv=cv)
    
    print('train mean accuracy: %.2f %% (std=%.2f %%)' 
              %(np.mean(train_scores)*100, np.std(train_scores)*100))
    print('test mean accuracy: %.2f %% (std=%.2f %%)' 
              %(np.mean(test_scores)*100, np.std(test_scores)*100))
    
    printFeatureImportance(clf,X,y)
    print('\n total time: %.2f sec.' %(time.time()-t1))
    return clf

In [12]:
param_grid = [{'n_estimators':[20], 'max_features':[100], 'max_depth':[10]}]
fitTree(X,y)

train mean accuracy: 76.73 % (std=0.15 %)
test mean accuracy: 76.01 % (std=0.32 %)

 feature importance:
 1. ageCat          9.50 %
 2. gender          4.07 %
 3. show            3.80 %
 4. requested       3.53 %
 5. pending         3.09 %
 6. missing         3.00 %
 7. index           2.91 %
 8. first_browser   2.57 %
 9. personalize     2.54 %
10. ajax_refresh_subtotal 2.35 %

 total time: 6.04 sec.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [11]:
# random forest with parameter tuning
param_grid = [{'n_estimators':[20, 50], 'max_features':[50,100], 'max_depth':[10, 20]}]
fitTree(X,y, param_grid=param_grid, cv=5)

best param: {'max_features': 100, 'max_depth': 10, 'n_estimators': 20}
grid search time: 25.45 sec.
train mean accuracy: 79.95 % (std=0.33 %)
test mean accuracy: 78.97 % (std=0.53 %)

 feature importance:
 1. ageCat          24.91 %
 2. requested       15.16 %
 3. pending         10.91 %
 4. gender          6.90 %
 5. missing         6.53 %
 6. signup_method   5.66 %
 7. at_checkpoint   1.97 %
 8. verify          1.63 %
 9. complete_status 1.63 %
10. create          1.59 %

 total time: 60.38 sec.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=100, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=20, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)