<h1><center> Machine Learning </center></h1>

In [12]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import matplotlib as mpl


import sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import KFold



%matplotlib inline

In [2]:
train_users=pd.read_csv('data/train_users_1.csv')

### Preparing data for machine learning

In [3]:
train_users.columns

Index(['id', 'date_account_created', 'timestamp_first_active',
       'date_first_booking', 'gender', 'age', 'signup_method', 'signup_flow',
       'language', 'affiliate_channel', 'affiliate_provider',
       'first_affiliate_tracked', 'signup_app', 'first_device_type',
       'first_browser', 'weekdays_first_booking', 'year_first_booking',
       'month_first_booking', 'day_first_booking', 'day_of_year_first_booking',
       'week_of_year_first_booking', 'year_month_first_booking',
       'weekdays_first_active', 'year_first_active', 'month_first_active',
       'day_first_active', 'day_of_year_first_active',
       'week_of_year_first_active', 'hour_first_active', 'date_first_active',
       'year_month_first_active', 'weekdays_account_created',
       'year_account_created', 'month_account_created', 'day_account_created',
       'day_of_year_account_created', 'week_of_year_account_created',
       'year_month_account_created', 'days_btw_active_and_booking',
       'days_btw_creatac

Drop out features:
* containing first booking time because it is not in test set and not available in reality beforehand
* information unuseful like 'id'
* features that are linearly corralated with other features. Because I extract a lot of features from time, drop the repeated ones.
* target of course
* target properties and any column that has information from target to predict. (lesson learnt: at the beginning I kept destination descriptive features e.g. 'distance_km', I thought it may resonate with user related features to make better predictions. It turns out it contains one to one correspondence with targets.) A possible way I can think of using properties information of targets is to use them as targets separately. Maybe make predictors for each target properties, and somehow ensemble them together.

In [4]:
features=train_users.drop(['id','date_first_booking','weekdays_first_booking', 'year_first_booking',
       'month_first_booking', 'day_first_booking', 'day_of_year_first_booking',
       'week_of_year_first_booking', 'year_month_first_booking','days_btw_active_and_booking',
       'days_btw_creatacount_and_booking','booking_days_before_holiday','date_first_active', 'year_month_first_active',
       'year_month_account_created','date_account_created','timestamp_first_active','lat_destination', 'lng_destination', 'distance_km', 'destination_km2',
       'destination_language ', 'language_levenshtein_distance','language matching','country_destination'],1)

In [5]:
features.columns

Index(['gender', 'age', 'signup_method', 'signup_flow', 'language',
       'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked',
       'signup_app', 'first_device_type', 'first_browser',
       'weekdays_first_active', 'year_first_active', 'month_first_active',
       'day_first_active', 'day_of_year_first_active',
       'week_of_year_first_active', 'hour_first_active',
       'weekdays_account_created', 'year_account_created',
       'month_account_created', 'day_account_created',
       'day_of_year_account_created', 'week_of_year_account_created',
       'account_created_days_before_holiday',
       'first_active_days_before_holiday', 'n_devices', 'n_actions',
       'time_per_action', 'time_per_search', 'time_per_data'],
      dtype='object')

Missing values: if it is catogorical variables: replace missing values with string 'Unknown', if it is numeric: replace with mean

In [6]:
for column in features.columns[features.isnull().any()].tolist():
    if features[column].dtypes=='O':
        features[column].fillna('Unknown',inplace=True)
    else:
        features[column].fillna(features[column].mean(),inplace=True)

one-hot encoding for catogorical features

In [7]:
a=features.dtypes=='O'
dv = DictVectorizer(sparse=False) 
set1=dv.fit_transform(features[features.columns[a]].to_dict( orient = 'records' ))
set1=pd.DataFrame(set1, columns=dv.feature_names_)

ohe=preprocessing.OneHotEncoder(sparse=False)
set2=ohe.fit_transform(features['signup_flow'].values.reshape(-1, 1))
set2=pd.DataFrame(set2,columns=['signup_flow='+ str(i) for i in sorted(features['signup_flow'].unique())])

features=features.drop(np.append(features.columns[a],'signup_flow'),axis=1).join(set1).join(set2)

#standardize featurs
features=pd.DataFrame(preprocessing.StandardScaler().fit_transform(features),columns=features.columns)
features.head()

Unnamed: 0,age,weekdays_first_active,year_first_active,month_first_active,day_first_active,day_of_year_first_active,week_of_year_first_active,hour_first_active,weekdays_account_created,year_account_created,...,signup_flow=1,signup_flow=2,signup_flow=3,signup_flow=6,signup_flow=8,signup_flow=12,signup_flow=23,signup_flow=24,signup_flow=25,signup_flow=30
0,2.043535,-0.904128,-3.976366,0.001182,-0.775759,-0.071954,-0.019628,1.339496,-0.90428,-2.967228,...,-0.04364,-0.207764,3.851495,-0.042989,-0.035356,-0.188957,-0.085352,-0.131664,-0.215852,-0.032706
1,0.603493,1.153089,-3.976366,1.248487,1.741784,1.399627,1.420988,-0.778258,-1.418595,-1.954672,...,-0.04364,-0.207764,-0.259639,-0.042989,-0.035356,-0.188957,-0.085352,-0.131664,-0.215852,-0.032706
2,0.500632,-0.904128,-3.976366,1.87214,-0.890193,1.787961,1.853173,-0.778258,-0.90428,-2.967228,...,-0.04364,-0.207764,-0.259639,-0.042989,-0.035356,-0.188957,-0.085352,-0.131664,-0.215852,-0.032706
3,0.0,0.638785,-2.964459,-1.55795,-1.691229,-1.696825,2.069265,1.090348,0.638665,-2.967228,...,-0.04364,-0.207764,-0.259639,-0.042989,-0.035356,-0.188957,-0.085352,-0.131664,-0.215852,-0.032706
4,1.014933,1.153089,-2.964459,-1.55795,-1.576795,-1.686606,2.069265,-1.401127,1.15298,-2.967228,...,-0.04364,-0.207764,-0.259639,-0.042989,-0.035356,-0.188957,-0.085352,-0.131664,-0.215852,-0.032706


Do label encoding on target

In [8]:
le = preprocessing.LabelEncoder()
target = le.fit_transform(train_users['country_destination'])
classes_N=len(le.classes_)
list(zip(le.classes_,le.transform(le.classes_)))

[('AU', 0),
 ('CA', 1),
 ('DE', 2),
 ('ES', 3),
 ('FR', 4),
 ('GB', 5),
 ('IT', 6),
 ('NL', 7),
 ('PT', 8),
 ('US', 9),
 ('other', 10)]

Train test split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

look at label frequency in train and test sets to ensure proportional split

In [10]:
pd.Series(y_train).value_counts().sort_index(), pd.Series(y_test).value_counts().sort_index()*4

(0       429
 1      1161
 2       833
 3      1807
 4      4020
 5      1849
 6      2266
 7       601
 8       171
 9     49931
 10     8058
 dtype: int64, 0       440
 1      1068
 2       912
 3      1768
 4      4012
 5      1900
 6      2276
 7       644
 8       184
 9     49780
 10     8144
 dtype: int64)

### Logistic Regression

First step attemp, fit a model using all train data, get score in both train and test

In [18]:
def scores(clf):
    pred_train=clf.predict(X_train)
    pred_test=clf.predict(X_test)

    cfm_train=metrics.confusion_matrix(y_train, pred_train)
    acc_train= metrics.accuracy_score(y_train, pred_train)
    log_loss_train=metrics.log_loss(y_train, clf.predict_proba(X_train))

    cfm_test=metrics.confusion_matrix(y_test, pred_test)
    acc_test= metrics.accuracy_score(y_test, pred_test)
    log_loss_test=metrics.log_loss(y_test, clf.predict_proba(X_test))

    print('train\n','accuracy\t',acc_train,'\n','log_loss\t',log_loss_train,'\n','confusion matrix\n',cfm_train)
    print('test\n','accuracy\t',acc_test,'\n','log_loss\t',log_loss_test,'\n','confusion matrix\n',cfm_test)


# using a dummy predictor to predict all to be major class 'US'. simulate prediction and pred_proba
pred_dummy=np.full(len(y_test),9)
pred_proba_dummy=np.zeros((len(y_test),classes_N))
pred_proba_dummy[:,9]=1

cfm_test_dummy=metrics.confusion_matrix(y_test, pred_dummy)
acc_test_dummy= metrics.accuracy_score(y_test, pred_dummy)
log_loss_test_dummy=metrics.log_loss(y_test, pred_proba_dummy)

print('test_dummy\n','accuracy\t',acc_test_dummy,'log_loss\t',log_loss_test_dummy,'\n','confusion matrix\n',cfm_test_dummy)

test_dummy
 accuracy	 0.699865032055 log_loss	 10.3662945461 
 confusion matrix
 [[    0     0     0     0     0     0     0     0     0   110     0]
 [    0     0     0     0     0     0     0     0     0   267     0]
 [    0     0     0     0     0     0     0     0     0   228     0]
 [    0     0     0     0     0     0     0     0     0   442     0]
 [    0     0     0     0     0     0     0     0     0  1003     0]
 [    0     0     0     0     0     0     0     0     0   475     0]
 [    0     0     0     0     0     0     0     0     0   569     0]
 [    0     0     0     0     0     0     0     0     0   161     0]
 [    0     0     0     0     0     0     0     0     0    46     0]
 [    0     0     0     0     0     0     0     0     0 12445     0]
 [    0     0     0     0     0     0     0     0     0  2036     0]]


In [11]:
logit = LogisticRegression(multi_class='ovr',n_jobs=-1)
logit.fit(X_train, y_train)
scores(logit)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

We can see from the confusion matrix of this initial fit that 
* minor classes are almost all predicted to be 'US'. The accuracy of prediction in test set is as worse as dummy predictor which predict all labels to be 'US', Try using class_weight=balanced later
* log loss is much smaller for the logistic regression predictor compared to dummy predictor because it is predicting probability. So use log loss as metrics in this modeling
* the model is high bias and low variance, could add more complexity to the model

In [13]:
# try use balanced weight
logit = LogisticRegression(multi_class='ovr',class_weight='balanced',n_jobs=-1)
logit.fit(X_train, y_train)
scores(logit)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

skewed classes get considered, but log_loss and accuracy are worse

In [35]:
# try increasing C to decrease regularization, 
logit = LogisticRegression(multi_class='ovr',C=30,n_jobs=-1)
logit.fit(X_train, y_train)
scores(logit)

LogisticRegression(C=30, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

bias is not reduced much, should try less regularization

Use log loss instead of accuracy as the metrics. Search through grid search.

In [20]:
def search_model(x_train, y_train, est, param_grid, cv, scorer=None):
    model = GridSearchCV(estimator  = est,
                         param_grid = param_grid,
                         cv      = cv,
                         scoring    = scorer)
    model.fit(x_train, y_train)
    print("Best score: %0.3f" % model.best_score_)
    print("Best parameters set:", model.best_params_)  
    print ('cv_results_:', model.cv_results_)
    return model

In [23]:
logit_params = {'C': [1,30,100,300], 
               'class_weight':[None,'balanced']
               }
logit = LogisticRegression(multi_class='ovr',n_jobs=-1)

%time gds_log_1 = search_model(X_train, y_train,logit,logit_params,3,'neg_log_loss')

KeyboardInterrupt: 

### random forest

In [21]:
rf=RandomForestClassifier(n_jobs=-1)
rf.fit(X_train, y_train)
scores(rf)

train
 accuracy	 0.977828079746 log_loss	 0.226585646672 
 confusion matrix
 [[  401     0     0     0     1     0     0     0     0    27     0]
 [    0  1088     0     0     0     0     0     0     0    73     0]
 [    0     0   795     0     0     0     0     0     0    38     0]
 [    0     0     1  1696     0     0     1     0     0   108     1]
 [    0     2     2     0  3802     0     0     0     0   208     6]
 [    0     0     0     0     1  1739     0     0     0   108     1]
 [    0     0     0     0     2     1  2144     0     0   118     1]
 [    0     0     0     1     0     0     0   568     0    32     0]
 [    0     0     0     0     0     0     0     0   161    10     0]
 [    1     1     2     3     9     2     9     1     0 49889    14]
 [    1     1     1     0     4     0     0     0     0   785  7266]]
test
 accuracy	 0.673208862895 log_loss	 6.43531135268 
 confusion matrix
 [[    0     0     0     0     1     0     1     1     0   106     1]
 [    0     1     1

Compared to logistic, this random forest estimator overfit. It's scores are good in train set, and worse than logistic regression in test set. The goal is to reduce overfit in random forest. Use hyperparameters. 

In [22]:
rf=RandomForestClassifier(max_depth=10,n_jobs=-1)
rf.fit(X_train, y_train)
scores(rf)

train
 accuracy	 0.7021483002 log_loss	 1.09213933203 
 confusion matrix
 [[    1     0     0     0     0     0     0     0     0   428     0]
 [    0     0     0     0     0     0     0     0     0  1161     0]
 [    0     0     1     0     0     0     0     0     0   832     0]
 [    0     0     0     0     0     0     0     0     0  1807     0]
 [    0     0     0     0     4     0     0     0     0  4016     0]
 [    0     0     0     0     0     1     0     0     0  1848     0]
 [    0     0     0     0     0     0     0     0     0  2266     0]
 [    0     0     0     0     0     0     0     0     0   601     0]
 [    0     0     0     0     0     0     0     0     0   171     0]
 [    0     0     0     0     0     0     0     0     0 49931     0]
 [    0     0     0     0     0     0     0     0     0  8055     3]]
test
 accuracy	 0.699865032055 log_loss	 1.16167894076 
 confusion matrix
 [[    0     0     0     0     0     0     0     0     0   110     0]
 [    0     0     0   

In [26]:
rf=RandomForestClassifier(max_depth=20,n_jobs=-1)
rf.fit(X_train, y_train)
scores(rf)

train
 accuracy	 0.753142310829 log_loss	 0.675431685301 
 confusion matrix
 [[   67     0     0     0     0     0     0     0     0   362     0]
 [    0   169     0     0     0     0     0     0     0   991     1]
 [    0     0   126     0     1     0     0     0     0   706     0]
 [    0     0     0   303     1     0     0     0     0  1502     1]
 [    0     0     0     0   667     0     0     0     0  3352     1]
 [    0     0     0     0     0   280     0     0     0  1568     1]
 [    0     0     0     0     0     0   347     0     0  1917     2]
 [    0     0     0     0     0     0     0    93     0   508     0]
 [    0     0     0     0     0     0     0     0    18   152     1]
 [    0     0     0     0     1     0     0     0     0 49928     2]
 [    0     0     0     0     0     0     0     0     0  6488  1570]]
test
 accuracy	 0.697728039591 log_loss	 1.32537167122 
 confusion matrix
 [[    0     0     0     0     0     0     0     0     0   110     0]
 [    0     0     0

In [30]:
rf=RandomForestClassifier(min_samples_split=10,n_jobs=-1)
rf.fit(X_train, y_train)
scores(rf)

train
 accuracy	 0.745536090881 log_loss	 0.605181832827 
 confusion matrix
 [[   23     0     0     0     2     0     0     0     0   403     1]
 [    0    90     0     0     1     0     0     0     0  1059    11]
 [    0     0    47     0     0     0     0     0     0   777     9]
 [    0     0     0   174     2     0     0     0     0  1617    14]
 [    0     0     0     0   520     1     0     0     0  3477    22]
 [    0     0     0     0     2   128     0     0     0  1713     6]
 [    0     0     0     0     1     1   237     0     0  2016    11]
 [    0     0     0     0     0     0     0    38     0   560     3]
 [    0     0     0     0     0     0     0     0     8   161     2]
 [    0     0     0     0     0     0     0     0     0 49926     5]
 [    0     0     0     0     4     1     1     0     0  6216  1836]]
test
 accuracy	 0.697615566303 log_loss	 2.85655131072 
 confusion matrix
 [[    0     0     0     0     0     0     0     0     0   110     0]
 [    0     0     0

In [14]:
rf=RandomForestClassifier(n_estimators=200,n_jobs=-1)
rf.fit(X_train, y_train)
scores(rf)

train
 accuracy	 0.999184545736 
 log_loss	 0.212600022087 
 confusion matrix
 [[  428     0     0     0     0     0     0     0     0     1     0]
 [    0  1160     0     0     0     0     0     0     0     1     0]
 [    0     0   832     0     0     0     0     0     0     1     0]
 [    0     0     1  1804     0     0     1     0     0     1     0]
 [    0     0     0     0  4008     0     0     0     0     8     4]
 [    0     0     0     0     0  1842     1     0     0     6     0]
 [    0     0     0     0     0     0  2262     0     0     4     0]
 [    0     0     0     0     0     0     0   600     0     1     0]
 [    0     0     0     0     0     0     0     0   171     0     0]
 [    0     0     0     0     1     0     0     0     0 49929     1]
 [    0     0     0     1     2     0     0     0     0    23  8032]]
test
 accuracy	 0.692779214936 
 log_loss	 1.78206763745 
 confusion matrix
 [[    0     0     0     0     0     0     0     0     0   110     0]
 [    0     0  

increase tree numbers decrease both bias and variance: unambiguous better.

In [22]:
forest_params = {'max_depth':[8,16,24,32],
                 'max_features':[10,20,30],
                 'min_samples_leaf': [5,10,15],                 
                 'class_weight':[None,'balanced']} 
rf=RandomForestClassifier(n_estimators=500,n_jobs=-1) 
%time gds_rf_1 = search_model(X_train, y_train,rf,forest_params,3,'neg_log_loss')

KeyboardInterrupt: 

an even complicated model: xg boost

In [15]:
xg=xgb.XGBClassifier(objective='multi:softmax')
xg.fit(X_train, y_train)
scores(xg)

train
 accuracy	 0.702007704637 
 log_loss	 1.12083670078 
 confusion matrix
 [[    0     0     0     0     0     0     0     0     0   429     0]
 [    0     0     0     0     0     0     0     0     0  1161     0]
 [    0     0     0     0     0     0     0     0     0   833     0]
 [    0     0     0     0     0     0     0     0     0  1807     0]
 [    0     0     0     0     0     0     0     0     0  4020     0]
 [    0     0     0     0     0     0     0     0     0  1849     0]
 [    0     0     0     0     0     0     0     0     0  2266     0]
 [    0     0     0     0     0     0     0     0     0   601     0]
 [    0     0     0     0     0     0     0     0     0   171     0]
 [    0     0     0     0     0     0     0     0     0 49931     0]
 [    0     0     0     0     0     0     0     0     0  8058     0]]
test
 accuracy	 0.699865032055 
 log_loss	 1.14870652606 
 confusion matrix
 [[    0     0     0     0     0     0     0     0     0   110     0]
 [    0     0   

In [17]:
xg=xgb.XGBClassifier(objective='multi:softmax', max_depth=6)
xg.fit(X_train, y_train)
scores(xg)

train
 accuracy	 0.704327531423 
 log_loss	 1.04142394802 
 confusion matrix
 [[    5     0     0     0     0     0     0     0     0   424     0]
 [    0     3     0     0     0     0     0     0     0  1158     0]
 [    0     0    13     0     0     0     0     0     0   820     0]
 [    0     0     0    17     0     0     0     0     0  1790     0]
 [    0     0     0     0    25     0     0     0     0  3994     1]
 [    0     0     0     0     0     6     0     0     0  1843     0]
 [    0     0     0     0     0     0    12     0     0  2252     2]
 [    0     0     0     0     0     0     0     5     0   595     1]
 [    0     0     0     0     0     0     0     0     3   168     0]
 [    0     0     0     0     0     0     0     0     0 49931     0]
 [    0     0     0     0     0     0     1     0     0  7981    76]]
test
 accuracy	 0.700146215274 
 log_loss	 1.15034532075 
 confusion matrix
 [[    0     0     0     0     0     0     0     0     0   110     0]
 [    0     0   