## Next: Prepare a learning model to make predictions.
    

## Prerequisite 
### Note: Install xgboostclassifier on your laptop or lab machine by typing following command:

   $ conda install py-xgboost

   Test in the cell of ipynb by typing and running:  import xgboost as xgb  
   If this doesn't throw error it works fine (warnings can be omitted)
   
   xgboost (https://xgboost.readthedocs.io/en/latest/)
   

## Run Lab 01 and 02 code

In [1]:
#Loading the data
import pandas as pd
import numpy as np

print("Reading data...")
train_file = "./traveler/train_users_2.csv"
df_train = pd.read_csv(train_file, header = 0,index_col=None)

test_file = "./traveler/test_users.csv"
df_test = pd.read_csv(test_file, header = 0,index_col=None)

# Combining into one dataset for cleaning
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
print("Reading data...completed")
# Fixing date formats in Pandas - to_datetime
## Change dates to specific format
print("Fixing timestamps...")
df_all['date_account_created'] = pd.to_datetime(df_all['date_account_created'], format='%Y-%m-%d')
df_all['timestamp_first_active'] = pd.to_datetime(df_all['timestamp_first_active'], format='%Y%m%d%H%M%S')
print("Fixing timestamps...completed")
# Removing date_first_booking column
df_all.drop('date_first_booking', axis = 1, inplace = True)
print("Droped date_first_booking column...")

## Remove outliers function - [1]
def remove_outliers(df, column, min_val, max_val):
    col_values = df[column].values
    df[column] = np.where(np.logical_or(col_values<=min_val, col_values>=max_val), np.NaN, col_values)
    return df

## Fixing age column - [2]
print("Fixing age column...")
df_all = remove_outliers(df = df_all, column = 'age', min_val = 15, max_val = 90)
df_all['age'].fillna(-1, inplace = True)
print("Fixing age column...completed")
# Fill first_affiliate_tracked column
print("Filling first_affiliate_tracked column...")
df_all['first_affiliate_tracked'].fillna(-1, inplace=True)
print("Filling first_affiliate_tracked column...completed")

def convert_to_binary(df, column_to_convert):
    categories = list(df[column_to_convert].drop_duplicates())

    for category in categories:
        cat_name = str(category).replace(" ", "_").replace("(", "").replace(")", "").replace("/", "_").replace("-", "").lower()
        col_name = column_to_convert[:5] + '_' + cat_name[:10]
        df[col_name] = 0
        df.loc[(df[column_to_convert] == category), col_name] = 1

    return df

# One Hot Encoding
print("One Hot Encoding categorical data...")
columns_to_convert = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']

for column in columns_to_convert:
    df_all = convert_to_binary(df=df_all, column_to_convert=column)
    df_all.drop(column, axis=1, inplace=True)
print("One Hot Encoding categorical data...completed")

# Add new date related fields
print("Adding new fields...")
df_all['day_account_created'] = df_all['date_account_created'].dt.weekday
df_all['month_account_created'] = df_all['date_account_created'].dt.month
df_all['quarter_account_created'] = df_all['date_account_created'].dt.quarter
df_all['year_account_created'] = df_all['date_account_created'].dt.year
df_all['hour_first_active'] = df_all['timestamp_first_active'].dt.hour
df_all['day_first_active'] = df_all['timestamp_first_active'].dt.weekday
df_all['month_first_active'] = df_all['timestamp_first_active'].dt.month
df_all['quarter_first_active'] = df_all['timestamp_first_active'].dt.quarter
df_all['year_first_active'] = df_all['timestamp_first_active'].dt.year
df_all['created_less_active'] = (df_all['date_account_created'] - df_all['timestamp_first_active']).dt.days
print("Adding new fields...completed")


# Drop unnecessary columns
print("Droping fields...")
columns_to_drop = ['date_account_created', 'timestamp_first_active', 'date_first_booking', 'country_destination']
for column in columns_to_drop:
    if column in df_all.columns:
        df_all.drop(column, axis=1, inplace=True)
print("Droping fields...completed")

## Loading sessions.csv data
print("Reading sessions data...")
sessions_file = "./traveler/sessions.csv"
df_sessions = pd.read_csv(sessions_file, header = 0,index_col=False)
print("Reading sessions data...completed")

# Determine primary device
print("Determing primary device...")
sessions_device = df_sessions.loc[:, ['user_id', 'device_type', 'secs_elapsed']]
aggregated_lvl1 = sessions_device.groupby(['user_id', 'device_type'], as_index=False, sort=False).aggregate(np.sum)
#aggregated_lvl1.head(10)
idx = aggregated_lvl1.groupby(['user_id'], sort=False)['secs_elapsed'].transform(max) == aggregated_lvl1['secs_elapsed']
#idx.head(10)
df_sessions_primary = pd.DataFrame(aggregated_lvl1.loc[idx , ['user_id', 'device_type', 'secs_elapsed']])
#df_sessions_primary.head(10)
df_sessions_primary.rename(columns = {'device_type':'primary_device', 'secs_elapsed':'primary_secs'}, inplace=True)
#df_sessions_primary.head(10)
# Call user defined One Hot Encoding function
df_sessions_primary = convert_to_binary(df=df_sessions_primary, column_to_convert='primary_device')
#df_sessions_primary.head()
df_sessions_primary.drop('primary_device', axis=1, inplace=True)
#df_sessions_primary.head()
print("Determing primary device...completed")

# Determine Secondary device
print("Determing secondary device...")
remaining = aggregated_lvl1.drop(aggregated_lvl1.index[idx])
remaining.head()
idx = remaining.groupby(['user_id'], sort=False)['secs_elapsed'].transform(max) == remaining['secs_elapsed']
df_sessions_secondary = pd.DataFrame(remaining.loc[idx , ['user_id', 'device_type', 'secs_elapsed']])
df_sessions_secondary.rename(columns = {'device_type':'secondary_device', 'secs_elapsed':'secondary_secs'}, inplace=True)
df_sessions_secondary = convert_to_binary(df=df_sessions_secondary, column_to_convert='secondary_device')
df_sessions_secondary.drop('secondary_device', axis=1, inplace=True)
print("Determing secondary device...completed")

# Count occurrences of value in a column
def convert_to_counts(df, id_col, column_to_convert):
    id_list = df[id_col].drop_duplicates()

    df_counts = df.loc[:,[id_col, column_to_convert]]
    df_counts['count'] = 1
    df_counts = df_counts.groupby(by=[id_col, column_to_convert], as_index=False, sort=False).sum()

    new_df = df_counts.pivot(index=id_col, columns=column_to_convert, values='count')
    new_df = new_df.fillna(0)

# Rename Columns
    categories = list(df[column_to_convert].drop_duplicates())
    for category in categories:
        cat_name = str(category).replace(" ", "_").replace("(", "").replace(")", "").replace("/", "_").replace("-", "").lower()
        col_name = column_to_convert + '_' + cat_name
        new_df.rename(columns = {category:col_name}, inplace=True)

    return new_df

# Aggregate and combine actions taken columns
print("Aggregating actions taken...")
session_actions = df_sessions.loc[:,['user_id', 'action', 'action_type', 'action_detail']]
columns_to_convert = ['action', 'action_type', 'action_detail']
session_actions = session_actions.fillna('not provided')
first = True

for column in columns_to_convert:
    print("Converting " + column + " column...")
    current_data = convert_to_counts(df=session_actions, id_col='user_id', column_to_convert=column)

# If first loop, current data becomes existing data, otherwise merge existing and current
    if first:
        first = False
        actions_data = current_data
    else:
        actions_data = pd.concat([actions_data, current_data], axis=1, join='inner')


# [4.1] Merge device datasets
print("Combining results...")
df_sessions_primary.set_index('user_id', inplace=True)
df_sessions_secondary.set_index('user_id', inplace=True)
device_data = pd.concat([df_sessions_primary, df_sessions_secondary], axis=1, join="outer")

# [4.2] Merge device and actions datasets
combined_results = pd.concat([device_data, actions_data], axis=1, join='outer')
df_sessions_complete = combined_results.fillna(0)

# [4.3] Merge user and session datasets
df_all.set_index('id', inplace=True)
df_all = pd.concat([df_all, df_sessions_complete], axis=1, join='inner')
print("Combining results...completed")


Reading data...
Reading data...completed
Fixing timestamps...
Fixing timestamps...completed
Droped date_first_booking column...
Fixing age column...
Fixing age column...completed
Filling first_affiliate_tracked column...
Filling first_affiliate_tracked column...completed
One Hot Encoding categorical data...
One Hot Encoding categorical data...completed
Adding new fields...
Adding new fields...completed
Droping fields...
Droping fields...completed
Reading sessions data...
Reading sessions data...completed
Determing primary device...
Determing primary device...completed
Determing secondary device...
Determing secondary device...completed
Aggregating actions taken...
Converting action column...
Converting action_type column...
Converting action_detail column...
Combining results...
Combining results...completed


In [2]:
df_all.head()

Unnamed: 0,age,gende_unknown,gende_male,gende_female,gende_other,signu_facebook,signu_basic,signu_google,signu_weibo,signu_0,...,action_detail_view_resolutions,action_detail_view_search_results,action_detail_view_security_checks,action_detail_view_user_real_names,action_detail_wishlist,action_detail_wishlist_content_update,action_detail_wishlist_note,action_detail_your_listings,action_detail_your_reservations,action_detail_your_trips
00023iyk9l,31,1,0,0,0,0,1,0,0,1,...,0,5,0,0,0,4,0,0,0,2
0010k6l0om,-1,1,0,0,0,0,1,0,0,1,...,0,10,0,0,0,8,0,0,0,0
001wyh0pz8,-1,1,0,0,0,0,1,0,0,0,...,0,66,0,0,0,0,0,0,0,0
0028jgx1x1,-1,1,0,0,0,0,1,0,0,1,...,0,9,0,0,0,0,0,0,0,0
002qnbzfs5,26,0,0,1,0,1,0,0,0,0,...,0,125,0,0,0,0,0,0,0,0


In [4]:
df_train1 = df_train
df_test1 = df_test
df_all1 = df_all

In [12]:
df_train1.columns

Index([u'id', u'date_account_created', u'timestamp_first_active',
       u'date_first_booking', u'gender', u'age', u'signup_method',
       u'signup_flow', u'language', u'affiliate_channel',
       u'affiliate_provider', u'first_affiliate_tracked', u'signup_app',
       u'first_device_type', u'first_browser', u'country_destination'],
      dtype='object')

In [13]:
df_train['country_destination'].head()

0      NDF
1      NDF
2       US
3    other
4       US
Name: country_destination, dtype: object

In [14]:
df_test1.columns

Index([u'id', u'date_account_created', u'timestamp_first_active',
       u'date_first_booking', u'gender', u'age', u'signup_method',
       u'signup_flow', u'language', u'affiliate_channel',
       u'affiliate_provider', u'first_affiliate_tracked', u'signup_app',
       u'first_device_type', u'first_browser'],
      dtype='object')

In [15]:
df_all.columns

Index([u'age', u'gende_unknown', u'gende_male', u'gende_female',
       u'gende_other', u'signu_facebook', u'signu_basic', u'signu_google',
       u'signu_weibo', u'signu_0',
       ...
       u'action_detail_view_resolutions', u'action_detail_view_search_results',
       u'action_detail_view_security_checks',
       u'action_detail_view_user_real_names', u'action_detail_wishlist',
       u'action_detail_wishlist_content_update',
       u'action_detail_wishlist_note', u'action_detail_your_listings',
       u'action_detail_your_reservations', u'action_detail_your_trips'],
      dtype='object', length=720)

## Milestone: Creating a learning model

##### To predict the first booking destination country for each user based on the dataset created in earlier lab sessions.

### Choosing an Algorithm (most crusual)

### [1] Decision Tree - refer Applied Statistics notes
Biggest problem: model overfitting

Solution parameters setting: Stop model splitting once the records at a given node gets too small (minimum split) and when a certain number of splits have occurred (maximum depth).

The problem is 
- how do you know how large you should grow the tree? 
- how do you set the parameters to avoid overfitting but still have an accurate model? 

Reality is that it is extremely difficult to know how to set the parameters. 
- Set them too conservatively and the model will lose too much predictive power. 
- Set them too aggressively and the model will start overfitting the data.

BEST PART - methods have been found to reduce the risk of overfitting and increase predictive power of decisions trees, to train multiple trees (random forest, boosting)

[1] The 'random forest' algorithm constructs a large number of different trees by randomly selecting the features that can be used to build each tree (as opposed to using all the features for each tree).

[2] 'Boosting' algorithm which builds trees iteratively such that each tree learns from earlier trees. We focus on very popular 'XGBoost' algorithm.

### [2] Alternative Models
#### [2.1] K-Nearest Neighbors
Classifies a given object by looking at the classification of the k most similar records and seeing how those records are classified. Also known as lazy learner.
#### [2.2] Neural Networks
Typicaly consists of three layers: an input layer, a hidden layer (although there can be multiple hidden layers) and an output layer.
#### [2.3] Support Vector Machines
Classifier which separates classes using kernel trick.
    

### Approach for creating the model using XGBoost algorithm
#### [1.1] k-fold Cross Validation
##### Why? 
- one of the key risks when creating models is the risk of overfitting.
- to guard against overfitting is to estimate the accuracy of the models on data that was not used to train the model i.e., using cross-validation method (different CV methods - https://www.cs.cmu.edu/~schneide/tut5/node42.html)

##### How?
<img src="./images/cross-validation.png" height="400" width="500"/>
#### [1.2] Parameter Tuning
##### Why? 
- Parameter options: How many trees to build? How deep should each tree be? How much extra weight will be attached to each misclassified record? 
- Tuning these parameters to get the best results from the model is often one of the most time consuming things that data scientists do.

##### How?
- However, the process can be automated.

### Even better, using the 'Scikit-Learn' package, 
- merge the parameter tuning and cross validation steps into one, allowing to search for the best combination of parameters while using k-fold cross validation to verify the results.


#### [1.3] Training the Model
   ##### First, define training dataset and split the training data into the three main components – 
  (i) the user IDs (we don’t want to use these for training as they are randomly generated),
  
  (ii) the features to use for training (X), and 
  
  (iii) the categories we are trying to predict (y)

In [5]:
#import xgboost as xgb

#from xgboost.sklearn import XGBClassifier
#from sklearn import cross_validation, decomposition, grid_search
#from sklearn.preprocessing import LabelEncoder

# Prepare training data for modelling
df_train1.set_index('id', inplace=True)
df_train1 = pd.concat([df_train1['country_destination'], 
                       df_all1], axis=1, join='inner')
#df_train2 = df_train1



In [7]:
df_train1.shape

(73815, 721)

In [8]:
df_train1.head()

Unnamed: 0,country_destination,age,gende_unknown,gende_male,gende_female,gende_other,signu_facebook,signu_basic,signu_google,signu_weibo,...,action_detail_view_resolutions,action_detail_view_search_results,action_detail_view_security_checks,action_detail_view_user_real_names,action_detail_wishlist,action_detail_wishlist_content_update,action_detail_wishlist_note,action_detail_your_listings,action_detail_your_reservations,action_detail_your_trips
00023iyk9l,US,31,1,0,0,0,0,1,0,0,...,0,5,0,0,0,4,0,0,0,2
001wyh0pz8,NDF,-1,1,0,0,0,0,1,0,0,...,0,66,0,0,0,0,0,0,0,0
0028jgx1x1,NDF,-1,1,0,0,0,0,1,0,0,...,0,9,0,0,0,0,0,0,0,0
002qnbzfs5,US,26,0,0,1,0,1,0,0,0,...,0,125,0,0,0,0,0,0,0,0
0035hobuyj,US,-1,0,0,1,0,0,1,0,0,...,0,200,0,0,0,26,0,0,0,0


In [10]:
from sklearn.preprocessing import LabelEncoder

id_train = df_train1.index.values
labels = df_train1['country_destination']

# Label encoding for the categorical data eg: ...NDF -> 7, US -> 10...
le = LabelEncoder()
y = le.fit_transform(labels)
X = df_train1.drop('country_destination', axis=1, inplace=False)

In [11]:
X.shape

(73815, 720)

In [12]:
X.head()

Unnamed: 0,age,gende_unknown,gende_male,gende_female,gende_other,signu_facebook,signu_basic,signu_google,signu_weibo,signu_0,...,action_detail_view_resolutions,action_detail_view_search_results,action_detail_view_security_checks,action_detail_view_user_real_names,action_detail_wishlist,action_detail_wishlist_content_update,action_detail_wishlist_note,action_detail_your_listings,action_detail_your_reservations,action_detail_your_trips
00023iyk9l,31,1,0,0,0,0,1,0,0,1,...,0,5,0,0,0,4,0,0,0,2
001wyh0pz8,-1,1,0,0,0,0,1,0,0,0,...,0,66,0,0,0,0,0,0,0,0
0028jgx1x1,-1,1,0,0,0,0,1,0,0,1,...,0,9,0,0,0,0,0,0,0,0
002qnbzfs5,26,0,0,1,0,1,0,0,0,0,...,0,125,0,0,0,0,0,0,0,0
0035hobuyj,-1,0,0,1,0,0,1,0,0,1,...,0,200,0,0,0,26,0,0,0,0


## APPROACH 01:

In [14]:
from sklearn import cross_validation
## Spliting of training dataset into 70% training data and 30% testing data randomly
features_train, features_test, labels_train, labels_test = 
cross_validation.train_test_split(X, y, test_size=0.3, random_state=42)

In [15]:
## Decision Tree 
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf.fit(features_train, labels_train)
prediction = clf.predict(features_test)
## Computing accuracy
from sklearn.metrics import accuracy_score
print accuracy_score(prediction, labels_test)

0.574125084669


In [16]:
## Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(features_train, labels_train)
prediction = clf.predict(features_test)
## Computing accuracy
print accuracy_score(prediction, labels_test)

0.598961390833


In [None]:
## SVM 
from sklearn import svm
clf = svm.SVC(kernel="rbf") 
clf.fit(features_train, labels_train)
prediction = clf.predict(features_test)
## Computing accuracy
print accuracy_score(prediction, labels_test)

## APPROACH 02:


### Dataset creation
#### For the sake of understanding the behaviour of learning models, the training dataset is split into -
##### - training set: (X_trainA, y_trainA)
##### - validation set: (X_validA, y_validA)
##### - test set: (X_testA, y_testA)

### Learning architecture
#### First layer: I am using 6 classifiers from scikit-learn (Support_Vector_Machines, Logistic_Regression, Random_Forest, Gradient_Boosting, Extra_Trees_Classifier, K_Nearest_Neighbors). All classifiers are used with (almost) default parameters. At this level, many other classifiers can be used. All classifiers are applied twice:

   ##### Classifiers are trained on (X_train, y_train) and used to predict the class probabilities of (X_valid).
   ##### Classifiers are trained on (X = (X_train + X_valid), y = (y_train + y_valid)) and used to predict the class probabilities of (X_test)

#### Second layer: The predictions from the previous layer on X_valid are concatenated and used to create a new training set (XV, y_valid). The predictions on X_test are concatenated to create a new test set (XT, y_test). 

In [6]:
## Importing the classifier libraries
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from xgboost.sklearn import XGBClassifier


### Generating dataset
#### Parameters can be changed to explore different types of data.


In [7]:
#Spliting data into train and test sets.
XA, X_testA, yA, y_testA = train_test_split(X, y, test_size=0.2, 
                                        random_state=random_state)
    
#Spliting train data into training and validation sets.
X_trainA, X_validA, y_trainA, y_validA = train_test_split(XA, yA, test_size=0.25, 
                                                      random_state=random_state)

print('Data shape:')
print('X_trainA: %s, X_validA: %s, X_testA: %s \n' %(X_trainA.shape, X_validA.shape, 
                                                     X_testA.shape))

Data shape:
X_trainA: (44289, 349), X_validA: (14763, 349), X_testA: (14763, 349) 



### First layer (individual classifiers)
#### All classifiers are applied twice:
##### - Training on (X_trainA, y_trainA) and predicting on (X_validA)
##### - Training on (XA, yA) and predicting on (X_testA)
##### You can add / remove classifiers or change parameter values to see the effect on final results.

In [None]:
from sklearn.metrics import log_loss
#Defining the classifiers
clfs = {'LR'  : LogisticRegression(random_state=random_state), 
        'SVM' : SVC(probability=True, random_state=random_state), 
        'RF'  : RandomForestClassifier(n_estimators=100, n_jobs=-1, 
                                       random_state=random_state), 
        'GBM' : GradientBoostingClassifier(n_estimators=50, 
                                           random_state=random_state), 
        'ETC' : ExtraTreesClassifier(n_estimators=100, n_jobs=-1, 
                                     random_state=random_state),
        'KNN' : KNeighborsClassifier(n_neighbors=30)}
    
#predictions on the validation and test sets
p_valid = []
p_test = []
   
print('Performance of individual classifiers (1st layer) on X_testA')   
print('------------------------------------------------------------')
   
for nm, clf in clfs.items():
    #First run. Training on (X_trainA, y_trainA) and predicting on X_validA.
    clf.fit(X_trainA, y_trainA)
    yv = clf.predict_proba(X_validA)
    p_valid.append(yv)
        
    #Second run. Training on (XA, yA) and predicting on X_testA.
    clf.fit(XA, yA)
    yt = clf.predict_proba(X_testA)
    p_test.append(yt)
       
    #Printing out the performance of the classifier
    print('{:10s} {:2s} {:1.7f}'.format('%s: ' %(nm), 'logloss  =>', log_loss(y_testA, yt)))
print('')

Performance of individual classifiers (1st layer) on X_testA
------------------------------------------------------------
KNN:       logloss  => 2.7331216

### Comparison with sklearn LogisticRegression -
##### Predictions on X_validA are used as training set (XV) and predictions on X_testA are used as test set (XT).
#####  Setting the multi-class logloss as objective function. 


In [None]:
#Creating the data for the 2nd layer.
XV = np.hstack(p_valid)
XT = np.hstack(p_test)

#By default the best C parameter is obtained with a cross-validation approach, doing grid search with
#10 values defined in a logarithmic scale between 1e-4 and 1e4.
#Change parameters to see how they affect the final results.
lr = LogisticRegressionCV(Cs=10, dual=False, fit_intercept=True, 
                          intercept_scaling=1.0, max_iter=25,
                          multi_class='ovr', n_jobs=1, penalty='l2', 
                          random_state=random_state,
                          solver='lbfgs', tol=0.0001)

lr.fit(XV, y_validA)
y_lr = lr.predict_proba(XT)
print('{:20s} {:2s} {:1.7f}'.format('Log_Reg:', 'logloss  =>', log_loss(y_testA, y_lr)))

In [None]:
#Gradient boosting
xgb = XGBClassifier(max_depth=5, learning_rate=0.1,
                    n_estimators=10000, objective='multi:softprob', 
                    seed=random_state)
xgb.fit(XV, y_validA, early_stopping_rounds=15, verbose=False)
xgb.n_estimators = xgb.best_iteration
xgb.fit(XV, y_validA)
y_gb = xgb.predict_proba(XT)
print('{:20s} {:2s} {:1.7f}'.format('XGB_Reg:', 'logloss  =>', log_loss(y_testA, y_gb)))

#### HW - Is there any parameters configuration for LogisticRegression that produces better results?

### APPROACH 03:

#### [1.4] GridSearchCV
##### - Training data is ready in [1.3]
##### - Now, use GridSearchCV to run the algorithm with a range of parameters, 
##### - Then select the model that has the highest cross validated score based on the chosen measure of a performance, in this case accuracy is considered (http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html), but there are a range of metrics(http://scikit-learn.org/stable/modules/model_evaluation.html) we could use based on our needs.


In [49]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import decomposition, grid_search

# Grid Search - Used to find best combination of parameters
XGB_model = xgb.XGBClassifier(objective='multi:softprob', 
                              subsample=0.5, colsample_bytree=0.5, 
                              seed=0)
param_grid = {'max_depth': [5], 'learning_rate': [0.1], 
              'n_estimators': [5]}
#param_grid = {'max_depth': [3, 4, 5], 'learning_rate': [0.1, 0.3], 'n_estimators': [25, 50]} ##Note running this step can take a significant amount of time, might take hours as well.
model = grid_search.GridSearchCV(estimator=XGB_model, param_grid=param_grid,
                                 scoring='accuracy', verbose=10, n_jobs=1, 
                                 iid=True, refit=True, cv=3)

#model.fit(X, y)
model.fit(features_train, labels_train)
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] n_estimators=5, learning_rate=0.1, max_depth=5 ..................
[CV]  n_estimators=5, learning_rate=0.1, max_depth=5, score=0.683230 -  57.6s

[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:   57.7s



[CV] n_estimators=5, learning_rate=0.1, max_depth=5 ..................
[CV]  n_estimators=5, learning_rate=0.1, max_depth=5, score=0.693898 -  53.8s
[CV] n_estimators=5, learning_rate=0.1, max_depth=5 ..................




[CV]  n_estimators=5, learning_rate=0.1, max_depth=5, score=0.689257 -  54.0s
Best score: 0.689
Best parameters set:
	learning_rate: 0.1
	max_depth: 5
	n_estimators: 5


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.8min finished


In [56]:
from sklearn.metrics import log_loss
#Gradient boosting
#xgb = XGBClassifier(max_depth=5, learning_rate=0.1,
#                    n_estimators=5, objective='multi:softprob', 
#                    seed=0)
#xgb.fit(features_train, labels_train, verbose=False)
y_gb = model.predict_proba(features_test)
print('{:20s} {:2s} {:1.7f}'.format('XGB_Reg:', 'logloss  =>', 
                                    log_loss(labels_test, y_gb)))

XGB_Reg:             logloss  => 1.6222701




#### XGBClassifier Parameters -

##### objective='multi:softprob' (http://xgboost.readthedocs.io/en/latest/parameter.html) --> Specify the learning task and the corresponding learning objective

##### subsample=0.5 [default=1] --> subsample ratio of the training instance. Setting it to 0.5 means that XGBoost randomly collected half of the data instances to grow trees and this will prevent overfitting.

##### colsample_bytree=0.5 [default=1] --> subsample ratio of columns when constructing each tree.

##### max_depth [default=6] --> maximum depth of a tree, increase this value will make the model more complex / likely to be overfitting. 

##### n_estimators --> Number of boosted trees to fit.


### [1.5] Making the Predictions

- Now that we have trained the model based on the best parameters,
- Next step is to use the model to make predictions for the testing dataset.

In [50]:
from sklearn.metrics import roc_auc_score, accuracy_score
# Make predictions
y_pred = model.predict(features_test)
y_pred_prob = model.predict_proba(features_test) ##select the 5 best predictions

#Print model report:
print "\nModel Report"
print "Accuracy : %.4g" % accuracy_score(labels_test, y_pred)
#print "AUC Score (Train): %f" % roc_auc_score(labels_test, y_pred_prob)
                    


Model Report
Accuracy : 0.69


In [51]:
y_pred.shape

(22145L,)

#### [1.5.1] Extracting the testing data out of the combined dataset (df_all) we created for the cleaning and transformation steps.

In [45]:
# Prepare test data for prediction
df_test1.set_index('id', inplace=True)
df_test1 = pd.merge(df_test1.loc[:,['date_first_booking']], df_all1, how='left', left_index=True, right_index=True, sort=False)
X_test1 = df_test1.drop('date_first_booking', axis=1, inplace=False)
X_test1 = X_test1.fillna(-1)
id_test = df_test1.index.values

# Make predictions
y_pred = model.predict_proba(X_test1)

In [46]:
df_test1.shape

(62096, 721)

In [47]:
df_test1.columns

Index([u'date_first_booking', u'age', u'gende_unknown', u'gende_male',
       u'gende_female', u'gende_other', u'signu_facebook', u'signu_basic',
       u'signu_google', u'signu_weibo',
       ...
       u'action_detail_view_resolutions', u'action_detail_view_search_results',
       u'action_detail_view_security_checks',
       u'action_detail_view_user_real_names', u'action_detail_wishlist',
       u'action_detail_wishlist_content_update',
       u'action_detail_wishlist_note', u'action_detail_your_listings',
       u'action_detail_your_reservations', u'action_detail_your_trips'],
      dtype='object', length=721)

In [48]:
X_test1.shape

(62096, 720)

### Reference: Parameter Tuning in XGBoost
   #### https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/