In [1]:
import configparser
import os
from joblib import dump, load
import json
from tqdm import tqdm
from helpers.helper_functions import *
from helpers.helper_classes import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from joblib import dump, load

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read config.ini file
config = configparser.ConfigParser()
config.read('src/config.ini')
# os.chdir(config['PATH']['ROOT_DIR'])

# # Load data
df = pd.read_csv(config['PATH']['DATA_DIR'] + '/training_set.csv')
# df

# mini df for testing quickly 
df_mini = df.iloc[:1000, :].copy()
# df.drop('Unnamed: 0', axis=1, inplace=True)

# time to datetime
# df['time'] = pd.to_datetime(df['time'])


In [3]:
def construct_target(row):
    if row['booking_bool'] == 1:
        return 5
    elif row['click_bool'] == 1:
        return 1
    else:
        return 0

def binary_target(row):
    if row['booking_bool'] == 1:
        return 1
    else:
        return 0

def drop_cols(df, cols):
    return df.drop(cols, axis=1)

In [4]:
df_1 = df

# Construct target and drop leaky columns
df_1['target'] = df_1.apply(binary_target, axis=1)
df_1['grades'] = df_1.apply(construct_target, axis=1)
df_1 = df_1.drop(['random_bool', 'click_bool', 'booking_bool', 'date_time', 'gross_bookings_usd'], axis=1)

## Test: Drop all columns with NaNs
df_1 = df_1.dropna(axis=1)
df_1

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_brand_bool,prop_location_score1,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,target,grades
0,1,12,187,219,893,3,1,2.83,4.95,27,104.77,0,23246,1,0,4,0,1,1,0,0
1,1,12,187,219,10404,4,1,2.20,5.03,26,170.74,0,23246,1,0,4,0,1,1,0,0
2,1,12,187,219,21315,3,1,2.20,4.92,21,179.80,0,23246,1,0,4,0,1,1,0,0
3,1,12,187,219,27348,2,1,2.83,4.39,34,602.77,0,23246,1,0,4,0,1,1,0,0
4,1,12,187,219,29604,4,1,2.64,4.93,4,143.58,0,23246,1,0,4,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958342,332785,5,219,219,77700,3,1,1.61,0.00,2,118.00,0,16974,1,21,3,0,1,0,0,0
4958343,332785,5,219,219,88083,3,1,1.95,0.00,3,89.00,0,16974,1,21,3,0,1,0,0,0
4958344,332785,5,219,219,94508,3,1,1.10,0.00,4,99.00,0,16974,1,21,3,0,1,0,0,0
4958345,332785,5,219,219,128360,3,1,1.95,0.00,1,139.00,0,16974,1,21,3,0,1,0,1,5


In [6]:
# Perform logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import roc_auc_score

# Split data
# splitter = GroupShuffleSplit(test_size=.20, n_splits=2, random_state = 7)
# split = splitter.split(X, groups=X['srch_id'])
# train_inds, test_inds = next(split)

# X = df_1.drop(['target', 'grades'], axis=1)
# y = df_1['target']
# y_grades = df['grades']


def train_test_split(df, target_str, test_size=.2):
    splitter = GroupShuffleSplit(test_size=test_size, n_splits=2, random_state = 7)
    split = splitter.split(df, groups=df['srch_id'])
    train_inds, test_inds = next(split)

    df_ideal = df.iloc[test_inds].copy().sort_values(by=['srch_id', 'grades'], ascending=[True, False], inplace=False)

    X = df_1.drop(['target', 'grades'], axis=1)
    y = df_1[target_str]
    X_train, X_test, y_train, y_test, test_ideal = X.iloc[train_inds], X.iloc[test_inds], y.iloc[train_inds], y.iloc[test_inds], df_ideal, 


    return X_train, X_test, y_train, y_test, test_ideal


X_train, X_test, y_train, y_test, test_ideal = train_test_split(df_1, 'target')

In [8]:
test_ideal[:100]

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_brand_bool,prop_location_score1,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,target,grades
145,12,5,219,158,129310,3,0,0.0,3.75,1,27.0,0,26648,2,21,3,0,1,1,0,1
119,12,5,219,158,7814,3,0,0.0,4.46,25,43.0,1,26648,2,21,3,0,1,1,0,0
120,12,5,219,158,10881,3,0,3.14,4.46,28,44.0,0,26648,2,21,3,0,1,1,0,0
121,12,5,219,158,12510,4,0,1.1,4.41,24,51.0,0,26648,2,21,3,0,1,1,0,0
122,12,5,219,158,17122,4,0,2.64,4.22,13,53.0,0,26648,2,21,3,0,1,1,0,0
123,12,5,219,158,18012,5,0,3.22,5.66,18,221.0,0,26648,2,21,3,0,1,1,0,0
124,12,5,219,158,29081,3,0,3.3,4.41,3,37.0,0,26648,2,21,3,0,1,1,0,0
125,12,5,219,158,43250,3,1,3.3,4.46,14,40.0,0,26648,2,21,3,0,1,1,0,0
126,12,5,219,158,45722,3,0,3.26,4.46,4,29.0,1,26648,2,21,3,0,1,1,0,0
127,12,5,219,158,47902,3,0,3.26,4.46,22,49.0,0,26648,2,21,3,0,1,1,0,0


In [44]:
df

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,target,grades
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,1,2.83,0.0438,4.95,27,104.77,0,23246,1,0,4,0,1,1,,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0,,0,0,0.0
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,1,2.20,0.0149,5.03,26,170.74,0,23246,1,0,4,0,1,1,,,1,,,,,,,0.0,0.0,,,,,0.0,1.0,,,,,,,,0.0,0.0,,0,,0,0,0.0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,1,2.20,0.0245,4.92,21,179.80,0,23246,1,0,4,0,1,1,,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0,,0,0,0.0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,1,2.83,0.0125,4.39,34,602.77,0,23246,1,0,4,0,1,1,,,1,,,,-1.0,0.0,5.0,-1.0,0.0,5.0,,,,0.0,1.0,,,,,,,,-1.0,0.0,5.0,0,,0,0,0.0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,1,2.64,0.1241,4.93,4,143.58,0,23246,1,0,4,0,1,1,,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0,,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958342,332785,2013-06-30 19:55:18,5,219,,,219,77700,3,4.0,1,1.61,0.0471,0.00,2,118.00,0,16974,1,21,3,0,1,0,,550.92,0,,,,,,,,,,,,,,,,,,,,,,,,,0,,0,0,0.0
4958343,332785,2013-06-30 19:55:18,5,219,,,219,88083,3,4.0,1,1.95,0.1520,0.00,3,89.00,0,16974,1,21,3,0,1,0,,553.14,0,,,,,,,,,,,,,,,,,,,,,,,,,0,,0,0,0.0
4958344,332785,2013-06-30 19:55:18,5,219,,,219,94508,3,3.5,1,1.10,0.0164,0.00,4,99.00,0,16974,1,21,3,0,1,0,,544.43,0,,,,,,,,,,,,,,,,,,,,,,,,,0,,0,0,0.0
4958345,332785,2013-06-30 19:55:18,5,219,,,219,128360,3,5.0,1,1.95,0.0662,0.00,1,139.00,0,16974,1,21,3,0,1,0,,550.38,0,,,,,,,,,,,,,,,,,,,,,,,,,1,157.84,1,1,1.0


In [None]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
preds1 = log_reg.predict_proba(X_test)

In [24]:
preds1  

array([[0.96306605, 0.03693395],
       [0.96537174, 0.03462826],
       [0.96381922, 0.03618078],
       ...,
       [0.97866637, 0.02133363],
       [0.9744209 , 0.0255791 ],
       [0.97439504, 0.02560496]])

In [19]:
df_mini[:30]

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,1,2.83,0.0438,4.95,27,104.77,0,23246,1,0,4,0,1,1,,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0,,0
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,1,2.2,0.0149,5.03,26,170.74,0,23246,1,0,4,0,1,1,,,1,,,,,,,0.0,0.0,,,,,0.0,1.0,,,,,,,,0.0,0.0,,0,,0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,1,2.2,0.0245,4.92,21,179.8,0,23246,1,0,4,0,1,1,,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0,,0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,1,2.83,0.0125,4.39,34,602.77,0,23246,1,0,4,0,1,1,,,1,,,,-1.0,0.0,5.0,-1.0,0.0,5.0,,,,0.0,1.0,,,,,,,,-1.0,0.0,5.0,0,,0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,1,2.64,0.1241,4.93,4,143.58,0,23246,1,0,4,0,1,1,,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0,,0
5,1,2013-04-04 08:32:15,12,187,,,219,30184,4,4.5,1,2.77,0.1302,5.2,7,195.32,0,23246,1,0,4,0,1,1,,,1,,,,,,,0.0,0.0,7.0,,,,0.0,0.0,,,,,,,,0.0,0.0,7.0,0,,0
6,1,2013-04-04 08:32:15,12,187,,,219,44147,3,3.5,1,2.2,0.0356,4.81,18,129.35,0,23246,1,0,4,0,1,1,,,1,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,,,,0.0,0.0,,0,,0
7,1,2013-04-04 08:32:15,12,187,,,219,50984,2,0.0,0,1.61,,4.14,35,85.37,0,23246,1,0,4,0,1,1,,,1,,,,,,,,,,,,,,,,,,,,,,,,,0,,0
8,1,2013-04-04 08:32:15,12,187,,,219,53341,4,4.0,1,2.56,0.1238,5.18,3,150.05,0,23246,1,0,4,0,1,1,,,1,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,,,,0.0,0.0,6.0,0,,0
9,1,2013-04-04 08:32:15,12,187,,,219,56880,4,4.0,1,2.83,0.1028,5.15,10,280.69,0,23246,1,0,4,0,1,1,,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0,,0


## Evaluation


In [8]:
nunique_list = []
for col in df.columns:
    nunique_list.append(df[col].nunique())
pd.DataFrame({'col': df.columns, 'nunique': nunique_list})

Unnamed: 0,col,nunique
0,srch_id,199795
1,date_time,198615
2,site_id,34
3,visitor_location_country_id,210
4,visitor_hist_starrating,312
5,visitor_hist_adr_usd,7799
6,prop_country_id,172
7,prop_id,129113
8,prop_starrating,6
9,prop_review_score,10


In [9]:
df['booking_bool'].value_counts()

booking_bool
0    4819957
1     138390
Name: count, dtype: int64