In [1]:
import sys
sys.path.append('C:/Users/visha/Documents/GitHub/Club_Mahindra_Data_Hack/Club_Mahindra_Data_Hack')

# Import standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
color = sns.color_palette()

# Import local libraries
from src.data import make_dataset
from src.features import build_features

pd.options.display.max_columns = 100
import warnings
warnings.filterwarnings("ignore")

### Build Data to work on

In [2]:
# Read the raw training CSV into a DataFrame, then display its (rows, columns) shape
raw_data = pd.read_csv(
    filepath_or_buffer=r'../data/raw/train_5CLrC8b/train.csv',
)
raw_data.shape

(341424, 24)

In [3]:
# Preview the first five rows of the raw data
raw_data.head(n=5)

Unnamed: 0,reservation_id,booking_date,checkin_date,checkout_date,channel_code,main_product_code,numberofadults,numberofchildren,persontravellingid,resort_region_code,resort_type_code,room_type_booked_code,roomnights,season_holidayed_code,state_code_residence,state_code_resort,total_pax,member_age_buckets,booking_type_code,memberid,cluster_code,reservationstatusid_code,resort_id,amount_spent_per_room_night_scaled
0,07659f3758d8aee27f5a7e2887adeacb67021cb95ada1b...,05/04/18,05/04/18,06/04/18,3,1,2,0,46,3,3,3,1,2.0,7.0,3,3,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,F,C,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,7.706428
1,03930f033646d073462b35d411616323597715ac4fc398...,23/01/15,11/04/15,16/04/15,1,1,2,0,46,3,3,4,5,2.0,7.0,5,2,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,F,A,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,6.662563
2,d145a32920e6587ad95bfe299d80c0affa268220535aaf...,28/01/15,01/02/15,05/02/15,1,1,2,0,47,1,5,4,4,2.0,7.0,1,2,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,E,A,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,7.871602
3,cfd77f44811ed62f25a220b53324cdbafc662a4c9e5f04...,02/05/15,11/06/15,16/06/15,1,1,2,2,46,2,2,3,5,2.0,7.0,2,2,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,D,A,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,5.344943
4,937cff9e4dcfc2459620153dfc8b9962ac22bea67dfb29...,02/09/15,14/12/15,19/12/15,1,1,2,0,46,2,2,4,5,2.0,7.0,2,2,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,D,A,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,7.059346


In [4]:
# Take a slice (60K observations) of the original data to train the model.
# How the rows are chosen is decided inside make_dataset.slice_data and is not
# visible in this notebook; the recorded run printed a shape of (60000, 24).
interim_data = make_dataset.slice_data(raw_data)

(60000, 24)


In [5]:
# Split the sliced data into training, validation and test sets.
# The helper is expected to return the three splits in that order; in the
# recorded run it reported 38400 / 9600 / 12000 rows respectively.
# NOTE(review): whether the split is seeded/reproducible depends on
# make_dataset.split_train_val_test - verify in that module.
train, val, test = make_dataset.split_train_val_test(interim_data)

Training set has shape (38400, 24)
Validation set has shape (9600, 24)
Test set has shape (12000, 24)


In [6]:
# Apply the date-related feature builder to each split (train, then val, then test)
train_df, val_df, test_df = map(
    build_features.date_related_features,
    (train, val, test),
)

### Build 1st Set of Features

In [7]:
# Per-member aggregate features, computed independently for each split
mini_train_df, mini_val_df, mini_test_df = (
    build_features.aggregate_features_by_memberid(split_df)
    for split_df in (train_df, val_df, test_df)
)

In [8]:
# Per-member cumulative features: each call receives the split's date-featured
# frame together with the feature frame accumulated so far for that split
mini_train_df_1, mini_val_df_1, mini_test_df_1 = (
    build_features.cumulative_features_by_memberid(split_df, feats_so_far)
    for split_df, feats_so_far in (
        (train_df, mini_train_df),
        (val_df, mini_val_df),
        (test_df, mini_test_df),
    )
)

In [9]:
# Time-gap (shift 1) features on the date columns, one call per split
mini_train_df_2, mini_val_df_2, mini_test_df_2 = map(
    build_features.time_gap_shift_1_features,
    (train_df, val_df, test_df),
    (mini_train_df_1, mini_val_df_1, mini_test_df_1),
)

In [10]:
# Further time-gap (shift 2) features on the date columns, one call per split
mini_train_df_3, mini_val_df_3, mini_test_df_3 = (
    build_features.time_gap_shift_2_features(split_df, feats_so_far)
    for split_df, feats_so_far in zip(
        (train_df, val_df, test_df),
        (mini_train_df_2, mini_val_df_2, mini_test_df_2),
    )
)

In [11]:
# Inter-visit features (differences in column values between visits), per split
mini_train_df_4, mini_val_df_4, mini_test_df_4 = map(
    build_features.inter_visit_features,
    (train_df, val_df, test_df),
    (mini_train_df_3, mini_val_df_3, mini_test_df_3),
)

In [12]:
# NOTE(review): the pivot-feature step below is disabled (left commented out).
# The following cell assigns mini_*_df_5 directly from mini_*_df_4, so if this
# step is re-enabled as written its results would be overwritten there.
# # build features by pivoting data on memberid and some specific columns
# mini_train_df_5 = build_features.pivot_features(train_df, mini_train_df_4)
# mini_val_df_5 = build_features.pivot_features(val_df, mini_val_df_4)
# mini_test_df_5 = build_features.pivot_features(test_df, mini_test_df_4)

In [12]:
# Drop the original (non-engineered) key and date columns from each
# member-level feature frame
columns_to_remove = [
    'memberid',
    'resort_id',
    'state_code_residence',
    'checkin_date',
    'booking_date',
]
mini_train_df_5 = mini_train_df_4.drop(columns=columns_to_remove)
mini_val_df_5 = mini_val_df_4.drop(columns=columns_to_remove)
mini_test_df_5 = mini_test_df_4.drop(columns=columns_to_remove)

In [13]:
# Print the shape of each trimmed feature frame, one per line (train, val, test)
print(
    *(frame.shape for frame in (mini_train_df_5, mini_val_df_5, mini_test_df_5)),
    sep='\n',
)

(38400, 40)
(9600, 40)
(12000, 40)


In [14]:
# Write the first feature set for every split to the processed-data folder
# (row index is not written)
for _frame, _csv_path in (
    (mini_train_df_5, r'../data/processed/built_features/train_feat_set_1.csv'),
    (mini_val_df_5, r'../data/processed/built_features/val_feat_set_1.csv'),
    (mini_test_df_5, r'../data/processed/built_features/test_feat_set_1.csv'),
):
    _frame.to_csv(_csv_path, index=False)

### Build More Features

In [15]:
# Ratio features, built from the date-featured frame of each split
train_df_1, val_df_1, test_df_1 = (
    build_features.ratio_features(split_df)
    for split_df in (train_df, val_df, test_df)
)

In [16]:
# Concatenated features, built on top of the ratio-featured frame of each split
train_df_2, val_df_2, test_df_2 = map(
    build_features.concatenated_features,
    (train_df_1, val_df_1, test_df_1),
)

In [17]:
# Left-join the first feature set onto each split's frame, keyed on reservation_id
train_df_3 = train_df_2.merge(mini_train_df_5, how='left', on='reservation_id')
val_df_3 = val_df_2.merge(mini_val_df_5, how='left', on='reservation_id')
test_df_3 = test_df_2.merge(mini_test_df_5, how='left', on='reservation_id')

In [18]:
# Write the merged (prepared) frame for every split to the processed-data
# folder (row index is not written)
for _frame, _csv_path in (
    (train_df_3, r'../data/processed/prepared_data/train_prepared.csv'),
    (val_df_3, r'../data/processed/prepared_data/val_prepared.csv'),
    (test_df_3, r'../data/processed/prepared_data/test_prepared.csv'),
):
    _frame.to_csv(_csv_path, index=False)