In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime as dt 
import time

In [6]:
testdata = pd.read_csv("booking_test_set.csv", index_col=0)
traindata = pd.read_csv("booking_train_set.csv", index_col=0)

In [7]:
# cast dates to datetime format
for df in [traindata, testdata]:
    df['checkin'] = pd.to_datetime(df['checkin'])
    df['checkout'] = pd.to_datetime(df['checkout'])

In [8]:
traindata.head(10)

Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id
0,1006220,2016-04-09,2016-04-11,31114,desktop,384,Gondal,Gondal,1006220_1
1,1006220,2016-04-11,2016-04-12,39641,desktop,384,Gondal,Gondal,1006220_1
2,1006220,2016-04-12,2016-04-16,20232,desktop,384,Gondal,Glubbdubdrib,1006220_1
3,1006220,2016-04-16,2016-04-17,24144,desktop,384,Gondal,Gondal,1006220_1
4,1010293,2016-07-09,2016-07-10,5325,mobile,359,The Devilfire Empire,Cobra Island,1010293_1
5,1010293,2016-07-10,2016-07-11,55,mobile,359,The Devilfire Empire,Cobra Island,1010293_1
6,1010293,2016-07-12,2016-07-13,23921,mobile,359,The Devilfire Empire,Cobra Island,1010293_1
7,1010293,2016-07-13,2016-07-15,65322,desktop,9924,The Devilfire Empire,Cobra Island,1010293_1
8,1010293,2016-07-15,2016-07-16,23921,desktop,9924,The Devilfire Empire,Cobra Island,1010293_1
9,1010293,2016-07-16,2016-07-17,20545,desktop,10573,The Devilfire Empire,Cobra Island,1010293_1


In [36]:
# series that holds a list of countries visited on each trip
orderofcountriesvisitedpertrip = traindata.groupby('utrip_id')['hotel_country'].agg(lambda x: list(x))

utrip_id
1000027_1                     [Gondal, Gondal, Gondal, Gondal]
1000033_1    [Cobra Island, Cobra Island, Cobra Island, Cob...
1000045_1    [Fook Island, Fook Island, Fook Island, Carpat...
1000083_1            [Osterlich, Osterlich, Osterlich, Gondal]
100008_1     [Kamistan, Kamistan, Kamistan, Kamistan, Kamis...
                                   ...                        
999776_1      [Novistrana, Novistrana, Novistrana, Novistrana]
999839_1     [Cobra Island, Cobra Island, Cobra Island, Cob...
999842_1     [Glubbdubdrib, Glubbdubdrib, Glubbdubdrib, Alv...
999855_1     [Fook Island, Fook Island, Fook Island, Fook I...
999944_1     [Glubbdubdrib, Glubbdubdrib, Glubbdubdrib, Glu...
Name: hotel_country, Length: 217686, dtype: object

In [45]:
orderofcountriesvisitedpertrip

utrip_id
1000027_1                     [Gondal, Gondal, Gondal, Gondal]
1000033_1    [Cobra Island, Cobra Island, Cobra Island, Cob...
1000045_1    [Fook Island, Fook Island, Fook Island, Carpat...
1000083_1            [Osterlich, Osterlich, Osterlich, Gondal]
100008_1     [Kamistan, Kamistan, Kamistan, Kamistan, Kamis...
                                   ...                        
999776_1      [Novistrana, Novistrana, Novistrana, Novistrana]
999839_1     [Cobra Island, Cobra Island, Cobra Island, Cob...
999842_1     [Glubbdubdrib, Glubbdubdrib, Glubbdubdrib, Alv...
999855_1     [Fook Island, Fook Island, Fook Island, Fook I...
999944_1     [Glubbdubdrib, Glubbdubdrib, Glubbdubdrib, Glu...
Name: hotel_country, Length: 217686, dtype: object

In [43]:
# series that holds a list of cities visited on each trip
orderofcitiesvisitedpertrip = traindata.groupby('utrip_id')['city_id'].agg(lambda x: list(x))

In [44]:
orderofcitiesvisitedpertrip

utrip_id
1000027_1                          [8183, 15626, 60902, 30628]
1000033_1                  [38677, 52089, 21328, 27485, 38677]
1000045_1     [64876, 55128, 9608, 31817, 36170, 58178, 36063]
1000083_1                         [55990, 14705, 35160, 36063]
100008_1                     [11306, 12096, 6761, 6779, 65690]
                                   ...                        
999776_1                          [17775, 66634, 17775, 17775]
999839_1                            [8335, 21328, 8335, 48968]
999842_1                          [51291, 66969, 67169, 24036]
999855_1     [382, 38509, 18930, 38509, 51145, 11179, 61881...
999944_1                            [17944, 47075, 228, 62930]
Name: city_id, Length: 217686, dtype: object

In [52]:
data = {'utrip_id':orderofcitiesvisitedpertrip.keys(), 
        'orderofcitiesvisitedpertrip':orderofcitiesvisitedpertrip.values,
     'orderofcountriesvisitedpertrip':orderofcountriesvisitedpertrip.values}
data = pd.DataFrame(data)

In [55]:
data.head(10)

Unnamed: 0,utrip_id,orderofcitiesvisitedpertrip,orderofcountriesvisitedpertrip
0,1000027_1,"[8183, 15626, 60902, 30628]","[Gondal, Gondal, Gondal, Gondal]"
1,1000033_1,"[38677, 52089, 21328, 27485, 38677]","[Cobra Island, Cobra Island, Cobra Island, Cob..."
2,1000045_1,"[64876, 55128, 9608, 31817, 36170, 58178, 36063]","[Fook Island, Fook Island, Fook Island, Carpat..."
3,1000083_1,"[55990, 14705, 35160, 36063]","[Osterlich, Osterlich, Osterlich, Gondal]"
4,100008_1,"[11306, 12096, 6761, 6779, 65690]","[Kamistan, Kamistan, Kamistan, Kamistan, Kamis..."
5,1000097_1,"[17127, 31088, 40521, 55128, 21033, 6306, 6788...","[Fook Island, Fook Island, Fook Island, Fook I..."
6,1000136_1,"[62541, 42482, 20345, 33540, 32627]","[Fook Island, Fook Island, Fook Island, Fook I..."
7,1000145_1,"[47499, 27112, 17764, 56651, 35850]","[Kangan, Rook Islands, Rook Islands, Rook Isla..."
8,100018_1,"[17830, 57619, 22065, 2748, 46854, 5797, 57658]","[Bartovia, Leutonia, Leutonia, Leutonia, Leuto..."
9,1000208_1,"[60143, 1910, 9278, 51999, 56872]","[Patusan, Patusan, Patusan, Patusan, Patusan]"


In [54]:
# series that holds a list of cities visited on each trip
citiesincountry = traindata.groupby('hotel_country')['city_id'].agg(lambda x: list(x))
citiesincountry

hotel_country
Absurdistan                       [47440, 52136, 42121, 52136, 62316, 52136, 474...
Aldorria                          [64131, 34751, 34751, 20862, 58136, 58136, 457...
Aldovia                           [52815, 48905, 33022, 52815, 33022, 48905, 586...
Almaigne                          [41220, 41220, 41220, 41220, 41220, 41220, 41220]
Altis and Stratis, Republic of    [53731, 51329, 29723, 50260, 50260, 50260, 521...
                                                        ...                        
Yellow Empire                     [10235, 10235, 25374, 60664, 10235, 22115, 102...
Yerba                             [37709, 19626, 62270, 24783, 8587, 31627, 5109...
Yudonia                           [18548, 18548, 909, 18548, 34985, 43062, 19011...
Zekistan                          [267, 33625, 267, 267, 267, 33625, 267, 33625,...
Zephyria                          [64502, 11640, 52564, 64502, 18343, 49475, 494...
Name: city_id, Length: 195, dtype: object

In [66]:
# pull out the second last city of each utrip_id into a series
lastcityoftripseries = orderofcitiesvisitedpertrip.apply(lambda x: x[len(x)-2])
lastcityoftripseries


utrip_id
1000027_1    60902
1000033_1    27485
1000045_1    58178
1000083_1    35160
100008_1      6779
             ...  
999776_1     17775
999839_1      8335
999842_1     67169
999855_1     44489
999944_1       228
Name: city_id, Length: 217686, dtype: int64

In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
labelencoder = LabelEncoder()
enc = OneHotEncoder(handle_unknown='ignore')

In [9]:
traindata['hotel_country_cat'] = labelencoder.fit_transform(traindata['hotel_country']).astype(int)

In [11]:
hotel_country_df = pd.DataFrame(enc.fit_transform(traindata[['hotel_country_cat']]).toarray()).astype(int)

MemoryError: Unable to allocate 1.70 GiB for an array with shape (227532825,) and data type float64