<h1 style="text-align: center;" markdown="1">Random Forest Model - 1</h1>

In [1]:
import pandas as pd

### Configure path and load the data

In [5]:
#### Configure path
dataRFPath = "/home/smita/MP/train_all_2014.csv"

In [6]:
full_df14 = pd.read_csv(dataRFPath)

In [11]:
full_df14.shape

(26483412, 25)

In [8]:
full_df14.columns.values

array(['Unnamed: 0', 'date_time', 'site_name', 'posa_continent',
       'user_location_country', 'user_location_region',
       'user_location_city', 'orig_destination_distance', 'user_id',
       'is_mobile', 'is_package', 'channel', 'srch_ci', 'srch_co',
       'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt',
       'srch_destination_id', 'srch_destination_type_id', 'is_booking',
       'cnt', 'hotel_continent', 'hotel_country', 'hotel_market',
       'hotel_cluster', 'year'], dtype=object)

In [9]:
### Remove indices columns from the data : X and Unnamed: 0 an duplicate column

del full_df14['Unnamed: 0']

In [10]:
full_df14.iloc[1:3, 0:10]

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package
1,2014-08-11 08:22:12,2,3,66,348,48862,2234.2641,12,0,1
2,2014-08-11 08:24:33,2,3,66,348,48862,2234.2641,12,0,0


In [12]:
full_df14.iloc[1:3, 11:20]

Unnamed: 0,srch_ci,srch_co,srch_adults_cnt,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt
1,2014-08-29,2014-09-02,2,0,1,8250,1,1,1
2,2014-08-29,2014-09-02,2,0,1,8250,1,0,1


In [13]:
full_df14.iloc[1:3, 21:25]

Unnamed: 0,hotel_country,hotel_market,hotel_cluster,year
1,50,628,1,2014
2,50,628,1,2014


<h2 style="text-align: center;" markdown="1">Feature Engineering</h2>

#### Function to Create popularity score based on the click/booking for pair of (user_city, srch_destination)

In [14]:
from collections import defaultdict
def createCityDestScoreDict(df):
    to_from_dict = {}
    columns = df.columns.values
    for index, row in df.iterrows():
            user_city = str(row['user_location_city'])
            srch_dest = str(row['srch_destination_id'])
            is_booking = row['is_booking']
            hotel_cluster = int(row['hotel_cluster'])
              
            if is_booking == '1':
                is_booking = 1 * 0.75
            else:
                is_booking = 1 * 0.25
    
            if user_city and srch_dest:   
                if (user_city,srch_dest) in to_from_dict:
                    if not to_from_dict[(user_city,srch_dest)].get(hotel_cluster):
                        to_from_dict[(user_city,srch_dest)][hotel_cluster] = is_booking
                    else:
                         to_from_dict[(user_city,srch_dest)][hotel_cluster] +=is_booking
                else:
                     to_from_dict[(user_city, srch_dest)] = {hotel_cluster: is_booking}
    return to_from_dict   

#### Fuction to get first n (key:values) from dictionary

In [15]:
from itertools import islice
def sliceDict(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

#### Create popularity_score and add to train set

In [48]:
def createPopScore(df, to_from_dict):
    for index, row in df.iterrows():
        userCity = str(row['user_location_city'])
        srchCity = str(row['srch_destination_id'])
        hotel_cluster = int(row['hotel_cluster']) 
        if(userCity, srchCity) in to_from_dict:
               pop_score = to_from_dict[(userCity, srchCity)].get(hotel_cluster, None)
        else:
               pop_score = 0
        df.set_value(index,'pop_score', pop_score)
    return df   

#### Function Duration_of_Stay

In [17]:
from datetime import datetime
def durationOfStay(df):
    df["srch_ci"] = pd.to_datetime(df["srch_ci"], format='%Y-%m-%d', errors="coerce")
    df["srch_co"] = pd.to_datetime(df["srch_co"], format='%Y-%m-%d', errors="coerce")
    df["time_of_stay"] = (df["srch_co"]-df["srch_ci"]).astype('timedelta64[h]')
    df["time_of_stay"] = df["time_of_stay"]/24
    return df 

In [18]:
full_df14 = durationOfStay(full_df14)

In [19]:
full_df14.shape

(26483412, 26)

In [26]:
### Randomly select 40000 user_id
import random
unique_users = full_df14.user_id.unique()
sel_user_ids = [unique_users[i] for i in sorted(random.sample(range(len(unique_users)), 40000)) ]
rf_dat = full_df14[full_df14.user_id.isin(sel_user_ids)]

In [27]:
rf_dat.shape

(972556, 26)

In [None]:
rf_dat["date_time"] = pd.to_datetime(rf_dat["date_time"])
rf_dat["month"] = rf_dat["date_time"].dt.month

### Split the Data in Training and Test Set 

In [33]:
train = rf_dat[rf_dat.month <= 9]
test = rf_dat[rf_dat.month > 9]

In [34]:
train.shape

(666903, 27)

In [35]:
test.shape

(305653, 27)

#### Get the (user-city, source: score) dictionry and print 1st five

In [None]:
to_from_dict = createCityDestScoreDict(train)

In [38]:
items = sliceDict(5, to_from_dict.items())
items

[(('53906', '27714'), {94: 0.25}),
 (('52320', '8747'), {90: 0.5}),
 (('21728', '114'), {30: 0.25}),
 (('21728', '8262'),
  {4: 0.25,
   9: 0.25,
   18: 1.25,
   33: 0.5,
   37: 0.5,
   40: 0.5,
   41: 0.75,
   50: 0.25,
   51: 0.25,
   53: 0.5,
   68: 6.5,
   69: 1.25,
   70: 0.5,
   72: 0.25,
   77: 0.5,
   90: 0.5,
   93: 0.25,
   95: 1.5,
   98: 2.0}),
 (('30019', '8746'), {99: 0.5})]

### 1. Add pop_score in Train

In [None]:
train = createPopScore(train, to_from_dict=to_from_dict)

In [40]:
train.iloc[0:1, 15:]

Unnamed: 0,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster,year,time_of_stay,month,pop_score
2708,1,45326,6,0,4,2,50,675,70,2014,2.0,1,0.5


### 2. Add pop_score in Test

In [None]:
test = createPopScore(test, to_from_dict=to_from_dict)

In [50]:
test.iloc[0:1, 15:]

Unnamed: 0,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster,year,time_of_stay,month,pop_score
3846,1,8261,1,0,1,2,50,646,21,2014,3.0,11,0.0


### 3. Missing Values

In [20]:
train.to_csv('/home/smita/expedia/train_RF.csv')

In [42]:
missing_count = train.count(axis=0)
missing_count

date_time                    666903
site_name                    666903
posa_continent               666903
user_location_country        666903
user_location_region         666903
user_location_city           666903
orig_destination_distance    429719
user_id                      666903
is_mobile                    666903
is_package                   666903
channel                      666903
srch_ci                      666504
srch_co                      666504
srch_adults_cnt              666903
srch_children_cnt            666903
srch_rm_cnt                  666903
srch_destination_id          666903
srch_destination_type_id     666903
is_booking                   666903
cnt                          666903
hotel_continent              666903
hotel_country                666903
hotel_market                 666903
hotel_cluster                666903
year                         666903
time_of_stay                 666504
month                        666903
pop_score                   

##### As of now, we will fill NA with -1:
TODO: better way to impute missing values

In [None]:
## replace NA's in orig_destination_distance by -1
train.fillna(-1, inplace=True)

In [None]:
## replace NA's in orig_destination_distance by -1
test.fillna(-1, inplace=True)

In [44]:
missing_count = train.count(axis=0)
missing_count

date_time                    666903
site_name                    666903
posa_continent               666903
user_location_country        666903
user_location_region         666903
user_location_city           666903
orig_destination_distance    666903
user_id                      666903
is_mobile                    666903
is_package                   666903
channel                      666903
srch_ci                      666903
srch_co                      666903
srch_adults_cnt              666903
srch_children_cnt            666903
srch_rm_cnt                  666903
srch_destination_id          666903
srch_destination_type_id     666903
is_booking                   666903
cnt                          666903
hotel_continent              666903
hotel_country                666903
hotel_market                 666903
hotel_cluster                666903
year                         666903
time_of_stay                 666903
month                        666903
pop_score                   

In [45]:
# Exclude Target and Predictors
predictors = [c for c in train.columns if c not in ['hotel_cluster', 'srch_ci', 'srch_co', 'date_time', 'user_id']]

### Model

In [46]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

model_RF = RandomForestClassifier(n_estimators=5, min_weight_fraction_leaf=0.1)
scores = cross_validation.cross_val_score(model_RF, train[predictors], train['hotel_cluster'], cv=3)
scores

array([ 0.06450539,  0.06901996,  0.06081425])

In [54]:
model_RF.fit(train[predictors], train['hotel_cluster'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.1, n_estimators=5, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### use model to predict outcome for test dataset

In [55]:
test_score = model_RF.score(test[predictors], test['hotel_cluster'])
test_score

0.056246789660170195

#### Feature importance

In [62]:
import numpy as np
importance = model_RF.feature_importances_
std = np.std([model_RF.feature_importances_ for tree in model_RF.estimators_], axis=0)
indices = np.argsort(importance)[::-1]
print ("Feature sorted by their scores:")
df = sorted(zip(map(lambda x:round(x,4), model_RF.feature_importances_), predictors), reverse=True)
imp = pd.DataFrame(df)
imp.columns = ['Score', 'Variable']
imp

Feature sorted by their scores:


Unnamed: 0,Score,Variable
0,0.5012,hotel_continent
1,0.1781,orig_destination_distance
2,0.1069,hotel_country
3,0.0973,srch_destination_id
4,0.0843,time_of_stay
5,0.0207,pop_score
6,0.0068,user_location_region
7,0.0033,srch_children_cnt
8,0.001,user_location_city
9,0.0003,month


In [64]:
# take only imp variables
predictors_set1 = [c for c in train.columns if c not in ['hotel_cluster', 'srch_ci', 'srch_co', 'date_time', 'user_id', 'month'
                                                   ,'year', 'user_location_country', 'srch_rm_cnt', 'srch_destination_type_id',
                                                  'srch_adults_cnt', 'srch_destination_type_id', 'srch_adults_cnt',
                                                  'site_name', 'posa_continent', 'is_package', 'is_mobile','is_booking', 'cnt', 'channel']]

In [63]:
test_books = test[test.is_booking == 1]

### Model 2

In [66]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

model2_RF = RandomForestClassifier(n_estimators=5, min_weight_fraction_leaf=0.1)
scores = cross_validation.cross_val_score(model2_RF, train[predictors_set1], train['hotel_cluster'], cv=3)
scores

array([ 0.06339897,  0.06259615,  0.06642012])

In [68]:
model2_RF.fit(train[predictors_set1], train['hotel_cluster'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.1, n_estimators=5, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [69]:
test_score = model2_RF.score(test_books[predictors_set1], test_books['hotel_cluster'])
test_score

0.062908310110100965