<h1 style="text-align: center;" markdown="1">Random Forest Model - 3</h1>

<h2 style="text-align: center;" markdown="1">Loading and Reading Data </h2>

In [1]:
import pandas as pd

In [None]:
### Configure path and load the data

In [2]:
#### Configure path
dataRFPath = "/home/smita/MP/dataSet2014_RF.csv"

In [3]:
full_df14 = pd.read_csv(dataRFPath)

In [4]:
full_df14.shape[0]

26483412

In [7]:
full_df14.iloc[1:3, 0:10]

Unnamed: 0.1,Unnamed: 0,srch_destination_id,user_id,is_mobile,is_package,is_booking,hotel_cluster,is_alone,hotel_market,year
1,2,1,1195632,0,1,0,60,0.0,1537,2014
2,3,1,999935,0,0,0,30,0.0,1537,2014


In [8]:
full_df14.iloc[1:3, 11:23]

Unnamed: 0,day,hour,part_of_day,type_of_day,season,duration_of_stay,hotel_market.1,hotel_continent,X,dest_feature_pc1,dest_feature_pc2,dest_feature_pc3
1,28,13,evening,weekday,summer,1.0,1537,5,2.0,-6.605425,-0.249541,0.599817
2,7,9,evening,weekday,winter,1.0,1537,5,2.0,-6.605425,-0.249541,0.599817


In [5]:
full_df14.columns.values

array(['Unnamed: 0', 'srch_destination_id', 'user_id', 'is_mobile',
       'is_package', 'is_booking', 'hotel_cluster', 'is_alone',
       'hotel_market', 'year', 'month', 'day', 'hour', 'part_of_day',
       'type_of_day', 'season', 'duration_of_stay', 'hotel_market.1',
       'hotel_continent', 'X', 'dest_feature_pc1', 'dest_feature_pc2',
       'dest_feature_pc3'], dtype=object)

In [6]:
### Remove indices columns from the data : X and Unnamed: 0 an duplicate column

del full_df14['X']
del full_df14['Unnamed: 0']
del full_df14['hotel_market.1']

In [7]:
full_df14.columns.values

array(['srch_destination_id', 'user_id', 'is_mobile', 'is_package',
       'is_booking', 'hotel_cluster', 'is_alone', 'hotel_market', 'year',
       'month', 'day', 'hour', 'part_of_day', 'type_of_day', 'season',
       'duration_of_stay', 'hotel_continent', 'dest_feature_pc1',
       'dest_feature_pc2', 'dest_feature_pc3'], dtype=object)

In [8]:
# We have 20 predictors excluding response 'hotel_cluster'
full_df14.shape[1]

20

<h2 style="text-align: center;" markdown="1">Data Cleaning </h2>

### Missing Values

In [16]:
full_df14.apply(lambda x: sum(x.isnull()),axis=0) 

srch_destination_id         0
user_id                     0
is_mobile                   0
is_package                  0
is_booking                  0
hotel_cluster               0
is_alone                44100
hotel_market                0
year                        0
month                       0
day                         0
hour                        0
part_of_day                 0
type_of_day                 0
season                      0
duration_of_stay        11848
hotel_market.1              0
hotel_continent             0
dest_feature_pc1       133319
dest_feature_pc2       133319
dest_feature_pc3       133319
dtype: int64

### Imputing missing values

In [18]:
full_df14['is_alone'].value_counts()

0.0    21466123
1.0     4973189
Name: is_alone, dtype: int64

In [9]:
### It's obvious there should be atleast one person , so we will impute missing value as 1: true
full_df14['is_alone'].fillna(1 ,inplace=True)

In [10]:
### we will impute -1 for principal components : as we don't have pc features for all serch_dest_id
full_df14['dest_feature_pc1'].fillna(-1 ,inplace=True)
full_df14['dest_feature_pc2'].fillna(-1 ,inplace=True)
full_df14['dest_feature_pc3'].fillna(-1 ,inplace=True)

In [11]:
### missing values for duration_of_stay : 1
full_df14['duration_of_stay'].fillna(1 ,inplace=True)

In [67]:
### Verify all the  missing values are replaced 
count_cols = full_df14.count(axis=0)
count_cols

srch_destination_id    26483412
user_id                26483412
is_mobile              26483412
is_package             26483412
is_booking             26483412
hotel_cluster          26483412
is_alone               26483412
hotel_market           26483412
year                   26483412
month                  26483412
day                    26483412
hour                   26483412
part_of_day            26483412
type_of_day            26483412
season                 26483412
duration_of_stay       26483412
hotel_continent        26483412
dest_feature_pc1       26483412
dest_feature_pc2       26483412
dest_feature_pc3       26483412
dtype: int64

In [135]:
### Randomly select 50000 user_id
import random
unique_users = full_df14.user_id.unique()
sel_user_ids = [unique_users[i] for i in sorted(random.sample(range(len(unique_users)), 50000)) ]
rf_dat = full_df14[full_df14.user_id.isin(sel_user_ids)]

In [136]:
### downsampling 
rf_dat.shape

(1211346, 20)

In [None]:
### encode the categorical variable to numberic before applying the RF
#part_of_day, type_of_day, season

from sklearn.preprocessing import LabelEncoder
var_mod = ['part_of_day', 'type_of_day', 'season']
le = LabelEncoder()
for i in var_mod:
    rf_dat[i] = le.fit_transform(rf_dat[i])

### Split the Data in Training and Test Set 

In [138]:
train = rf_dat[rf_dat.month < 8]
test = rf_dat[rf_dat.month >= 8]

In [53]:
#free memory
del full_df14
import gc
gc.collect()

260

### Remove Clicks from test set

In [140]:
test = test[test.is_booking == 1]

In [141]:
train.shape

(583242, 20)

In [142]:
test.shape

(44321, 20)

### Binary Classifier with KFold validation

In [143]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
from itertools import chain

In [126]:
prob_booking = []
# get the number of unique clusters
unique_clusters = train['hotel_cluster'].unique()

In [127]:
len(unique_clusters)

100

In [144]:
train_c = train.ix[:, :]

In [145]:
train_c.is_copy = False

In [None]:
for cluster in unique_clusters:
    train_c.is_copy = False
    train_c["target"] = 1
    train_c["target"][train_c["hotel_cluster"] != cluster] = 0
    predictors_set1 = [col for col in train if col not in ["hotel_cluster", "target"]]
    probs = []
    cv = KFold(len(train_c["target"]), n_folds=2)
    clf = RandomForestClassifier(n_estimators=10, min_weight_fraction_leaf=0.1)
    for i, (tr, te) in enumerate(cv):
        clf.fit(train_c[predictors_set1].iloc[tr], train_c["target"].iloc[tr])
        preds = clf.predict_proba(train_c[predictors_set1].iloc[te])
        probs.append([p[1] for p in preds])             
    full_probs = chain.from_iterable(probs)
    prob_booking.append(list(full_probs))

In [166]:
prediction_frame.shape

(583242, 100)

In [None]:
prediction_frame = pd.DataFrame(prob_booking).T
prediction_frame.columns = unique_clusters
def find_top_5(row):
    return list(row.nlargest(5).index)

preds = []
for index, row in prediction_frame.iterrows():
    preds.append(find_top_5(row))

In [171]:
import ml_metrics as metrics

In [174]:
# evaluate accuracy for test set
test_set1 = train.iloc[te]

In [175]:
metrics.mapk([[l] for l in test_set1["hotel_cluster"]], preds, k=5)

0.030217759809250139

### Binary classification without KFold validation

In [178]:
test_c = test.ix[:, :]

In [None]:
for cluster in unique_clusters:
    train_c.is_copy = False
    train_c["target"] = 1
    train_c.is_copy = False
    train_c["target"][train_c["hotel_cluster"] != cluster] = 0
    test_c.is_copy = False
    test_c["target"] = 1
    test_c.is_copy = False
    test_c["target"][test_c["hotel_cluster"] != cluster] = 0
    predictors_set1 = [col for col in train if col not in ["hotel_cluster", "target"]]
    probs = []
    clf = RandomForestClassifier(n_estimators=10, min_weight_fraction_leaf=0.1)
    clf.fit(train_c[predictors_set1], train_c["target"])
    preds = clf.predict_proba(test_c[predictors_set1])
    probs.append([p[1] for p in preds])             
    full_probs = chain.from_iterable(probs)
    prob_booking.append(list(full_probs))

In [None]:
prediction_frame = pd.DataFrame(prob_booking).T

In [193]:
t = prediction_frame.iloc[:, 0:100]

In [194]:
t.columns = unique_clusters
def find_top_5(row):
    return list(row.nlargest(5).index)

preds = []
for index, row in t.iterrows():
    preds.append(find_top_5(row))

In [195]:
metrics.mapk([[l] for l in test_c["hotel_cluster"]], preds, k=5)

0.042281687386716602

In [196]:
del full_df14