In [None]:
<h2 style="text-align: center;" markdown="1">Random Forest Model</h2>

In [2]:
import pandas as pd

In [3]:
train = pd.read_csv('/home/smita/expedia/setting2/train_S2_A1.csv')

In [20]:
test = pd.read_csv('/home/smita/expedia/setting2/test_S2_A1.csv')

In [4]:
train.shape[0] ## 4M

4494597

In [21]:
test.shape[0]

612239

In [5]:
train = train.iloc[:,1:]

In [6]:
train.head(1)

Unnamed: 0,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,channel,...,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster,month,year,day,time
0,2,3,66,462,41898,2716.6746,1482,0,0,1,...,0,1,2,50,214,28,2,2013,15,13:18:43


In [8]:
train.columns.values

array(['site_name', 'posa_continent', 'user_location_country',
       'user_location_region', 'user_location_city',
       'orig_destination_distance', 'user_id', 'is_mobile', 'is_package',
       'channel', 'srch_ci', 'srch_co', 'srch_adults_cnt',
       'srch_children_cnt', 'srch_rm_cnt', 'srch_destination_id',
       'srch_destination_type_id', 'is_booking', 'cnt', 'hotel_continent',
       'hotel_country', 'hotel_market', 'hotel_cluster', 'month', 'year',
       'day', 'time'], dtype=object)

In [None]:
<h2 style="text-align: center;" markdown="1">Feature Engineering</h2>

In [None]:
#### Function to Create popularity score based on the click/booking for pair of (user_city, srch_destination)

In [7]:
from collections import defaultdict
def createCityDestScoreDict(df):
    to_from_dict = {}
    columns = df.columns.values
    for index, row in df.iterrows():
            user_city = str(row['user_location_city'])
            srch_dest = str(row['srch_destination_id'])
            is_booking = row['is_booking']
            hotel_cluster = int(row['hotel_cluster'])
              
            if is_booking == '1':
                is_booking = 1 * 0.75
            else:
                is_booking = 1 * 0.25
    
            if user_city and srch_dest:   
                if (user_city,srch_dest) in to_from_dict:
                    if not to_from_dict[(user_city,srch_dest)].get(hotel_cluster):
                        to_from_dict[(user_city,srch_dest)][hotel_cluster] = is_booking
                    else:
                         to_from_dict[(user_city,srch_dest)][hotel_cluster] +=is_booking
                else:
                     to_from_dict[(user_city, srch_dest)] = {hotel_cluster: is_booking}
    return to_from_dict   

In [None]:
#### Fuction to get first n (key:values) from dictionary

In [8]:
from itertools import islice
def sliceDict(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

In [None]:
#### Create popularity_score and add to train set

In [9]:
def createPopScore(df, to_from_dict):
    for index, row in df.iterrows():
        userCity = str(row['user_location_city'])
        srchCity = str(row['srch_destination_id'])
        hotel_cluster = int(row['hotel_cluster']) 
        pop_score = to_from_dict[(userCity, srchCity)].get(hotel_cluster, None)
        df.set_value(index,'pop_score', pop_score)
    return df   

In [None]:
#### Function Duration_of_Stay

In [10]:
from datetime import datetime
def durationOfStay(df):
    df["srch_ci"] = pd.to_datetime(df["srch_ci"], format='%Y-%m-%d', errors="coerce")
    df["srch_co"] = pd.to_datetime(df["srch_co"], format='%Y-%m-%d', errors="coerce")
    df["time_of_stay"] = (df["srch_co"]-df["srch_ci"]).astype('timedelta64[h]')
    df["time_of_stay"] = df["time_of_stay"]/24
    return df 

In [None]:
#### Get the (user-city, source: score) dictionry and print 1st five

In [11]:
to_from_dict = createCityDestScoreDict(train)
items = sliceDict(5, to_from_dict.iteritems())
items

[(('49966', '8259'),
  {36: 0.25, 42: 0.25, 46: 0.5, 58: 0.5, 69: 0.25, 90: 0.25, 97: 1.5}),
 (('25538', '12243'), {5: 0.25, 37: 0.5, 53: 0.25, 55: 0.25}),
 (('3169', '25506'), {72: 0.25}),
 (('7317', '1455'), {28: 0.25}),
 (('876', '4330'), {32: 0.25})]

In [None]:
### 1. Add pop_score in Train

In [13]:
train = createPopScore(train, to_from_dict=to_from_dict)

In [14]:
train.iloc[0:1, 15:]

Unnamed: 0,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster,month,year,day,time,pop_score
0,8857,1,0,1,2,50,214,28,2,2013,15,13:18:43,0.5


In [None]:
### 2.  Add duration_of_stay in Train

In [15]:
train = durationOfStay(train)

In [16]:
train.iloc[0:1, 15:]

Unnamed: 0,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster,month,year,day,time,pop_score,time_of_stay
0,8857,1,0,1,2,50,214,28,2,2013,15,13:18:43,0.5,5.0


### 3. Missing Values

In [20]:
train.to_csv('/home/smita/expedia/train_RF.csv')

In [21]:
missing_count = train.count(axis=0)
missing_count

site_name                    21098864
posa_continent               21098864
user_location_country        21098864
user_location_region         21098864
user_location_city           21098864
orig_destination_distance    13695961
user_id                      21098864
is_mobile                    21098864
is_package                   21098864
channel                      21098864
srch_ci                      21057070
srch_co                      21057072
srch_adults_cnt              21098864
srch_children_cnt            21098864
srch_rm_cnt                  21098864
srch_destination_id          21098864
srch_destination_type_id     21098864
is_booking                   21098864
cnt                          21098864
hotel_continent              21098864
hotel_country                21098864
hotel_market                 21098864
hotel_cluster                21098864
month                        21098864
year                         21098864
day                          21098864
time        

##### As of now, we will fill NA with -1:
TODO: better way to impute missing values

In [17]:
train.fillna(-1, inplace=True)

#### Remove Redundant variables (srch_ci, srch_co, time, hotel_cluster) from train

In [18]:
# Exclude Target and Predictors
predictors = [c for c in train.columns if c not in ['hotel_cluster', 'srch_ci', 'srch_co', 'time']]

### Model

In [19]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

model_RF = RandomForestClassifier(n_estimators=5, min_weight_fraction_leaf=0.1)
scores = cross_validation.cross_val_score(model_RF, train[predictors], train['hotel_cluster'], cv=3)
scores



array([ 0.06682218,  0.06692226,  0.05960888])