In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation and pandas assignment warnings
# raised by later cells - confirm that blanket suppression is intended.
warnings.simplefilter('ignore')

In [27]:
# Load the training set with explicit column dtypes so pandas does not have to
# infer them. Columns not listed here (e.g. date_time) keep their inferred dtype.
# The builtin `float` replaces the `np.float` alias (deprecated in NumPy 1.20,
# removed in 1.24); it is the very same type, so the parsed frame is unchanged.
# 'booking_bool' used to appear twice in the mapping; it is now listed once.
train_dtypes = {
    # identifiers
    'srch_destination_id': np.int32,
    'site_id': np.int32,
    'visitor_location_country_id': np.int32,
    'prop_country_id': np.int32,
    'prop_id': np.int32,
    # visitor history
    'visitor_hist_starrating': float,
    'visitor_hist_adr_usd': float,
    # property attributes
    'prop_starrating': np.int32,
    'prop_review_score': float,
    'prop_brand_bool': bool,
    'prop_location_score1': float,
    'prop_location_score2': float,
    'prop_log_historical_price': float,
    'price_usd': float,
    'promotion_flag': bool,
    # search attributes
    'srch_length_of_stay': np.int32,
    'srch_booking_window': np.int32,
    'srch_adults_count': np.int32,
    'srch_children_count': np.int32,
    'srch_room_count': np.int32,
    'srch_saturday_night_bool': bool,
    'srch_query_affinity_score': float,
    'orig_destination_distance': float,
    'random_bool': bool,
    # outcome columns
    'position': np.int32,
    'click_bool': bool,
    'booking_bool': bool,
    'gross_bookings_usd': float,
}
# read_csv opens and closes the file itself when given a path.
df = pd.read_csv('trainingset.csv', dtype=train_dtypes)

In [28]:
with open('testset.csv', 'r') as csvfile:
    dfTest = pd.read_csv(csvfile, dtype={'booking_bool':bool,'srch_destination_id':np.int32, 
                                     'site_id':np.int32, 'visitor_location_country_id':np.int32,
                                     'visitor_hist_starrating':np.float, 'visitor_hist_adr_usd':np.float,
                                     'prop_country_id':np.int32, 'prop_id':np.int32, 
                                     'prop_starrating':np.int32, 'prop_review_score':np.float,
                                     'prop_brand_bool':bool, 'prop_location_score1':np.float,
                                     'prop_location_score2':np.float, 'prop_log_historical_price':np.float,
                                     'price_usd':np.float, 'promotion_flag':bool,\
                                     'srch_length_of_stay':np.int32, 'srch_booking_window':np.int32,
                                     'srch_adults_count':np.int32, 'srch_children_count':np.int32,
                                     'srch_room_count':np.int32, 'srch_saturday_night_bool':bool,
                                     'srch_query_affinity_score':np.float, 'orig_destination_distance':np.float,
                                     'random_bool':bool})

### Dates

In [32]:
# Parse the search timestamp once, then expose its calendar year and month
# as separate columns.
df["date_time"] = pd.to_datetime(df["date_time"])
search_moment = df["date_time"].dt
df["year"] = search_moment.year
df["month"] = search_moment.month

### Remove `price_usd` outliers

In [33]:
# Keep only rows whose price lies strictly between the 2.5th and 97.5th
# percentiles; both cut-offs are computed on the unfiltered frame.
low2point5 = df['price_usd'].quantile(0.025)
top2point5 = df['price_usd'].quantile(0.975)
within_price_band = (df['price_usd'] < top2point5) & (df['price_usd'] > low2point5)
df = df[within_price_band]

### Fill in missing values with the worst-case scenario

In [34]:
# Replace missing secondary location scores with 0.
# Assigning the filled column back to the frame avoids chained assignment
# (df.col.loc[mask] = ...), which may write to a temporary object instead of df.
df['prop_location_score2'] = df['prop_location_score2'].fillna(0)

In [35]:
# Replace missing visitor-history values with 0.
# Assigning the filled columns back to the frame avoids chained assignment
# (df.col.loc[mask] = ...), which may write to a temporary object instead of df.
df['visitor_hist_starrating'] = df['visitor_hist_starrating'].fillna(0)
df['visitor_hist_adr_usd'] = df['visitor_hist_adr_usd'].fillna(0)

In [36]:
# Replace missing distance, review-score and query-affinity values with 0.
# Assigning the filled columns back to the frame avoids chained assignment
# (df.col[mask] = ...), which may write to a temporary object instead of df.
# NOTE(review): if srch_query_affinity_score is a log-probability (values <= 0),
# then 0 is its best value rather than its worst - confirm the intended fill.
for column in ('orig_destination_distance',
               'prop_review_score',
               'srch_query_affinity_score'):
    df[column] = df[column].fillna(0)


In [37]:
# Zero-fill missing values in the columns at positions 27 through 50.
# NOTE(review): presumably the competitor (comp*) columns - the selection is
# positional, so confirm it against the CSV column order.
zero_fill_columns = slice(27, 51)
df.iloc[:, zero_fill_columns] = df.iloc[:, zero_fill_columns].fillna(value=0, axis=0)

### Composite features

In [65]:
# For every row, the number of rows in df that share its prop_id.
rows_per_property = df.groupby('prop_id')['prop_id']
df['prop_id_counts'] = rows_per_property.transform('count')

In [38]:
# Absolute gap between the listed price and the visitor's historical
# average daily rate.
price_gap = df['price_usd'] - df['visitor_hist_adr_usd']
df['usd_diff'] = price_gap.abs()

In [39]:
# Absolute gap between the property's star rating and the visitor's
# historical star rating.
star_gap = df['prop_starrating'] - df['visitor_hist_starrating']
df['star_diff'] = star_gap.abs()

In [40]:
# Replace missing log historical prices with 0, then compare the
# de-logged historical price with the current price.
# Assigning the filled column back to the frame avoids chained assignment
# (df.col.loc[mask] = ...), which may write to a temporary object instead of df.
df['prop_log_historical_price'] = df['prop_log_historical_price'].fillna(0)
# Rows filled with 0 contribute exp(0) == 1.0 as their historical price.
df['diff_hist_price'] = np.exp(df['prop_log_historical_price']) - df['price_usd']

In [41]:
# Total price: listed price multiplied by the number of rooms searched for.
df['total_price'] = df['price_usd'].mul(df['srch_room_count'])

In [42]:
# Price per person: room-adjusted price divided by the party size
# (adults plus children).
party_size = df['srch_adults_count'] + df['srch_children_count']
room_adjusted_price = df['price_usd'] * df['srch_room_count']
df['price_per_pers'] = room_adjusted_price / party_size

In [43]:
# Ratio of the two location scores; the same small offset is added to both
# terms so a zero prop_location_score1 does not divide by zero.
score_offset = 0.00001
shifted_score2 = df['prop_location_score2'] + score_offset
shifted_score1 = df['prop_location_score1'] + score_offset
df['prop_score'] = shifted_score2 / shifted_score1

In [44]:
# Party size: adults plus children in the search.
df['nr_pers'] = df['srch_adults_count'].add(df['srch_children_count'])

### Rank features

In [64]:
# Rank each listing within its own search (srch_id), lowest value first;
# ties share the smallest rank of their group (method="min").
per_search = df.groupby('srch_id', sort=False)
rank_sources = [
    ('price_rank', 'price_usd'),
    ('stars_rank', 'prop_starrating'),
    ('score_rank', 'prop_location_score2'),
]
for rank_column, source_column in rank_sources:
    df[rank_column] = per_search[source_column].rank(ascending=True, method="min")

### Estimate position:

In [57]:
# Compare the property ids seen in the training and test sets.
# The print calls are written so they run (and print the same text) on both
# Python 2 and Python 3; the original print statements are a SyntaxError on 3.
train_prop_ids = set(df.prop_id.unique())
test_prop_ids = set(dfTest.prop_id.unique())

# Properties present in both sets.
print(len(train_prop_ids & test_prop_ids))
# Distinct properties in the training set, then in the test set.
print("%d %d" % (len(train_prop_ids), len(test_prop_ids)))
# Properties that occur only in the test set.
print(len(test_prop_ids - train_prop_ids))



117706
124574 129438
11732


In [52]:
# Estimate a listing's position as the mean observed position of its
# (prop_id, month, srch_destination_id) group, truncated to an int32.
# The aggregation is named by string instead of passing the np.mean callable,
# and the column is assigned with bracket syntax rather than attribute access.
group_mean_position = (
    df.groupby(['prop_id', 'month', 'srch_destination_id'])['position']
      .transform('mean')
)
# astype truncates toward zero, exactly like the former np.int32(...) cast.
df['position_estimate'] = group_mean_position.astype(np.int32)

In [61]:
# Mean absolute error between the observed position and position_estimate.
# NOTE(review): the estimate is a group mean over these same rows, so this is
# an in-sample error - confirm whether a held-out figure is wanted.
from sklearn.metrics import mean_absolute_error
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(mean_absolute_error(df.position, df.position_estimate))

4.6993649456278


In [66]:
# Columns written to the engineered training file, in output order.
header = [
    # search / property identifiers and time
    'srch_id',
    'prop_country_id',
    'year',
    'month',
    # location and price-history features
    'prop_score',
    'prop_location_score1',
    'prop_location_score2',
    'diff_hist_price',
    'usd_diff',
    'star_diff',
    'srch_query_affinity_score',
    'orig_destination_distance',
    'prop_review_score',
    'position',
    # within-search ranks
    'price_rank',
    'stars_rank',
    'score_rank',
    # outcomes
    'booking_bool',
    'click_bool',
    # price and party features
    'price_per_pers',
    'random_bool',
    'price_usd',
    'total_price',
    'nr_pers',
    'prop_id_counts',
    # selected competitor rate columns
    'comp2_rate',
    'comp5_rate',
    'comp8_rate',
]

In [67]:
# Write the selected columns (listed in `header`) of the engineered frame to disk.
engineered_train_path = 'new_train.csv'
df.to_csv(engineered_train_path, columns=header)

In [63]:
# True if any position_estimate value is missing.
df['position_estimate'].isnull().any()

False