In [1]:
# Initial imports

import numpy as np
import pandas as pd 
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib inline

# Imports for better visualization

from collections import defaultdict
import json

import scipy as sp

from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

# Global matplotlib styling applied to every figure drawn in this notebook.
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
# 'axes.color_cycle' was superseded by 'axes.prop_cycle' (matplotlib 1.5) and
# later removed, so assigning it raises KeyError on current releases. Prefer
# the new key and only fall back on releases that do not know it yet.
if 'axes.prop_cycle' in rcParams:
    rcParams['axes.prop_cycle'] = mpl.cycler(color=dark2_colors)
else:
    rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'



In [2]:
# Read the training and testing CSVs (in that order) from the Dataset/ folder.
train_data, test_data = map(pd.read_csv, ('Dataset/Train.csv', 'Dataset/Test.csv'))

In [3]:
# One-hot encode Location_Type into indicator columns prefixed 'location' and
# swap them in for the original categorical column of the training frame.
data_locations = pd.get_dummies(train_data['Location_Type'], prefix='location')

train_data = pd.concat([train_data.drop(['Location_Type'], axis=1), data_locations], axis=1)

In [4]:
# One-hot encode Park_ID into indicator columns prefixed 'Park' and swap them in
# for the original column of the training frame.
# NOTE(review): the test frame is encoded in a separate cell, so the two frames
# can end up with different Park_* columns (Park_19 shows up only in training).
data_parks = pd.get_dummies(train_data['Park_ID'], prefix='Park')

train_data = pd.concat([train_data.drop(['Park_ID'], axis=1), data_parks], axis=1)

In [5]:
train_data.head()

Unnamed: 0,ID,Date,Direction_Of_Wind,Average_Breeze_Speed,Max_Breeze_Speed,Min_Breeze_Speed,Var1,Average_Atmospheric_Pressure,Max_Atmospheric_Pressure,Min_Atmospheric_Pressure,...,Park_30,Park_31,Park_32,Park_33,Park_34,Park_35,Park_36,Park_37,Park_38,Park_39
0,3311712,01-09-1990,194.0,37.24,60.8,15.2,92.13,8225.0,8259.0,8211.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3311812,02-09-1990,285.0,32.68,60.8,7.6,14.11,8232.0,8280.0,8205.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3311912,03-09-1990,319.0,43.32,60.8,15.2,35.69,8321.0,8355.0,8283.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3312012,04-09-1990,297.0,25.84,38.0,7.6,0.0249,8379.0,8396.0,8358.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3312112,05-09-1990,207.0,28.88,45.6,7.6,0.83,8372.0,8393.0,8335.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Same Location_Type one-hot encoding as for the training frame, applied to the
# test frame. This overwrites the data_locations variable from the training cell.
data_locations = pd.get_dummies(test_data['Location_Type'], prefix='location')

test_data = pd.concat([test_data.drop(['Location_Type'], axis=1), data_locations], axis=1)

In [7]:
# Same Park_ID one-hot encoding as for the training frame, applied to the test
# frame. This overwrites the data_parks variable from the training cell.
# NOTE(review): only the parks present in the test data get a column here, so
# the result may lack Park_* columns that the training frame has.
data_parks = pd.get_dummies(test_data['Park_ID'], prefix='Park')

test_data = pd.concat([test_data.drop(['Park_ID'], axis=1), data_parks], axis=1)

In [8]:
test_data.head()

Unnamed: 0,ID,Date,Direction_Of_Wind,Average_Breeze_Speed,Max_Breeze_Speed,Min_Breeze_Speed,Var1,Average_Atmospheric_Pressure,Max_Atmospheric_Pressure,Min_Atmospheric_Pressure,...,Park_30,Park_31,Park_32,Park_33,Park_34,Park_35,Park_36,Park_37,Park_38,Park_39
0,3725712,01-01-2002,233.0,55.48,76.0,38.0,0.0249,8259.0,8300.0,8211.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3725812,02-01-2002,211.0,108.68,152.0,60.8,154.38,8208.0,8294.0,8136.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3725912,03-01-2002,237.0,95.76,121.6,83.6,34.86,8252.0,8304.0,8146.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3726012,04-01-2002,286.0,101.08,129.2,83.6,34.03,8146.0,8249.0,8092.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3726112,05-01-2002,281.0,63.08,83.6,45.6,4.98,8341.0,8376.0,8259.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
def prepare_data(df, is_train):
    """Impute missing weather readings and build the model feature frame.

    Missing values in each weather column are replaced by that column's
    median (computed on ``df`` itself), and ``Var1`` is then transformed with
    ``log(1 + x)``.

    The work is done on a copy, so ``df`` is left untouched. This keeps the
    calling cell safe to re-run (``Var1`` is not log-transformed twice) and
    avoids chained ``df[col].fillna(..., inplace=True)``, which silently does
    nothing when pandas copy-on-write is enabled.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ID``, ``Date`` and every column listed in
        ``median_filled_columns`` below; also ``Footfall`` when ``is_train``
        is truthy.
    is_train : bool or int
        Truthy for training data (the ``Footfall`` target is split off and
        returned), falsy for test data.

    Returns
    -------
    (features, target) when ``is_train`` is truthy, where ``features`` is the
    prepared frame without ``ID``, ``Footfall`` and ``Date`` and ``target`` is
    the ``Footfall`` column; otherwise just the prepared frame without ``ID``
    and ``Date``.
    """
    median_filled_columns = (
        'Average_Breeze_Speed',
        'Min_Breeze_Speed',
        'Max_Breeze_Speed',
        'Direction_Of_Wind',
        'Average_Atmospheric_Pressure',
        'Max_Atmospheric_Pressure',
        'Min_Atmospheric_Pressure',
        'Average_Moisture_In_Park',
        'Max_Moisture_In_Park',
        'Min_Moisture_In_Park',
        'Max_Ambient_Pollution',
        'Min_Ambient_Pollution',
        'Var1',
    )

    prepared = df.copy()
    for column in median_filled_columns:
        # Series.median() skips NaN, matching np.median(series.dropna()).
        prepared[column] = prepared[column].fillna(prepared[column].median())

    # log1p(x) == log(1 + x), but stays accurate for values close to zero.
    prepared['Var1'] = np.log1p(prepared['Var1'])

    if is_train:
        return prepared.drop(['ID', 'Footfall', 'Date'], axis=1), prepared['Footfall']
    return prepared.drop(['ID', 'Date'], axis=1)

In [10]:
# Prepare the training frame and split it into model features and the Footfall target.
train_features, train_target = prepare_data(train_data, is_train=True)

In [11]:
# Prepare the test frame the same way; there is no target column to split off.
test_features = prepare_data(test_data, is_train=False)

In [12]:
train_features.isnull().any()

Direction_Of_Wind               False
Average_Breeze_Speed            False
Max_Breeze_Speed                False
Min_Breeze_Speed                False
Var1                            False
Average_Atmospheric_Pressure    False
Max_Atmospheric_Pressure        False
Min_Atmospheric_Pressure        False
Min_Ambient_Pollution           False
Max_Ambient_Pollution           False
Average_Moisture_In_Park        False
Max_Moisture_In_Park            False
Min_Moisture_In_Park            False
location_1                      False
location_2                      False
location_3                      False
location_4                      False
Park_12                         False
Park_13                         False
Park_14                         False
Park_15                         False
Park_16                         False
Park_17                         False
Park_18                         False
Park_19                         False
Park_20                         False
Park_21     

In [13]:
train_features.head()

Unnamed: 0,Direction_Of_Wind,Average_Breeze_Speed,Max_Breeze_Speed,Min_Breeze_Speed,Var1,Average_Atmospheric_Pressure,Max_Atmospheric_Pressure,Min_Atmospheric_Pressure,Min_Ambient_Pollution,Max_Ambient_Pollution,...,Park_30,Park_31,Park_32,Park_33,Park_34,Park_35,Park_36,Park_37,Park_38,Park_39
0,194.0,37.24,60.8,15.2,4.533996,8225.0,8259.0,8211.0,92.0,304.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,285.0,32.68,60.8,7.6,2.715357,8232.0,8280.0,8205.0,172.0,332.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,319.0,43.32,60.8,15.2,3.602504,8321.0,8355.0,8283.0,236.0,292.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,297.0,25.84,38.0,7.6,0.024595,8379.0,8396.0,8358.0,272.0,324.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,207.0,28.88,45.6,7.6,0.604316,8372.0,8393.0,8335.0,236.0,332.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
train_target.head()

0    1406
1    1409
2    1386
3    1365
4    1413
Name: Footfall, dtype: int64

In [15]:
# Parse the day-month-year strings (e.g. 01-09-1990) in 'Date' into datetimes;
# the explicit format rules out month-first interpretation of ambiguous dates.
train_data['Date'] = pd.to_datetime(train_data['Date'], format='%d-%m-%Y')
test_data['Date'] = pd.to_datetime(test_data['Date'], format='%d-%m-%Y')

In [16]:
def _week_of_year(dates):
    """Return the ISO week number of a datetime Series on old and new pandas."""
    try:
        # pandas >= 1.1: `.dt.weekofyear` was deprecated there and removed in 2.0.
        # Cast back to int64, the dtype `.dt.weekofyear` used to return.
        # NOTE(review): the cast assumes there are no missing dates - confirm.
        return dates.dt.isocalendar().week.astype('int64')
    except AttributeError:
        # Older pandas has no `isocalendar()` on the .dt accessor.
        return dates.dt.weekofyear


# Calendar columns derived from the parsed 'Date' column of each frame.
train_data['day'] = train_data['Date'].dt.day
test_data['day'] = test_data['Date'].dt.day
train_data['month'] = train_data['Date'].dt.month
test_data['month'] = test_data['Date'].dt.month
train_data['weekday'] = train_data['Date'].dt.dayofweek
test_data['weekday'] = test_data['Date'].dt.dayofweek
train_data['week'] = _week_of_year(train_data['Date'])
test_data['week'] = _week_of_year(test_data['Date'])

In [17]:
# Calendar-derived model inputs, added one frame at a time.
# 'month' is stored as the absolute distance, in months, from July (month 7).
train_features['weekday'] = train_data['Date'].dt.dayofweek
train_features['month'] = (train_data['month'] - 7).abs()
train_features['week'] = train_data['week']
train_features['year'] = train_data['Date'].dt.year

test_features['weekday'] = test_data['Date'].dt.dayofweek
test_features['month'] = (test_data['month'] - 7).abs()
test_features['week'] = test_data['week']
test_features['year'] = test_data['Date'].dt.year

In [18]:
train_features

Unnamed: 0,Direction_Of_Wind,Average_Breeze_Speed,Max_Breeze_Speed,Min_Breeze_Speed,Var1,Average_Atmospheric_Pressure,Max_Atmospheric_Pressure,Min_Atmospheric_Pressure,Min_Ambient_Pollution,Max_Ambient_Pollution,...,Park_33,Park_34,Park_35,Park_36,Park_37,Park_38,Park_39,weekday,month,week
0,194.0,37.24,60.8,15.2,4.533996,8225.0,8259.0,8211.0,92.0,304.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,2,35
1,285.0,32.68,60.8,7.6,2.715357,8232.0,8280.0,8205.0,172.0,332.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,2,35
2,319.0,43.32,60.8,15.2,3.602504,8321.0,8355.0,8283.0,236.0,292.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,36
3,297.0,25.84,38.0,7.6,0.024595,8379.0,8396.0,8358.0,272.0,324.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,2,36
4,207.0,28.88,45.6,7.6,0.604316,8372.0,8393.0,8335.0,236.0,332.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,2,36
5,243.0,74.48,106.4,45.6,4.902530,8263.0,8331.0,8232.0,140.0,320.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,2,36
6,241.0,57.76,68.4,38.0,3.285787,8311.0,8331.0,8263.0,132.0,312.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,2,36
7,227.0,34.20,68.4,15.2,3.991204,8352.0,8376.0,8324.0,88.0,276.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,2,36
8,211.0,17.48,30.4,7.6,0.024595,8400.0,8413.0,8376.0,152.0,292.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,2,36
9,221.0,24.32,53.2,7.6,0.024595,8393.0,8410.0,8386.0,16.0,208.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,37


In [19]:
test_features

Unnamed: 0,Direction_Of_Wind,Average_Breeze_Speed,Max_Breeze_Speed,Min_Breeze_Speed,Var1,Average_Atmospheric_Pressure,Max_Atmospheric_Pressure,Min_Atmospheric_Pressure,Min_Ambient_Pollution,Max_Ambient_Pollution,...,Park_33,Park_34,Park_35,Park_36,Park_37,Park_38,Park_39,weekday,month,week
0,233.0,55.48,76.0,38.0,0.024595,8259.0,8300.0,8211.0,260.0,316.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,6,1
1,211.0,108.68,152.0,60.8,5.045874,8208.0,8294.0,8136.0,120.0,280.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,6,1
2,237.0,95.76,121.6,83.6,3.579622,8252.0,8304.0,8146.0,236.0,292.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,6,1
3,286.0,101.08,129.2,83.6,3.556205,8146.0,8249.0,8092.0,204.0,284.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,6,1
4,281.0,63.08,83.6,45.6,1.788421,8341.0,8376.0,8259.0,144.0,316.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,6,1
5,269.0,69.92,76.0,60.8,2.768832,8321.0,8345.0,8294.0,220.0,280.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,6,1
6,261.0,47.12,68.4,15.2,0.024595,8379.0,8413.0,8345.0,240.0,276.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,6,2
7,245.0,44.08,60.8,22.8,2.315501,8417.0,8454.0,8400.0,96.0,296.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,6,2
8,224.0,33.44,45.6,15.2,0.000000,8458.0,8468.0,8451.0,252.0,296.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,6,2
9,212.0,47.88,60.8,38.0,2.033398,8447.0,8458.0,8420.0,216.0,296.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,6,2


In [20]:
train_features.columns

Index([u'Direction_Of_Wind', u'Average_Breeze_Speed', u'Max_Breeze_Speed',
       u'Min_Breeze_Speed', u'Var1', u'Average_Atmospheric_Pressure',
       u'Max_Atmospheric_Pressure', u'Min_Atmospheric_Pressure',
       u'Min_Ambient_Pollution', u'Max_Ambient_Pollution',
       u'Average_Moisture_In_Park', u'Max_Moisture_In_Park',
       u'Min_Moisture_In_Park', u'location_1', u'location_2', u'location_3',
       u'location_4', u'Park_12', u'Park_13', u'Park_14', u'Park_15',
       u'Park_16', u'Park_17', u'Park_18', u'Park_19', u'Park_20', u'Park_21',
       u'Park_22', u'Park_23', u'Park_24', u'Park_25', u'Park_26', u'Park_27',
       u'Park_28', u'Park_29', u'Park_30', u'Park_31', u'Park_32', u'Park_33',
       u'Park_34', u'Park_35', u'Park_36', u'Park_37', u'Park_38', u'Park_39',
       u'weekday', u'month', u'week'],
      dtype='object')

In [21]:
test_features.columns

Index([u'Direction_Of_Wind', u'Average_Breeze_Speed', u'Max_Breeze_Speed',
       u'Min_Breeze_Speed', u'Var1', u'Average_Atmospheric_Pressure',
       u'Max_Atmospheric_Pressure', u'Min_Atmospheric_Pressure',
       u'Min_Ambient_Pollution', u'Max_Ambient_Pollution',
       u'Average_Moisture_In_Park', u'Max_Moisture_In_Park',
       u'Min_Moisture_In_Park', u'location_1', u'location_2', u'location_3',
       u'location_4', u'Park_12', u'Park_13', u'Park_14', u'Park_15',
       u'Park_16', u'Park_17', u'Park_18', u'Park_20', u'Park_21', u'Park_22',
       u'Park_23', u'Park_24', u'Park_25', u'Park_26', u'Park_27', u'Park_28',
       u'Park_29', u'Park_30', u'Park_31', u'Park_32', u'Park_33', u'Park_34',
       u'Park_35', u'Park_36', u'Park_37', u'Park_38', u'Park_39', u'weekday',
       u'month', u'week'],
      dtype='object')

In [22]:
# Keep only the feature columns that exist in both frames, in training order,
# so the model sees identical inputs at fit and predict time (Park_19, for
# example, is one-hot encoded in the training data only). Selecting by name
# instead of `del` keeps the cell safe to re-run and free of hard-coded parks.
shared_columns = train_features.columns[train_features.columns.isin(test_features.columns)]
train_features = train_features[shared_columns]
test_features = test_features[shared_columns]
train_features.columns == test_features.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True], dtype=bool)

In [23]:
from sklearn.ensemble import RandomForestRegressor
try:
    # scikit-learn >= 0.18 ships GridSearchCV in model_selection.
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # Older releases only have the (since removed) sklearn.grid_search module.
    from sklearn.grid_search import GridSearchCV

In [24]:
# # Tuning hyper-parameters for RMSE

# # Set the parameters by cross-validation
# # n_range = range(100, 150, 50)
# tuned_parameters = {'n_estimators': [100]}
# # random_state_range = range(11)
# # tuned_parameters = {'n_estimators': n_range, 'random_state': random_state_range}

# rf = GridSearchCV(RandomForestRegressor(), tuned_parameters, cv=5, scoring='mean_squared_error')
# rf.fit(train_features, train_target)

# print rf.grid_scores_
# print 'Average Error = ' + str(np.sqrt(np.abs(rf.grid_scores_[0][1])))
# print 'Min Error = ' + str(np.sqrt(np.abs(rf.best_score_)))
# print 'Best parameters = '
# print rf.best_params_
# print rf.best_estimator_

In [25]:
# predicted = rf.predict(test_features)

In [26]:
# solution = pd.DataFrame(test_data['ID'])

In [27]:
# solution = pd.concat([solution, pd.DataFrame(predicted, columns=['Footfall'])], axis=1)

In [28]:
# solution

In [29]:
# solution.to_csv('solution_rf.csv', index=False)

In [30]:
# Import XGBRegressor (the scikit-learn style wrapper around XGBoost regression)
from xgboost.sklearn import XGBRegressor

In [37]:
# Single-candidate grid: 700 boosting rounds, scored by 5-fold cross-validated MSE.
# Commented-out alternatives kept for reference:
#   n_range = range(100, 350, 50)
#   max_depth = range(3, 9, 1)
#   random_state_range = range(11)
#   tuned_parameters = {'n_estimators': n_range, 'random_state': random_state_range}
params = {'n_estimators': [700]}

xgb = GridSearchCV(
    estimator=XGBRegressor(),
    param_grid=params,
    cv=5,
    scoring='mean_squared_error',
    verbose=True,
)
xgb.fit(train_features, train_target)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 10.1min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [700]}, pre_dispatch='2*n_jobs',
       refit=True, scoring='mean_squared_error', verbose=True)

In [32]:
# Report the cross-validated RMSE: the grid search is scored on (signed) mean
# squared error, hence the abs() before the square root.
# Single-argument print(...) emits identical text on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print('Min Error = ' + str(np.sqrt(np.abs(xgb.best_score_))))
print('Best parameters = ')
print(xgb.best_params_)
print(xgb.best_estimator_)

Min Error = 90.4887960714
Best parameters = 
{'n_estimators': 700}
XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=700, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)


In [33]:
# Predict Footfall for the test features with the fitted grid search object
# (its repr above shows refit=True, so presumably the best estimator refit on
# all training rows is used - confirm against the scikit-learn docs).
predicted_xgb = xgb.predict(test_features)

In [34]:
# Start the submission frame with the test-set IDs as its only column.
solution_xgb = test_data['ID'].to_frame()

In [35]:
# Build the submission frame in one step from the test IDs and the predictions.
# Pairing them by position avoids index-alignment surprises, and rebuilding the
# frame (rather than concatenating onto itself) keeps this cell idempotent:
# re-running it no longer appends a second 'Footfall' column.
solution_xgb = pd.DataFrame(
    {'ID': test_data['ID'].values, 'Footfall': predicted_xgb},
    columns=['ID', 'Footfall'],
)

In [36]:
# Write the submission CSV; index=False leaves the DataFrame index out of the file.
solution_xgb.to_csv('solution_xgb_90_median.csv', index=False)