In [1]:
# Initial imports

import numpy as np
import pandas as pd 
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib inline

# Imports for better visualization

from collections import defaultdict
import json

import scipy as sp

from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'



In [2]:
# Load training data
train_data = pd.read_csv('Dataset/Train.csv')
# Load testing data
test_data = pd.read_csv('Dataset/Test.csv')

In [3]:
data_locations = pd.get_dummies(train_data['Location_Type'], prefix='location')

train_data = pd.concat([train_data.drop(['Location_Type'], axis=1), data_locations], axis=1)

In [4]:
train_data.head()

Unnamed: 0,ID,Park_ID,Date,Direction_Of_Wind,Average_Breeze_Speed,Max_Breeze_Speed,Min_Breeze_Speed,Var1,Average_Atmospheric_Pressure,Max_Atmospheric_Pressure,...,Min_Ambient_Pollution,Max_Ambient_Pollution,Average_Moisture_In_Park,Max_Moisture_In_Park,Min_Moisture_In_Park,Footfall,location_1,location_2,location_3,location_4
0,3311712,12,01-09-1990,194.0,37.24,60.8,15.2,92.13,8225.0,8259.0,...,92.0,304.0,255.0,288.0,222.0,1406,0.0,0.0,1.0,0.0
1,3311812,12,02-09-1990,285.0,32.68,60.8,7.6,14.11,8232.0,8280.0,...,172.0,332.0,252.0,297.0,204.0,1409,0.0,0.0,1.0,0.0
2,3311912,12,03-09-1990,319.0,43.32,60.8,15.2,35.69,8321.0,8355.0,...,236.0,292.0,219.0,279.0,165.0,1386,0.0,0.0,1.0,0.0
3,3312012,12,04-09-1990,297.0,25.84,38.0,7.6,0.0249,8379.0,8396.0,...,272.0,324.0,225.0,261.0,192.0,1365,0.0,0.0,1.0,0.0
4,3312112,12,05-09-1990,207.0,28.88,45.6,7.6,0.83,8372.0,8393.0,...,236.0,332.0,234.0,273.0,183.0,1413,0.0,0.0,1.0,0.0


In [5]:
data_locations = pd.get_dummies(test_data['Location_Type'], prefix='location')

test_data = pd.concat([test_data.drop(['Location_Type'], axis=1), data_locations], axis=1)

In [6]:
test_data.head()

Unnamed: 0,ID,Park_ID,Date,Direction_Of_Wind,Average_Breeze_Speed,Max_Breeze_Speed,Min_Breeze_Speed,Var1,Average_Atmospheric_Pressure,Max_Atmospheric_Pressure,Min_Atmospheric_Pressure,Min_Ambient_Pollution,Max_Ambient_Pollution,Average_Moisture_In_Park,Max_Moisture_In_Park,Min_Moisture_In_Park,location_1,location_2,location_3,location_4
0,3725712,12,01-01-2002,233.0,55.48,76.0,38.0,0.0249,8259.0,8300.0,8211.0,260.0,316.0,243.0,285.0,210.0,0.0,0.0,1.0,0.0
1,3725812,12,02-01-2002,211.0,108.68,152.0,60.8,154.38,8208.0,8294.0,8136.0,120.0,280.0,252.0,291.0,201.0,0.0,0.0,1.0,0.0
2,3725912,12,03-01-2002,237.0,95.76,121.6,83.6,34.86,8252.0,8304.0,8146.0,236.0,292.0,234.0,270.0,207.0,0.0,0.0,1.0,0.0
3,3726012,12,04-01-2002,286.0,101.08,129.2,83.6,34.03,8146.0,8249.0,8092.0,204.0,284.0,228.0,264.0,201.0,0.0,0.0,1.0,0.0
4,3726112,12,05-01-2002,281.0,63.08,83.6,45.6,4.98,8341.0,8376.0,8259.0,144.0,316.0,237.0,279.0,213.0,0.0,0.0,1.0,0.0


In [7]:
def prepare_data(df, is_train):
    df['Average_Breeze_Speed'].fillna(np.mean(df['Average_Breeze_Speed'].dropna()), inplace=True)
    df['Min_Breeze_Speed'].fillna(np.mean(df['Min_Breeze_Speed'].dropna()), inplace=True)
    df['Max_Breeze_Speed'].fillna(np.mean(df['Max_Breeze_Speed'].dropna()), inplace=True)
    df['Direction_Of_Wind'].fillna(np.mean(df['Direction_Of_Wind'].dropna()), inplace=True)
    df['Average_Wind_Speed'] = np.abs(df['Average_Breeze_Speed']*np.cos((df['Direction_Of_Wind'] % 180)*(np.pi/180)))
    df['Max_Wind_Speed'] = np.abs(df['Max_Breeze_Speed']*np.cos((df['Direction_Of_Wind'] % 180)*(np.pi/180)))
    df['Min_Wind_Speed'] = np.abs(df['Min_Breeze_Speed']*np.cos((df['Direction_Of_Wind'] % 180)*(np.pi/180)))
#     df_locations = pd.get_dummies(df['Location_Type'], prefix='location')
#     df = pd.concat([df.drop(['Location_Type'], axis=1), df_locations], axis=1)
    df['Average_Atmospheric_Pressure'].fillna(np.mean(df['Average_Atmospheric_Pressure'].dropna()), inplace=True)
    df['Max_Atmospheric_Pressure'].fillna(np.mean(df['Max_Atmospheric_Pressure'].dropna()), inplace=True)
    df['Min_Atmospheric_Pressure'].fillna(np.mean(df['Min_Atmospheric_Pressure'].dropna()), inplace=True)
    df['Average_Moisture_In_Park'].fillna(np.mean(df['Average_Moisture_In_Park'].dropna()), inplace=True)
    df['Max_Moisture_In_Park'].fillna(np.mean(df['Max_Moisture_In_Park'].dropna()), inplace=True)
    df['Min_Moisture_In_Park'].fillna(np.mean(df['Min_Moisture_In_Park'].dropna()), inplace=True)
    df['Max_Ambient_Pollution'].fillna(np.mean(df['Max_Ambient_Pollution'].dropna()), inplace=True)
    df['Min_Ambient_Pollution'].fillna(np.mean(df['Min_Ambient_Pollution'].dropna()), inplace=True)
    df['Var1'].fillna(np.mean(df['Var1'].dropna()), inplace=True)
    df['Var1'] = np.log(1 + df['Var1'])
#     df['month_bucket'] = 'low'
#     df.loc[((df['month'] == 3) | (df['month'] == 9)), 'month_bucket'] = 'medium'
#     df.loc[((df['month'] > 3) & (df['month'] < 9)), 'month_bucket'] = 'high'
#     train_month_bucket_dummies = pd.get_dummies(df['month_bucket'], prefix='month_bucket')
#     df = pd.concat([df.drop(['month'], axis=1), train_month_bucket_dummies], axis=1)
    if is_train:
        return df.drop(['ID', 'Footfall', 'Date'], axis=1), df['Footfall']
    return df.drop(['ID', 'Date'], axis=1)

In [8]:
train_features, train_target = prepare_data(train_data, 1)

In [9]:
test_features = prepare_data(test_data, 0)

In [10]:
train_features.isnull().any()

Park_ID                         False
Direction_Of_Wind               False
Average_Breeze_Speed            False
Max_Breeze_Speed                False
Min_Breeze_Speed                False
Var1                            False
Average_Atmospheric_Pressure    False
Max_Atmospheric_Pressure        False
Min_Atmospheric_Pressure        False
Min_Ambient_Pollution           False
Max_Ambient_Pollution           False
Average_Moisture_In_Park        False
Max_Moisture_In_Park            False
Min_Moisture_In_Park            False
location_1                      False
location_2                      False
location_3                      False
location_4                      False
Average_Wind_Speed              False
Max_Wind_Speed                  False
Min_Wind_Speed                  False
dtype: bool

In [11]:
train_features.head()

Unnamed: 0,Park_ID,Direction_Of_Wind,Average_Breeze_Speed,Max_Breeze_Speed,Min_Breeze_Speed,Var1,Average_Atmospheric_Pressure,Max_Atmospheric_Pressure,Min_Atmospheric_Pressure,Min_Ambient_Pollution,...,Average_Moisture_In_Park,Max_Moisture_In_Park,Min_Moisture_In_Park,location_1,location_2,location_3,location_4,Average_Wind_Speed,Max_Wind_Speed,Min_Wind_Speed
0,12,194.0,37.24,60.8,15.2,4.533996,8225.0,8259.0,8211.0,92.0,...,255.0,288.0,222.0,0.0,0.0,1.0,0.0,36.133813,58.99398,14.748495
1,12,285.0,32.68,60.8,7.6,2.715357,8232.0,8280.0,8205.0,172.0,...,252.0,297.0,204.0,0.0,0.0,1.0,0.0,8.458206,15.736198,1.967025
2,12,319.0,43.32,60.8,15.2,3.602504,8321.0,8355.0,8283.0,236.0,...,219.0,279.0,165.0,0.0,0.0,1.0,0.0,32.694019,45.886342,11.471586
3,12,297.0,25.84,38.0,7.6,0.024595,8379.0,8396.0,8358.0,272.0,...,225.0,261.0,192.0,0.0,0.0,1.0,0.0,11.731115,17.251639,3.450328
4,12,207.0,28.88,45.6,7.6,0.604316,8372.0,8393.0,8335.0,236.0,...,234.0,273.0,183.0,0.0,0.0,1.0,0.0,25.732268,40.629898,6.77165


In [12]:
train_target.head()

0    1406
1    1409
2    1386
3    1365
4    1413
Name: Footfall, dtype: int64

In [13]:
train_data['Date'] = pd.to_datetime(train_data['Date'], format='%d-%m-%Y')
test_data['Date'] = pd.to_datetime(test_data['Date'], format='%d-%m-%Y')

In [14]:
train_data['day'] = train_data['Date'].dt.day
test_data['day'] = test_data['Date'].dt.day
train_data['month'] = train_data['Date'].dt.month
test_data['month'] = test_data['Date'].dt.month
train_data['weekday'] = train_data['Date'].dt.dayofweek
test_data['weekday'] = test_data['Date'].dt.dayofweek

In [15]:
train_features['weekday'] = train_data['Date'].dt.dayofweek
test_features['weekday'] = test_data['Date'].dt.dayofweek
train_features['month'] = abs(train_data['month'] - 7)
test_features['month'] = abs(test_data['month'] - 7)

In [16]:
train_features

Unnamed: 0,Park_ID,Direction_Of_Wind,Average_Breeze_Speed,Max_Breeze_Speed,Min_Breeze_Speed,Var1,Average_Atmospheric_Pressure,Max_Atmospheric_Pressure,Min_Atmospheric_Pressure,Min_Ambient_Pollution,...,Min_Moisture_In_Park,location_1,location_2,location_3,location_4,Average_Wind_Speed,Max_Wind_Speed,Min_Wind_Speed,weekday,month
0,12,194.0,37.24,60.8,15.2,4.533996,8225.000000,8259.000000,8211.00000,92.000000,...,222.0,0.0,0.0,1.0,0.0,36.133813,58.993980,14.748495,5,2
1,12,285.0,32.68,60.8,7.6,2.715357,8232.000000,8280.000000,8205.00000,172.000000,...,204.0,0.0,0.0,1.0,0.0,8.458206,15.736198,1.967025,6,2
2,12,319.0,43.32,60.8,15.2,3.602504,8321.000000,8355.000000,8283.00000,236.000000,...,165.0,0.0,0.0,1.0,0.0,32.694019,45.886342,11.471586,0,2
3,12,297.0,25.84,38.0,7.6,0.024595,8379.000000,8396.000000,8358.00000,272.000000,...,192.0,0.0,0.0,1.0,0.0,11.731115,17.251639,3.450328,1,2
4,12,207.0,28.88,45.6,7.6,0.604316,8372.000000,8393.000000,8335.00000,236.000000,...,183.0,0.0,0.0,1.0,0.0,25.732268,40.629898,6.771650,2,2
5,12,243.0,74.48,106.4,45.6,4.902530,8263.000000,8331.000000,8232.00000,140.000000,...,192.0,0.0,0.0,1.0,0.0,33.813212,48.304589,20.701967,3,2
6,12,241.0,57.76,68.4,38.0,3.285787,8311.000000,8331.000000,8263.00000,132.000000,...,204.0,0.0,0.0,1.0,0.0,28.002604,33.160978,18.422766,4,2
7,12,227.0,34.20,68.4,15.2,3.991204,8352.000000,8376.000000,8324.00000,88.000000,...,270.0,0.0,0.0,1.0,0.0,23.324344,46.648688,10.366375,5,2
8,12,211.0,17.48,30.4,7.6,0.024595,8400.000000,8413.000000,8376.00000,152.000000,...,231.0,0.0,0.0,1.0,0.0,14.983284,26.057886,6.514471,6,2
9,12,221.0,24.32,53.2,7.6,0.024595,8393.000000,8410.000000,8386.00000,16.000000,...,267.0,0.0,0.0,1.0,0.0,18.354537,40.150550,5.735793,0,2


In [17]:
test_features

Unnamed: 0,Park_ID,Direction_Of_Wind,Average_Breeze_Speed,Max_Breeze_Speed,Min_Breeze_Speed,Var1,Average_Atmospheric_Pressure,Max_Atmospheric_Pressure,Min_Atmospheric_Pressure,Min_Ambient_Pollution,...,Min_Moisture_In_Park,location_1,location_2,location_3,location_4,Average_Wind_Speed,Max_Wind_Speed,Min_Wind_Speed,weekday,month
0,12,233.0,55.48,76.0,38.0,0.024595,8259.000000,8300.000000,8211.000000,260.000000,...,210.0,0.0,0.0,1.0,0.0,33.388697,45.737942,22.868971,1,6
1,12,211.0,108.68,152.0,60.8,5.045874,8208.000000,8294.000000,8136.000000,120.000000,...,201.0,0.0,0.0,1.0,0.0,93.156942,130.289430,52.115772,2,6
2,12,237.0,95.76,121.6,83.6,3.579622,8252.000000,8304.000000,8146.000000,236.000000,...,207.0,0.0,0.0,1.0,0.0,52.154634,66.228107,45.531823,3,6
3,12,286.0,101.08,129.2,83.6,3.556205,8146.000000,8249.000000,8092.000000,204.000000,...,201.0,0.0,0.0,1.0,0.0,27.861424,35.612346,23.043283,4,6
4,12,281.0,63.08,83.6,45.6,1.788421,8341.000000,8376.000000,8259.000000,144.000000,...,213.0,0.0,0.0,1.0,0.0,12.036231,15.951632,8.700890,5,6
5,12,269.0,69.92,76.0,60.8,2.768832,8321.000000,8345.000000,8294.000000,220.000000,...,228.0,0.0,0.0,1.0,0.0,1.220272,1.326383,1.061106,6,6
6,12,261.0,47.12,68.4,15.2,0.024595,8379.000000,8413.000000,8345.000000,240.000000,...,234.0,0.0,0.0,1.0,0.0,7.371192,10.700117,2.377804,0,6
7,12,245.0,44.08,60.8,22.8,2.315501,8417.000000,8454.000000,8400.000000,96.000000,...,258.0,0.0,0.0,1.0,0.0,18.629013,25.695190,9.635696,1,6
8,12,224.0,33.44,45.6,15.2,0.000000,8458.000000,8468.000000,8451.000000,252.000000,...,243.0,0.0,0.0,1.0,0.0,24.054723,32.801895,10.933965,2,6
9,12,212.0,47.88,60.8,38.0,2.033398,8447.000000,8458.000000,8420.000000,216.000000,...,270.0,0.0,0.0,1.0,0.0,40.604543,51.561324,32.225828,3,6


In [18]:
train_features.columns == test_features.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True], dtype=bool)

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV

In [23]:
# Tuning hyper-parameters for RMSE

# Set the parameters by cross-validation
# n_range = range(100, 150, 50)
tuned_parameters = {'n_estimators': [10]}
# random_state_range = range(11)
# tuned_parameters = {'n_estimators': n_range, 'random_state': random_state_range}

rf = GridSearchCV(RandomForestRegressor(), tuned_parameters, cv=5, scoring='mean_squared_error')
rf.fit(train_features, train_target)

print rf.grid_scores_
print 'Average Error = ' + str(np.sqrt(np.abs(rf.grid_scores_[0][1])))
print 'Min Error = ' + str(np.sqrt(np.abs(rf.best_score_)))
print 'Best parameters = '
print rf.best_params_
print rf.best_estimator_

[mean: -8904.42208, std: 727.58033, params: {'n_estimators': 10}]
Average Error = 94.3632453669
Min Error = 94.3632453669
Best parameters = 
{'n_estimators': 10}
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)


In [24]:
predicted = rf.predict(test_features)

In [25]:
solution = pd.DataFrame(test_data['ID'])

In [26]:
solution = pd.concat([solution, pd.DataFrame(predicted, columns=['Footfall'])], axis=1)

In [27]:
solution

Unnamed: 0,ID,Footfall
0,3725712,1093.2
1,3725812,1106.3
2,3725912,1121.6
3,3726012,1030.2
4,3726112,993.7
5,3726212,1078.0
6,3726312,1013.5
7,3726412,1103.4
8,3726512,1041.4
9,3726612,1034.2


In [28]:
solution.to_csv('solution94.csv', index=False)