In [1]:
# Initial imports

import numpy as np
import pandas as pd 
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib inline

# Imports for better visualization

from collections import defaultdict
import json

import scipy as sp

from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

# ColorBrewer2 "Dark2" qualitative palette, as (r, g, b) tuples in the 0-1 range
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

# Notebook-wide plotting defaults
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
# 'axes.color_cycle' was deprecated in Matplotlib 1.5 and later removed, so
# assigning it raises KeyError on current releases. 'axes.prop_cycle' is the
# replacement; it takes a Cycler rather than a bare list of colours.
rcParams['axes.prop_cycle'] = mpl.cycler(color=dark2_colors)
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'



In [2]:
# Read the training and testing splits from the Dataset directory
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('Dataset/Train.csv', 'Dataset/Test.csv')
)

In [3]:
# One-hot encode Location_Type on the training frame (columns are prefixed
# 'location_'). The cell rebinds train_data without the source column, so it is
# guarded: re-running it after the column is gone is a no-op instead of a
# KeyError: 'Location_Type'.
if 'Location_Type' in train_data.columns:
    data_locations = pd.get_dummies(train_data['Location_Type'], prefix='location')
    train_data = pd.concat(
        [train_data.drop(['Location_Type'], axis=1), data_locations], axis=1
    )

In [4]:
# Preview the first five training rows after encoding
train_data.head(n=5)

Unnamed: 0,ID,Park_ID,Date,Direction_Of_Wind,Average_Breeze_Speed,Max_Breeze_Speed,Min_Breeze_Speed,Var1,Average_Atmospheric_Pressure,Max_Atmospheric_Pressure,...,Min_Ambient_Pollution,Max_Ambient_Pollution,Average_Moisture_In_Park,Max_Moisture_In_Park,Min_Moisture_In_Park,Footfall,location_1,location_2,location_3,location_4
0,3311712,12,01-09-1990,194.0,37.24,60.8,15.2,92.13,8225.0,8259.0,...,92.0,304.0,255.0,288.0,222.0,1406,0.0,0.0,1.0,0.0
1,3311812,12,02-09-1990,285.0,32.68,60.8,7.6,14.11,8232.0,8280.0,...,172.0,332.0,252.0,297.0,204.0,1409,0.0,0.0,1.0,0.0
2,3311912,12,03-09-1990,319.0,43.32,60.8,15.2,35.69,8321.0,8355.0,...,236.0,292.0,219.0,279.0,165.0,1386,0.0,0.0,1.0,0.0
3,3312012,12,04-09-1990,297.0,25.84,38.0,7.6,0.0249,8379.0,8396.0,...,272.0,324.0,225.0,261.0,192.0,1365,0.0,0.0,1.0,0.0
4,3312112,12,05-09-1990,207.0,28.88,45.6,7.6,0.83,8372.0,8393.0,...,236.0,332.0,234.0,273.0,183.0,1413,0.0,0.0,1.0,0.0


In [6]:
# One-hot encode Location_Type on the testing frame (columns are prefixed
# 'location_'). The cell rebinds test_data without the source column, so it is
# guarded: re-running it after the column is gone is a no-op instead of a
# KeyError: 'Location_Type'.
if 'Location_Type' in test_data.columns:
    data_locations = pd.get_dummies(test_data['Location_Type'], prefix='location')
    test_data = pd.concat(
        [test_data.drop(['Location_Type'], axis=1), data_locations], axis=1
    )

In [7]:
# Preview the first five testing rows after encoding
test_data.head(n=5)

Unnamed: 0,ID,Park_ID,Date,Direction_Of_Wind,Average_Breeze_Speed,Max_Breeze_Speed,Min_Breeze_Speed,Var1,Average_Atmospheric_Pressure,Max_Atmospheric_Pressure,Min_Atmospheric_Pressure,Min_Ambient_Pollution,Max_Ambient_Pollution,Average_Moisture_In_Park,Max_Moisture_In_Park,Min_Moisture_In_Park,location_1,location_2,location_3,location_4
0,3725712,12,01-01-2002,233.0,55.48,76.0,38.0,0.0249,8259.0,8300.0,8211.0,260.0,316.0,243.0,285.0,210.0,0.0,0.0,1.0,0.0
1,3725812,12,02-01-2002,211.0,108.68,152.0,60.8,154.38,8208.0,8294.0,8136.0,120.0,280.0,252.0,291.0,201.0,0.0,0.0,1.0,0.0
2,3725912,12,03-01-2002,237.0,95.76,121.6,83.6,34.86,8252.0,8304.0,8146.0,236.0,292.0,234.0,270.0,207.0,0.0,0.0,1.0,0.0
3,3726012,12,04-01-2002,286.0,101.08,129.2,83.6,34.03,8146.0,8249.0,8092.0,204.0,284.0,228.0,264.0,201.0,0.0,0.0,1.0,0.0
4,3726112,12,05-01-2002,281.0,63.08,83.6,45.6,4.98,8341.0,8376.0,8259.0,144.0,316.0,237.0,279.0,213.0,0.0,0.0,1.0,0.0


In [8]:
def prepare_data(df, is_train, copy=False):
    """Impute missing readings and derive the feature columns used for modelling.

    Steps, in order:
      1. Fill missing values in the breeze, wind-direction, pressure, moisture,
         pollution and ``Var1`` columns with that column's own mean.
      2. Add ``Average_Wind_Speed``, ``Max_Wind_Speed`` and ``Min_Wind_Speed`` as
         ``abs(<stat>_Breeze_Speed * cos(Direction_Of_Wind))``, with the
         direction taken modulo 180 and converted from degrees to radians.
      3. Replace ``Var1`` with ``log(1 + Var1)``.
      4. Add ``year`` and ``month`` by applying ``extract_year`` and
         ``extract_month`` to each ``Date`` value. Both helpers must be defined
         elsewhere in the notebook; their exact return values are not visible
         from this cell.

    ``Location_Type`` is not touched here; the notebook one-hot encodes it on
    the frame before calling this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named above plus ``ID`` and ``Date`` (and
        ``Footfall`` when ``is_train`` is truthy).
    is_train : bool or int
        When truthy, ``Footfall`` is split off and returned as the target.
    copy : bool, default False
        When False (the original behaviour) ``df`` itself is modified: the
        imputed values, the transformed ``Var1`` and the new columns are
        written into the caller's frame, so calling the function twice on the
        same frame applies the log transform twice. Pass True to work on a
        copy and leave the caller's frame untouched.

    Returns
    -------
    pandas.DataFrame or (pandas.DataFrame, pandas.Series)
        The features without ``ID`` and ``Date``. For training data the
        features also exclude ``Footfall`` and the ``Footfall`` column is
        returned as a second element.

    Raises
    ------
    KeyError
        If any of the expected columns is missing from ``df``.
    """
    if copy:
        df = df.copy()

    mean_imputed_columns = (
        'Average_Breeze_Speed', 'Min_Breeze_Speed', 'Max_Breeze_Speed',
        'Direction_Of_Wind',
        'Average_Atmospheric_Pressure', 'Max_Atmospheric_Pressure',
        'Min_Atmospheric_Pressure',
        'Average_Moisture_In_Park', 'Max_Moisture_In_Park',
        'Min_Moisture_In_Park',
        'Max_Ambient_Pollution', 'Min_Ambient_Pollution',
        'Var1',
    )
    for column in mean_imputed_columns:
        # Assign the result back instead of df[column].fillna(..., inplace=True):
        # the chained in-place form acts on an intermediate object and leaves
        # df unchanged under pandas Copy-on-Write. Series.mean() skips NaN, so
        # it matches the former np.mean(series.dropna()).
        df[column] = df[column].fillna(df[column].mean())

    # The cosine term is shared by all three wind-speed features, so compute it once.
    direction_cos = np.cos((df['Direction_Of_Wind'] % 180) * (np.pi / 180))
    for stat in ('Average', 'Max', 'Min'):
        df[stat + '_Wind_Speed'] = np.abs(df[stat + '_Breeze_Speed'] * direction_cos)

    # log1p(x) == log(1 + x), but stays accurate for values close to zero.
    df['Var1'] = np.log1p(df['Var1'])

    df['year'] = df['Date'].apply(extract_year)
    df['month'] = df['Date'].apply(extract_month)

    if is_train:
        return df.drop(['ID', 'Footfall', 'Date'], axis=1), df['Footfall']
    return df.drop(['ID', 'Date'], axis=1)

In [9]:
train_features, train_target = prepare_data(train_data, 1)

KeyError: 'Location_Type'