In [1]:
import pandas as pd
from xgboost import XGBRegressor
import numpy as np

In [2]:
train_df = pd.read_csv("DataSets/Train.csv")
test_df = pd.read_csv("DataSets/Test.csv")

In [3]:
train_df.head()

Unnamed: 0,date_time,is_holiday,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,clouds_all,weather_type,weather_description,traffic_volume
0,2012-10-02 09:00:00,,121,89,2,329,1,1,288.28,0.0,0.0,40,Clouds,scattered clouds,5545
1,2012-10-02 10:00:00,,178,67,3,330,1,1,289.36,0.0,0.0,75,Clouds,broken clouds,4516
2,2012-10-02 11:00:00,,113,66,3,329,2,2,289.58,0.0,0.0,90,Clouds,overcast clouds,4767
3,2012-10-02 12:00:00,,20,66,3,329,5,5,290.13,0.0,0.0,90,Clouds,overcast clouds,5026
4,2012-10-02 13:00:00,,281,65,3,329,7,7,291.14,0.0,0.0,75,Clouds,broken clouds,4918


## Data Cleaning

In [4]:
train_df.isnull().sum()

date_time              0
is_holiday             0
air_pollution_index    0
humidity               0
wind_speed             0
wind_direction         0
visibility_in_miles    0
dew_point              0
temperature            0
rain_p_h               0
snow_p_h               0
clouds_all             0
weather_type           0
weather_description    0
traffic_volume         0
dtype: int64

In [5]:
test_df.isnull().sum()

date_time              0
is_holiday             0
air_pollution_index    0
humidity               0
wind_speed             0
wind_direction         0
visibility_in_miles    0
dew_point              0
temperature            0
rain_p_h               0
snow_p_h               0
clouds_all             0
weather_type           0
weather_description    0
dtype: int64

## is holiday

In [6]:
train_df['is_holiday'].value_counts()

None                         33707
New Years Day                    5
Christmas Day                    5
Thanksgiving Day                 5
Columbus Day                     4
Washingtons Birthday             4
Labor Day                        4
Veterans Day                     4
Independence Day                 3
Martin Luther King Jr Day        3
Memorial Day                     3
State Fair                       3
Name: is_holiday, dtype: int64

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33750 entries, 0 to 33749
Data columns (total 15 columns):
date_time              33750 non-null object
is_holiday             33750 non-null object
air_pollution_index    33750 non-null int64
humidity               33750 non-null int64
wind_speed             33750 non-null int64
wind_direction         33750 non-null int64
visibility_in_miles    33750 non-null int64
dew_point              33750 non-null int64
temperature            33750 non-null float64
rain_p_h               33750 non-null float64
snow_p_h               33750 non-null float64
clouds_all             33750 non-null int64
weather_type           33750 non-null object
weather_description    33750 non-null object
traffic_volume         33750 non-null int64
dtypes: float64(3), int64(8), object(4)
memory usage: 3.9+ MB


In [8]:
# # get dummies for is_holiday attribute and drop original column is_holiday
is_holiday_dummies = pd.get_dummies(train_df['is_holiday'])
train_df = pd.concat([train_df, is_holiday_dummies],axis=1)
train_df.drop('is_holiday', axis=1,inplace=True)

In [9]:
# # get dummies for is_holiday attribute and drop original column is_holiday from test dataset
is_holiday_dummies_test = pd.get_dummies(test_df['is_holiday'])
test_df = pd.concat([test_df, is_holiday_dummies_test],axis=1)
test_df.drop('is_holiday', axis=1,inplace=True)

In [10]:
is_holiday_dummies_test.head()

Unnamed: 0,Christmas Day,Columbus Day,Independence Day,Labor Day,Martin Luther King Jr Day,Memorial Day,New Years Day,None,State Fair,Thanksgiving Day,Veterans Day,Washingtons Birthday
0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0


In [11]:
is_holiday_dummies.head()

Unnamed: 0,Christmas Day,Columbus Day,Independence Day,Labor Day,Martin Luther King Jr Day,Memorial Day,New Years Day,None,State Fair,Thanksgiving Day,Veterans Day,Washingtons Birthday
0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0


In [12]:
train_df.head()

Unnamed: 0,date_time,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,...,Independence Day,Labor Day,Martin Luther King Jr Day,Memorial Day,New Years Day,None,State Fair,Thanksgiving Day,Veterans Day,Washingtons Birthday
0,2012-10-02 09:00:00,121,89,2,329,1,1,288.28,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
1,2012-10-02 10:00:00,178,67,3,330,1,1,289.36,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
2,2012-10-02 11:00:00,113,66,3,329,2,2,289.58,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
3,2012-10-02 12:00:00,20,66,3,329,5,5,290.13,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
4,2012-10-02 13:00:00,281,65,3,329,7,7,291.14,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0


In [13]:
train_df.columns

Index(['date_time', 'air_pollution_index', 'humidity', 'wind_speed',
       'wind_direction', 'visibility_in_miles', 'dew_point', 'temperature',
       'rain_p_h', 'snow_p_h', 'clouds_all', 'weather_type',
       'weather_description', 'traffic_volume', 'Christmas Day',
       'Columbus Day', 'Independence Day', 'Labor Day',
       'Martin Luther King Jr Day', 'Memorial Day', 'New Years Day', 'None',
       'State Fair', 'Thanksgiving Day', 'Veterans Day',
       'Washingtons Birthday'],
      dtype='object')

## weather description

In [14]:
train_df['weather_description'].value_counts()

sky is clear                           7524
mist                                   4047
overcast clouds                        3972
broken clouds                          3379
light rain                             2442
scattered clouds                       2294
Sky is Clear                           1709
few clouds                             1568
light snow                             1280
moderate rain                          1105
haze                                    902
light intensity drizzle                 703
heavy snow                              587
fog                                     567
drizzle                                 460
proximity thunderstorm                  345
heavy intensity rain                    311
snow                                    190
thunderstorm                             66
proximity shower rain                    54
thunderstorm with heavy rain             42
heavy intensity drizzle                  40
thunderstorm with light rain    

In [15]:
train_df['weather_description'].head()

0    scattered clouds
1       broken clouds
2     overcast clouds
3     overcast clouds
4       broken clouds
Name: weather_description, dtype: object

In [16]:
def get_weather_type_matrix(weather_description_series):
    df = pd.DataFrame(data=np.zeros((len(weather_description_series), 13)), columns=['clear_sky',
                                                                                     'mist',
                                                                                     'intensity',
                                                                                     'cloud_condition',                                                                                    
                                                                                     'no. of clouds',                                                                                     
                                                                                     'drizzle',
                                                                                     'snow',
                                                                                     'fog',
                                                                                     'thunderstorm',
                                                                                     'rain',
                                                                                     'smoke',
                                                                                     'shower',
                                                                                     'squalls'])
    for i in range(len(weather_description_series)):
        if 'clear sky' in weather_description_series[i]:
            df.iloc[i][0] = 1
        if 'mist' in weather_description_series[i]:
            df.iloc[i][1] = 1
        if 'overcast' in weather_description_series[i] or 'broken' in weather_description_series[i]:
            df.iloc[i][2] = 1
        if 'light' in weather_description_series[i] or \
            'heavy' in weather_description_series[i] or \
            'very heavy' in weather_description_series[i]:
            df.iloc[i][3]= 1        
        if 'few' in weather_description_series[i]:
            df.iloc[i][4] =1
        if 'drizzle' in weather_description_series[i]:
            df.iloc[i][5] =1
        if 'snow' in weather_description_series[i]:
            df.iloc[i][6] = 1
        if 'fog' in weather_description_series[i]:
            df.iloc[i][7] = 1
        if 'thunderstorm' in weather_description_series[i]:
            df.iloc[i][8] = 1
        if 'rain' in weather_description_series[i]:
            df.iloc[i][9] = 1
        if 'smoke' in weather_description_series[i]:
            df.iloc[i][10] = 1
        if 'shower' in weather_description_series[i]:
            df.iloc[i][11] = 1
        if 'squalls' in weather_description_series[i]:
            df.iloc[i][12] = 1
    return df

In [17]:
train_df['weather_description'] = train_df['weather_description'].apply(lambda x: x.lower())

In [18]:
test_df['weather_description'] = test_df['weather_description'].apply(lambda x: x.lower())

In [19]:
train_df['weather_description'].value_counts()

sky is clear                           9233
mist                                   4047
overcast clouds                        3972
broken clouds                          3379
light rain                             2442
scattered clouds                       2294
few clouds                             1568
light snow                             1280
moderate rain                          1105
haze                                    902
light intensity drizzle                 703
heavy snow                              587
fog                                     567
drizzle                                 460
proximity thunderstorm                  345
heavy intensity rain                    311
snow                                    190
thunderstorm                             66
proximity shower rain                    54
thunderstorm with heavy rain             42
heavy intensity drizzle                  40
thunderstorm with light rain             35
proximity thunderstorm with rain

In [20]:
train_df['weather_description'].head()

0    scattered clouds
1       broken clouds
2     overcast clouds
3     overcast clouds
4       broken clouds
Name: weather_description, dtype: object

In [21]:
train_weather_description_matrix = get_weather_type_matrix(train_df['weather_description'])

In [22]:
test_weather_description_matrix = get_weather_type_matrix(test_df['weather_description'])

In [23]:
train_weather_description_matrix.head()

Unnamed: 0,clear_sky,mist,intensity,cloud_condition,no. of clouds,drizzle,snow,fog,thunderstorm,rain,smoke,shower,squalls
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
print(train_df.index)

RangeIndex(start=0, stop=33750, step=1)


In [25]:
# merge weather description matrix with train df
train_df = pd.concat([train_df, train_weather_description_matrix],axis=1)
train_df.drop("weather_description", axis=1, inplace=True)

In [26]:
# merge weather description matrix with test df
test_df = pd.concat([test_df, test_weather_description_matrix],axis=1)
test_df.drop("weather_description", axis=1, inplace=True)

In [27]:
train_df.columns

Index(['date_time', 'air_pollution_index', 'humidity', 'wind_speed',
       'wind_direction', 'visibility_in_miles', 'dew_point', 'temperature',
       'rain_p_h', 'snow_p_h', 'clouds_all', 'weather_type', 'traffic_volume',
       'Christmas Day', 'Columbus Day', 'Independence Day', 'Labor Day',
       'Martin Luther King Jr Day', 'Memorial Day', 'New Years Day', 'None',
       'State Fair', 'Thanksgiving Day', 'Veterans Day',
       'Washingtons Birthday', 'clear_sky', 'mist', 'intensity',
       'cloud_condition', 'no. of clouds', 'drizzle', 'snow', 'fog',
       'thunderstorm', 'rain', 'smoke', 'shower', 'squalls'],
      dtype='object')

In [28]:
# # # get dummies for is_holiday attribute and drop original column is_holiday
# weather_des_dummies = pd.get_dummies(train_df['weather_description'])
# train_df = pd.concat([train_df, weather_des_dummies ], axis=1)
# train_df.drop('weather_description', axis=1,inplace=True)

In [29]:
# # # get dummies for is_holiday attribute and drop original column is_holiday for test dataset
# weather_des_dummies_test = pd.get_dummies(test_df['weather_description'])
# test_df = pd.concat([test_df, weather_des_dummies_test ], axis=1)
# test_df.drop('weather_description', axis=1,inplace=True)

In [30]:
test_df.columns

Index(['date_time', 'air_pollution_index', 'humidity', 'wind_speed',
       'wind_direction', 'visibility_in_miles', 'dew_point', 'temperature',
       'rain_p_h', 'snow_p_h', 'clouds_all', 'weather_type', 'Christmas Day',
       'Columbus Day', 'Independence Day', 'Labor Day',
       'Martin Luther King Jr Day', 'Memorial Day', 'New Years Day', 'None',
       'State Fair', 'Thanksgiving Day', 'Veterans Day',
       'Washingtons Birthday', 'clear_sky', 'mist', 'intensity',
       'cloud_condition', 'no. of clouds', 'drizzle', 'snow', 'fog',
       'thunderstorm', 'rain', 'smoke', 'shower', 'squalls'],
      dtype='object')

In [31]:
train_df.head()

Unnamed: 0,date_time,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,...,cloud_condition,no. of clouds,drizzle,snow,fog,thunderstorm,rain,smoke,shower,squalls
0,2012-10-02 09:00:00,121,89,2,329,1,1,288.28,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2012-10-02 10:00:00,178,67,3,330,1,1,289.36,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2012-10-02 11:00:00,113,66,3,329,2,2,289.58,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2012-10-02 12:00:00,20,66,3,329,5,5,290.13,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2012-10-02 13:00:00,281,65,3,329,7,7,291.14,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
test_df.head()

Unnamed: 0,date_time,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,...,cloud_condition,no. of clouds,drizzle,snow,fog,thunderstorm,rain,smoke,shower,squalls
0,2017-05-18 00:00:00,73,63,1,27,4,4,285.15,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2017-05-18 00:00:00,251,63,1,27,4,4,285.15,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2017-05-18 00:00:00,75,56,1,0,1,1,285.15,0.0,0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2017-05-18 01:00:00,98,56,1,351,2,2,284.79,0.0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,2017-05-18 01:00:00,283,56,1,351,1,1,284.79,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## weather type

In [33]:
train_df['weather_type'].value_counts()

Clouds          11213
Clear            9233
Mist             4047
Rain             3940
Snow             2070
Drizzle          1204
Haze              902
Fog               567
Thunderstorm      553
Smoke              17
Squall              4
Name: weather_type, dtype: int64

In [34]:
test_df['weather_type'].value_counts()

Clear           4158
Clouds          3951
Mist            1903
Rain            1732
Snow             806
Drizzle          617
Thunderstorm     481
Haze             458
Fog              345
Smoke              3
Name: weather_type, dtype: int64

In [35]:
weather_type_dummies = pd.get_dummies(train_df['weather_type'])
train_df = pd.concat([train_df, weather_type_dummies ], axis=1)
train_df.drop('weather_type', axis=1,inplace=True)

In [36]:
train_df.head()

Unnamed: 0,date_time,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,...,Clouds,Drizzle,Fog,Haze,Mist,Rain,Smoke,Snow,Squall,Thunderstorm
0,2012-10-02 09:00:00,121,89,2,329,1,1,288.28,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
1,2012-10-02 10:00:00,178,67,3,330,1,1,289.36,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
2,2012-10-02 11:00:00,113,66,3,329,2,2,289.58,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
3,2012-10-02 12:00:00,20,66,3,329,5,5,290.13,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
4,2012-10-02 13:00:00,281,65,3,329,7,7,291.14,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0


In [37]:
train_df.columns

Index(['date_time', 'air_pollution_index', 'humidity', 'wind_speed',
       'wind_direction', 'visibility_in_miles', 'dew_point', 'temperature',
       'rain_p_h', 'snow_p_h', 'clouds_all', 'traffic_volume', 'Christmas Day',
       'Columbus Day', 'Independence Day', 'Labor Day',
       'Martin Luther King Jr Day', 'Memorial Day', 'New Years Day', 'None',
       'State Fair', 'Thanksgiving Day', 'Veterans Day',
       'Washingtons Birthday', 'clear_sky', 'mist', 'intensity',
       'cloud_condition', 'no. of clouds', 'drizzle', 'snow', 'fog',
       'thunderstorm', 'rain', 'smoke', 'shower', 'squalls', 'Clear', 'Clouds',
       'Drizzle', 'Fog', 'Haze', 'Mist', 'Rain', 'Smoke', 'Snow', 'Squall',
       'Thunderstorm'],
      dtype='object')

In [38]:
weather_type_dummies_test = pd.get_dummies(test_df['weather_type'])
test_df = pd.concat([test_df, weather_type_dummies_test ], axis=1)
test_df.drop('weather_type', axis=1,inplace=True)

In [39]:
test_df.insert(len(test_df.columns)-1, "Squall",0.0)

In [40]:
test_df.columns

Index(['date_time', 'air_pollution_index', 'humidity', 'wind_speed',
       'wind_direction', 'visibility_in_miles', 'dew_point', 'temperature',
       'rain_p_h', 'snow_p_h', 'clouds_all', 'Christmas Day', 'Columbus Day',
       'Independence Day', 'Labor Day', 'Martin Luther King Jr Day',
       'Memorial Day', 'New Years Day', 'None', 'State Fair',
       'Thanksgiving Day', 'Veterans Day', 'Washingtons Birthday', 'clear_sky',
       'mist', 'intensity', 'cloud_condition', 'no. of clouds', 'drizzle',
       'snow', 'fog', 'thunderstorm', 'rain', 'smoke', 'shower', 'squalls',
       'Clear', 'Clouds', 'Drizzle', 'Fog', 'Haze', 'Mist', 'Rain', 'Smoke',
       'Snow', 'Squall', 'Thunderstorm'],
      dtype='object')

In [41]:
test_df.head()

Unnamed: 0,date_time,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,...,Clouds,Drizzle,Fog,Haze,Mist,Rain,Smoke,Snow,Squall,Thunderstorm
0,2017-05-18 00:00:00,73,63,1,27,4,4,285.15,0.0,0,...,0,0,0,0,0,1,0,0,0.0,0
1,2017-05-18 00:00:00,251,63,1,27,4,4,285.15,0.0,0,...,0,0,0,0,1,0,0,0,0.0,0
2,2017-05-18 00:00:00,75,56,1,0,1,1,285.15,0.0,0,...,0,1,0,0,0,0,0,0,0.0,0
3,2017-05-18 01:00:00,98,56,1,351,2,2,284.79,0.0,0,...,0,0,0,0,0,1,0,0,0.0,0
4,2017-05-18 01:00:00,283,56,1,351,1,1,284.79,0.0,0,...,0,0,0,0,1,0,0,0,0.0,0


In [42]:
## create train and test df without date time column
train_df_2 = train_df.drop('date_time',axis=1)
test_df_2 = test_df.drop('date_time',axis=1)

In [43]:
train_df_2.columns

Index(['air_pollution_index', 'humidity', 'wind_speed', 'wind_direction',
       'visibility_in_miles', 'dew_point', 'temperature', 'rain_p_h',
       'snow_p_h', 'clouds_all', 'traffic_volume', 'Christmas Day',
       'Columbus Day', 'Independence Day', 'Labor Day',
       'Martin Luther King Jr Day', 'Memorial Day', 'New Years Day', 'None',
       'State Fair', 'Thanksgiving Day', 'Veterans Day',
       'Washingtons Birthday', 'clear_sky', 'mist', 'intensity',
       'cloud_condition', 'no. of clouds', 'drizzle', 'snow', 'fog',
       'thunderstorm', 'rain', 'smoke', 'shower', 'squalls', 'Clear', 'Clouds',
       'Drizzle', 'Fog', 'Haze', 'Mist', 'Rain', 'Smoke', 'Snow', 'Squall',
       'Thunderstorm'],
      dtype='object')

In [44]:
train_df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33750 entries, 0 to 33749
Data columns (total 47 columns):
air_pollution_index          33750 non-null int64
humidity                     33750 non-null int64
wind_speed                   33750 non-null int64
wind_direction               33750 non-null int64
visibility_in_miles          33750 non-null int64
dew_point                    33750 non-null int64
temperature                  33750 non-null float64
rain_p_h                     33750 non-null float64
snow_p_h                     33750 non-null float64
clouds_all                   33750 non-null int64
traffic_volume               33750 non-null int64
Christmas Day                33750 non-null uint8
Columbus Day                 33750 non-null uint8
Independence Day             33750 non-null uint8
Labor Day                    33750 non-null uint8
Martin Luther King Jr Day    33750 non-null uint8
Memorial Day                 33750 non-null uint8
New Years Day                33750 

In [45]:
test_df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14454 entries, 0 to 14453
Data columns (total 46 columns):
air_pollution_index          14454 non-null int64
humidity                     14454 non-null int64
wind_speed                   14454 non-null int64
wind_direction               14454 non-null int64
visibility_in_miles          14454 non-null int64
dew_point                    14454 non-null int64
temperature                  14454 non-null float64
rain_p_h                     14454 non-null float64
snow_p_h                     14454 non-null int64
clouds_all                   14454 non-null int64
Christmas Day                14454 non-null uint8
Columbus Day                 14454 non-null uint8
Independence Day             14454 non-null uint8
Labor Day                    14454 non-null uint8
Martin Luther King Jr Day    14454 non-null uint8
Memorial Day                 14454 non-null uint8
New Years Day                14454 non-null uint8
None                         14454 no

In [46]:
test_df_2['Squall'] = 0

In [47]:
## split dataset into train_df2_x, train_df2_y
train_df_2_x = train_df_2.drop('traffic_volume',axis=1)
train_df_2_y = train_df_2['traffic_volume']

In [48]:
# ## split test dataset into train_df2_x, train_df2_y
# test_df_2_x = test_df_2.drop('traffic_volume',axis=1)
# test_df_2_y = test_df_2['traffic_volume']

In [49]:
train_df_2_x.columns

Index(['air_pollution_index', 'humidity', 'wind_speed', 'wind_direction',
       'visibility_in_miles', 'dew_point', 'temperature', 'rain_p_h',
       'snow_p_h', 'clouds_all', 'Christmas Day', 'Columbus Day',
       'Independence Day', 'Labor Day', 'Martin Luther King Jr Day',
       'Memorial Day', 'New Years Day', 'None', 'State Fair',
       'Thanksgiving Day', 'Veterans Day', 'Washingtons Birthday', 'clear_sky',
       'mist', 'intensity', 'cloud_condition', 'no. of clouds', 'drizzle',
       'snow', 'fog', 'thunderstorm', 'rain', 'smoke', 'shower', 'squalls',
       'Clear', 'Clouds', 'Drizzle', 'Fog', 'Haze', 'Mist', 'Rain', 'Smoke',
       'Snow', 'Squall', 'Thunderstorm'],
      dtype='object')

In [50]:
test_df_2.columns

Index(['air_pollution_index', 'humidity', 'wind_speed', 'wind_direction',
       'visibility_in_miles', 'dew_point', 'temperature', 'rain_p_h',
       'snow_p_h', 'clouds_all', 'Christmas Day', 'Columbus Day',
       'Independence Day', 'Labor Day', 'Martin Luther King Jr Day',
       'Memorial Day', 'New Years Day', 'None', 'State Fair',
       'Thanksgiving Day', 'Veterans Day', 'Washingtons Birthday', 'clear_sky',
       'mist', 'intensity', 'cloud_condition', 'no. of clouds', 'drizzle',
       'snow', 'fog', 'thunderstorm', 'rain', 'smoke', 'shower', 'squalls',
       'Clear', 'Clouds', 'Drizzle', 'Fog', 'Haze', 'Mist', 'Rain', 'Smoke',
       'Snow', 'Squall', 'Thunderstorm'],
      dtype='object')

## scale values of training and testing dataset

In [51]:
from sklearn.preprocessing import MinMaxScaler

In [52]:
scaler = MinMaxScaler()
train_df_2_x = scaler.fit_transform(train_df_2_x)
test_df_2 = scaler.transform(test_df_2)

## splitting dataset into training and validation sets

In [53]:
from sklearn.model_selection import train_test_split

In [54]:
train_x, val_x, train_y, val_y = train_test_split(train_df_2_x, train_df_2_y)

In [55]:
print(train_x.shape)
print(train_y.shape)
print(val_x.shape)
print(val_y.shape)

(25312, 46)
(25312,)
(8438, 46)
(8438,)


In [56]:
train_y = train_y.values.reshape(-1,1)
val_y = val_y.values.reshape(-1,1)

In [57]:
print(train_x.shape)
print(train_y.shape)
print(val_x.shape)
print(val_y.shape)

(25312, 46)
(25312, 1)
(8438, 46)
(8438, 1)


In [58]:
model = XGBRegressor(learning_rate=0.001, n_estimators=1000, n_jobs=-1, max_depth=3, verbosity=1)
model.fit(train_x, train_y)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.01, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=-1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

In [59]:
val_y_predicted =model.predict(val_x)

In [60]:
from sklearn import metrics

In [61]:
print("mean squared error", metrics.mean_squared_error(val_y_predicted, val_y))

mean squared error 3554268.6545330193


In [62]:
# get predictions on test_x
test_y_predicted = model.predict(test_df_2)

In [63]:
result_df=  pd.DataFrame()

In [64]:
test_df.columns

Index(['date_time', 'air_pollution_index', 'humidity', 'wind_speed',
       'wind_direction', 'visibility_in_miles', 'dew_point', 'temperature',
       'rain_p_h', 'snow_p_h', 'clouds_all', 'Christmas Day', 'Columbus Day',
       'Independence Day', 'Labor Day', 'Martin Luther King Jr Day',
       'Memorial Day', 'New Years Day', 'None', 'State Fair',
       'Thanksgiving Day', 'Veterans Day', 'Washingtons Birthday', 'clear_sky',
       'mist', 'intensity', 'cloud_condition', 'no. of clouds', 'drizzle',
       'snow', 'fog', 'thunderstorm', 'rain', 'smoke', 'shower', 'squalls',
       'Clear', 'Clouds', 'Drizzle', 'Fog', 'Haze', 'Mist', 'Rain', 'Smoke',
       'Snow', 'Squall', 'Thunderstorm'],
      dtype='object')

In [65]:
#result_df = pd.concat([test_df['date_time'], pd.Series(test_y_predicted, dtype=pd.Int32Dtype).apply(lambda x: round(x))], axis=1)
result_df = pd.concat([test_df['date_time'], pd.Series(test_y_predicted, dtype=pd.Int32Dtype)], axis=1)

In [66]:
result_df.head()

Unnamed: 0,date_time,0
0,2017-05-18 00:00:00,3337.93
1,2017-05-18 00:00:00,3170.24
2,2017-05-18 00:00:00,3269.22
3,2017-05-18 01:00:00,3638.76
4,2017-05-18 01:00:00,3201.06


In [67]:
result_df.columns = ['date_time','traffic_volume']

In [68]:
result_df.to_csv("results.csv", index=False)