In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

from sklearn import tree
from sklearn import svm
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

**Load Data**

In [4]:
# Read the two competition CSVs from the local ./input directory.
train_set = pd.read_csv("./input/Train.csv")
test_set = pd.read_csv("./input/Test.csv")

In [5]:
# Plain-text preview of the first five training rows.
print(train_set.head(n=5))

             date_time is_holiday  air_pollution_index  humidity  wind_speed  \
0  2012-10-02 09:00:00       None                  121        89           2   
1  2012-10-02 10:00:00       None                  178        67           3   
2  2012-10-02 11:00:00       None                  113        66           3   
3  2012-10-02 12:00:00       None                   20        66           3   
4  2012-10-02 13:00:00       None                  281        65           3   

   wind_direction  visibility_in_miles  dew_point  temperature  rain_p_h  \
0             329                    1          1       288.28       0.0   
1             330                    1          1       289.36       0.0   
2             329                    2          2       289.58       0.0   
3             329                    5          5       290.13       0.0   
4             329                    7          7       291.14       0.0   

   snow_p_h  clouds_all weather_type weather_description  traf

**Prepare Train Data**

In [6]:
# Convert the raw timestamp column to datetime64 so the .dt accessor can be used.
train_set['date_time'] = pd.to_datetime(train_set['date_time'])

In [7]:
# Expand the timestamp into separate calendar features, added in the order
# year, month, day, hour.
for part in ('year', 'month', 'day', 'hour'):
    train_set[part] = getattr(train_set['date_time'].dt, part)

In [8]:
# The raw timestamp is now redundant with year/month/day/hour, so drop it.
train_copy = train_set.drop(columns=['date_time'])
print(train_copy.head(n=5))

  is_holiday  air_pollution_index  humidity  wind_speed  wind_direction  \
0       None                  121        89           2             329   
1       None                  178        67           3             330   
2       None                  113        66           3             329   
3       None                   20        66           3             329   
4       None                  281        65           3             329   

   visibility_in_miles  dew_point  temperature  rain_p_h  snow_p_h  \
0                    1          1       288.28       0.0       0.0   
1                    1          1       289.36       0.0       0.0   
2                    2          2       289.58       0.0       0.0   
3                    5          5       290.13       0.0       0.0   
4                    7          7       291.14       0.0       0.0   

   clouds_all weather_type weather_description  traffic_volume  year  month  \
0          40       Clouds    scattered clouds   

**One-hot encoding**

In [9]:
# Work on an independent (deep) copy so train_copy stays untouched.
train_onehot = train_copy.copy(deep=True)

In [10]:
# One-hot encode the three categorical columns; the mapping gives the prefix
# used for the indicator columns generated from each source column.
train_onehot = pd.get_dummies(
    train_onehot,
    columns=['is_holiday', 'weather_type', 'weather_description'],
    prefix={
        'is_holiday': 'is_holiday',
        'weather_type': 'weather_type',
        'weather_description': 'weather_desc',
    },
)

In [11]:
# Cast every column (including the indicator columns) to 64-bit floats,
# then show the first five rows.
train_onehot = train_onehot.astype('float64')
train_onehot.head(n=5)

Unnamed: 0,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,clouds_all,...,weather_desc_sleet,weather_desc_smoke,weather_desc_snow,weather_desc_thunderstorm,weather_desc_thunderstorm with drizzle,weather_desc_thunderstorm with heavy rain,weather_desc_thunderstorm with light drizzle,weather_desc_thunderstorm with light rain,weather_desc_thunderstorm with rain,weather_desc_very heavy rain
0,121.0,89.0,2.0,329.0,1.0,1.0,288.28,0.0,0.0,40.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,178.0,67.0,3.0,330.0,1.0,1.0,289.36,0.0,0.0,75.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,113.0,66.0,3.0,329.0,2.0,2.0,289.58,0.0,0.0,90.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20.0,66.0,3.0,329.0,5.0,5.0,290.13,0.0,0.0,90.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,281.0,65.0,3.0,329.0,7.0,7.0,291.14,0.0,0.0,75.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Preparing the Test data**

In [12]:
# Convert the raw timestamp column to datetime64, mirroring the training data.
test_set['date_time'] = pd.to_datetime(test_set['date_time'])

In [13]:
# Same calendar features as the training data, added in the same order
# (year, month, day, hour).
for part in ('year', 'month', 'day', 'hour'):
    test_set[part] = getattr(test_set['date_time'].dt, part)

In [14]:
# Drop the raw timestamp; test_set itself keeps it.
test_copy = test_set.drop(columns=['date_time'])

In [15]:
# One-hot encode the same three categorical columns as for the training data,
# working from a copy so test_copy is left unchanged.
test_onehot = pd.get_dummies(
    test_copy.copy(),
    columns=['is_holiday', 'weather_type', 'weather_description'],
    prefix={
        'is_holiday': 'is_holiday',
        'weather_type': 'weather_type',
        'weather_description': 'weather_desc',
    },
)

In [16]:
# Number of columns, then number of rows, of the encoded training frame.
print(train_onehot.shape[1])
print(train_onehot.shape[0])

76
33750


In [17]:
# Number of columns, then number of rows, of the encoded test frame.
print(test_onehot.shape[1])
print(test_onehot.shape[0])

68
14454


In [18]:
# Align the test features with the features the model is trained on.
#
# Categories that never occur in the test set get no indicator column from
# get_dummies, so list them here. They must be added as all-zero columns: the
# previous `test_onehot[x] = train_onehot[x]` copied the *training* rows into
# the test frame (matched on the row index), which produced wrong indicator
# values for the test samples.
feature_columns = [col for col in train_onehot.columns if col != 'traffic_volume']
for x in feature_columns:
    if x not in test_onehot.columns:
        print(x)

# reindex adds every missing column filled with 0 and arranges all columns in
# the same order as the training features. Any column that exists only in the
# test frame is dropped, because the fitted model has no matching feature.
test_onehot = test_onehot.reindex(columns=feature_columns, fill_value=0)

weather_type_Squall
weather_desc_SQUALLS
weather_desc_freezing rain
weather_desc_light rain and snow
weather_desc_shower snow
weather_desc_thunderstorm with drizzle
weather_desc_very heavy rain


In [19]:
# Cast every test column to 64-bit floats, matching the training frame.
test_onehot = test_onehot.astype('float64')

**Training features and target**

In [20]:
# Separate the regression target (traffic_volume) from the input features.
x_train = train_onehot.drop(columns=['traffic_volume'])
y_train = train_onehot['traffic_volume']

**Training**

In [21]:
# Fit a decision-tree regressor on the training features.
# scikit-learn permutes the candidate features at every split, so ties between
# equally good splits can be resolved differently from run to run; fixing
# random_state keeps the fitted tree (and its predictions) reproducible.
dec_tree_reg = tree.DecisionTreeRegressor(random_state=42)
dec_tree_reg.fit(x_train, y_train)

**Decision Tree predictions**

In [24]:
# Predict on the training features themselves; the test-set alternative is
# kept below, disabled.
# preds = dec_tree_reg.predict(test_onehot)
preds = dec_tree_reg.predict(x_train)

# Number of predictions, then the raw floating-point values.
print(preds.shape[0])
print(preds)

# Truncate the predictions to integers and show them again.
preds = preds.astype(int)
print(preds)

33750
[5545. 4516. 4767. ... 2194. 1328. 1328.]
[5545 4516 4767 ... 2194 1328 1328]


In [None]:
# Disabled: build and export the submission file (date_time, traffic_volume).
# NOTE(review): as the notebook stands, `preds` is computed from x_train
# (33750 rows), so it does not line up with test_set (14454 rows). Re-enable
# the `dec_tree_reg.predict(test_onehot)` line before running this cell.
# submission = pd.DataFrame(columns = ['date_time', 'traffic_volume']) 
# print(submission.head())
# submission.date_time = test_set.date_time
# submission.traffic_volume = preds
# print(len(submission))
# print(submission.head())
# submission.to_csv('dtreereg_final_prediction_submission.csv', index=False)

Empty DataFrame
Columns: [date_time, traffic_volume]
Index: []
14454
            date_time  traffic_volume
0 2017-05-18 00:00:00             582
1 2017-05-18 00:00:00             582
2 2017-05-18 00:00:00             582
3 2017-05-18 01:00:00             355
4 2017-05-18 01:00:00             870


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
