In [77]:
import numpy as np
import pandas as pd

In [78]:
df = pd.read_csv('traffic_volume.csv')
print(df.head())
print(df.shape)
print(((df.isnull().sum())*100)/len(df))

  holiday    temp  rain  snow weather        date      Time  traffic_volume
0     NaN  288.28   0.0   0.0  Clouds  02-10-2012  09:00:00            5545
1     NaN  289.36   0.0   0.0  Clouds  02-10-2012  10:00:00            4516
2     NaN  289.58   0.0   0.0  Clouds  02-10-2012  11:00:00            4767
3     NaN  290.13   0.0   0.0  Clouds  02-10-2012  12:00:00            5026
4     NaN  291.14   0.0   0.0  Clouds  02-10-2012  13:00:00            4918
(48204, 8)
holiday           99.873454
temp               0.109949
rain               0.004149
snow               0.024894
weather            0.101651
date               0.000000
Time               0.000000
traffic_volume     0.000000
dtype: float64


In [79]:
# Delete column 'holiday'
# delete the rows wit null values in 'temp', 'rain', 'snow', 'weather'
#--------------Handling NUll values-------------

In [80]:
df=df.drop(columns=['holiday'], axis=1)
df.dropna(inplace=True)
print(df.shape)
print(df.isnull().sum())

(48090, 7)
temp              0
rain              0
snow              0
weather           0
date              0
Time              0
traffic_volume    0
dtype: int64


In [81]:
#--------------Handling data Inconsistency-------------

In [82]:
df['Time'] = pd.to_datetime(df['Time']).dt.hour
df.head()

  df['Time'] = pd.to_datetime(df['Time']).dt.hour


Unnamed: 0,temp,rain,snow,weather,date,Time,traffic_volume
0,288.28,0.0,0.0,Clouds,02-10-2012,9,5545
1,289.36,0.0,0.0,Clouds,02-10-2012,10,4516
2,289.58,0.0,0.0,Clouds,02-10-2012,11,4767
3,290.13,0.0,0.0,Clouds,02-10-2012,12,5026
4,291.14,0.0,0.0,Clouds,02-10-2012,13,4918


In [83]:
#--------------Handling unusual entry or noise-------------

In [84]:
print(df.dtypes)
print(df['weather'].unique())
print((df['temp']==0).sum())
df.drop(df[df['temp'] < 100].index, inplace=True)

temp              float64
rain              float64
snow              float64
weather            object
date               object
Time                int32
traffic_volume      int64
dtype: object
['Clouds' 'Clear' 'Rain' 'Drizzle' 'Mist' 'Haze' 'Fog' 'Thunderstorm'
 'Snow' 'Squall' 'Smoke']
10


In [85]:
#--------------Handling dublicates-------------

In [86]:
df[df.duplicated()]
df.drop_duplicates(inplace=True)
print(df[df.duplicated()])

Empty DataFrame
Columns: [temp, rain, snow, weather, date, Time, traffic_volume]
Index: []


In [87]:
## ----------Handling outliers--------------------

In [88]:
df.shape

(47849, 7)

In [89]:
q1 = df['traffic_volume'].quantile(0.1)  
q2 = df['traffic_volume'].quantile(0.9)  
df = df[(df['traffic_volume'] >= q1) & (df['traffic_volume'] <= q2)]
print(df.shape)

(38290, 7)


In [90]:
df.describe()

Unnamed: 0,temp,rain,snow,Time,traffic_volume
count,38290.0,38290.0,38290.0,38290.0,38290.0
mean,281.319167,0.387881,0.000253,12.416584,3252.662993
std,12.876505,50.25127,0.008965,6.806973,1653.061734
min,243.39,0.0,0.0,0.0,425.0
25%,272.08,0.0,0.0,7.0,1805.25
50%,282.55,0.0,0.0,12.0,3379.0
75%,292.15,0.0,0.0,19.0,4721.0
max,310.07,9831.3,0.51,23.0,5820.0


In [91]:
# df.to_csv('cleaned_traffic_volume.csv', index=False)

In [92]:
#-------------Model Transfromation------------------------------------------------------------------------------------------------------------------

In [93]:
print(df.shape)
print(df.head())

(38290, 7)
     temp  rain  snow weather        date  Time  traffic_volume
0  288.28   0.0   0.0  Clouds  02-10-2012     9            5545
1  289.36   0.0   0.0  Clouds  02-10-2012    10            4516
2  289.58   0.0   0.0  Clouds  02-10-2012    11            4767
3  290.13   0.0   0.0  Clouds  02-10-2012    12            5026
4  291.14   0.0   0.0  Clouds  02-10-2012    13            4918


In [94]:
#-----------handling imbalance data------------

In [95]:
print(((df['rain']==0).sum())*100/len(df))
print(((df['snow']==0).sum())*100/len(df))

92.836249673544
99.85897101070776


In [96]:
#delete column 'snow' as it has 99% of data as zero
df = df.drop(columns=['snow'], axis=1)

In [97]:
#-----------ading features------------

In [98]:
def add_features(df):
    df['date'] = pd.to_datetime(df['date'], format = '%d-%m-%Y')
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['week_day'] = df['date'].dt.day_name()
    return df.drop(columns = ['date'], axis = 1)

In [99]:
df = add_features(df)
df

Unnamed: 0,temp,rain,weather,Time,traffic_volume,year,month,day,week_day
0,288.28,0.0,Clouds,9,5545,2012,10,2,Tuesday
1,289.36,0.0,Clouds,10,4516,2012,10,2,Tuesday
2,289.58,0.0,Clouds,11,4767,2012,10,2,Tuesday
3,290.13,0.0,Clouds,12,5026,2012,10,2,Tuesday
4,291.14,0.0,Clouds,13,4918,2012,10,2,Tuesday
...,...,...,...,...,...,...,...,...,...
48199,283.45,0.0,Clouds,19,3543,2018,9,30,Sunday
48200,282.76,0.0,Clouds,20,2781,2018,9,30,Sunday
48201,282.73,0.0,Thunderstorm,21,2159,2018,9,30,Sunday
48202,282.09,0.0,Clouds,22,1450,2018,9,30,Sunday


In [100]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df.weather = le.fit_transform(df.weather)

In [101]:
# df.to_csv('transformed_traffic_volume.csv', index=False)

In [102]:
# -------------------------------Model Training---------------------------------------------------------------------------------------------------

In [103]:
print(df.shape)
print(df.head())

(38290, 9)
     temp  rain  weather  Time  traffic_volume  year  month  day week_day
0  288.28   0.0        1     9            5545  2012     10    2  Tuesday
1  289.36   0.0        1    10            4516  2012     10    2  Tuesday
2  289.58   0.0        1    11            4767  2012     10    2  Tuesday
3  290.13   0.0        1    12            5026  2012     10    2  Tuesday
4  291.14   0.0        1    13            4918  2012     10    2  Tuesday


In [104]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df.week_day = le.fit_transform(df.week_day)
df

Unnamed: 0,temp,rain,weather,Time,traffic_volume,year,month,day,week_day
0,288.28,0.0,1,9,5545,2012,10,2,5
1,289.36,0.0,1,10,4516,2012,10,2,5
2,289.58,0.0,1,11,4767,2012,10,2,5
3,290.13,0.0,1,12,5026,2012,10,2,5
4,291.14,0.0,1,13,4918,2012,10,2,5
...,...,...,...,...,...,...,...,...,...
48199,283.45,0.0,1,19,3543,2018,9,30,3
48200,282.76,0.0,1,20,2781,2018,9,30,3
48201,282.73,0.0,10,21,2159,2018,9,30,3
48202,282.09,0.0,1,22,1450,2018,9,30,3


In [105]:
#------------seperating values----------
y_value = df.iloc[:,4]
x_value = df.drop(columns=['traffic_volume', 'temp', 'year'], axis=1)

In [106]:
from sklearn.preprocessing import MinMaxScaler
scale=MinMaxScaler()
x_scaled=scale.fit_transform(x_value)

In [107]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y_value, test_size=0.2)

In [108]:
import xgboost as xgb
model=xgb.XGBRegressor()
model.fit(x_train, y_train)

In [109]:
y_pred=model.predict(x_test)
from sklearn.metrics import mean_squared_log_error, r2_score
r2_score(y_test, y_pred)

0.9572213828048725

In [110]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
print("Mean Square Error: ", mean_squared_error(y_test, y_pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))
print("R-square Score: ", r2_score(y_test, y_pred))

Mean Square Error:  116152.15317451526
Mean Absolute Error:  229.31313263327
R-square Score:  0.9572213828048725


In [111]:
#--------------Hyperparameters----------------------------------------------------------------------------------------------------------

In [112]:
hyp_pars=model.get_params()
print(hyp_pars)

{'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}


In [118]:
from sklearn.model_selection import GridSearchCV
parameters={
    'max_depth': [3, 5, 8],
    'min_child_weight': [1, 3, 5],
    'eta': [0.1, 0.3, 0.5],
    'subsample': [0.6, 0.8, 1],
    'colsample_bytree': [0.6, 0.8, 1]
}
clf = GridSearchCV(xgb.XGBRegressor(), param_grid=parameters,verbose=2, cv=5)

clf.fit(x_train,y_train)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV] END colsample_bytree=0.6, eta=0.1, max_depth=3, min_child_weight=1, subsample=0.6; total time=   0.2s
[CV] END colsample_bytree=0.6, eta=0.1, max_depth=3, min_child_weight=1, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.6, eta=0.1, max_depth=3, min_child_weight=1, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.6, eta=0.1, max_depth=3, min_child_weight=1, subsample=0.6; total time=   0.1s
[CV] END colsample_bytree=0.6, eta=0.1, max_depth=3, min_child_weight=1, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.6, eta=0.1, max_depth=3, min_child_weight=1, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.6, eta=0.1, max_depth=3, min_child_weight=1, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.6, eta=0.1, max_depth=3, min_child_weight=1, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.6, eta=0.1, max_depth=3, min_child_weight=1, subsamp

In [122]:
y_pred=clf.predict(x_test)
print("Best Score: ", r2_score(y_test, y_pred))
clf.best_params_

Best Score:  0.9676877994811365


{'colsample_bytree': 1,
 'eta': 0.3,
 'max_depth': 8,
 'min_child_weight': 1,
 'subsample': 1}

In [123]:
y_pred=clf.predict(x_test)
print("Mean Square Error: ", mean_squared_error(y_test, y_pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))
print("R-square Score: ", r2_score(y_test, y_pred))

Mean Square Error:  87733.82381560856
Mean Absolute Error:  199.1112719822503
R-square Score:  0.9676877994811365


In [None]:
model=xgb.XGBRegressor()

parameters={
    'max_depth': [3, 5, 8],
    'min_child_weight': [1, 3, 5],
    'eta': [0.1, 0.3, 0.5],
    'subsample': [0.6, 0.8, 1],
    'colsample_bytree': [0.6, 0.8, 1]
}