In [1]:
# Import Required Python Packages :

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
from sklearn.preprocessing import *
from sklearn.model_selection import *
from sklearn.metrics import *

import gc
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier

# Date & Time Libraries :
from datetime import datetime
import time

# Seed NumPy's global random number generator so any NumPy-based randomness
# is repeatable from run to run.
np.random.seed(0)

In [2]:
# Load the training set, the test set and the sample submission file
# (read in that order) from the working directory.
train, test, ss = (
    pd.read_csv(csv_name)
    for csv_name in ('train_ML_IOT.csv',
                     'test_ML_IOT.csv',
                     'sample_submission_ML_IOT.csv')
)

In [3]:
# Remove the "ID" column from both frames (in place).
# The test IDs are saved first so they can be attached to the predictions later.
del train["ID"]

test_ID = test["ID"]
del test["ID"]

In [4]:
def datetounix1(df):
    """Return a copy of ``df`` whose 'DateTime' column holds Unix seconds.

    Each value in ``df['DateTime']`` must expose ``timetuple()`` (for example
    pandas ``Timestamp`` or ``datetime.datetime`` objects). The conversion uses
    ``time.mktime``, which interprets the time tuple as *local* time, so the
    resulting numbers depend on the timezone of the machine running the code.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'DateTime' column. It is NOT modified; the
        conversion is applied to a copy so the caller's frame keeps its
        datetime values and calling code can safely be re-run.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with 'DateTime' replaced by float seconds.
    """
    # Work on a copy: overwriting the caller's column would break any later
    # code (or a re-run) that still expects datetime values in 'DateTime'.
    converted = df.copy()

    # Convert every timestamp to seconds since the epoch (local time).
    converted['DateTime'] = [
        time.mktime(stamp.timetuple()) for stamp in converted['DateTime']
    ]
    return converted

In [5]:
# Parse the 'DateTime' column of both frames into pandas datetime values.
train['DateTime'], test['DateTime'] = (
    pd.to_datetime(frame['DateTime']) for frame in (train, test)
)

# Print a summary of the test frame's columns, non-null counts and dtypes.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11808 entries, 0 to 11807
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   DateTime  11808 non-null  datetime64[ns]
 1   Junction  11808 non-null  int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 184.6 KB


In [6]:
def add_datetime_features(df):
    """Add calendar and time-of-day feature columns derived from ``df['DateTime']``.

    The following columns are added to ``df`` in place, in this order:

    * 'Weekday' - day of the week, Monday=0 ... Sunday=6
    * 'Day'     - day of the month
    * 'Time'    - seconds elapsed since midnight
    * 'Week'    - week number as reported by ``Timestamp.week``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose 'DateTime' column has a datetime dtype.

    Returns
    -------
    None
        ``df`` is modified in place; nothing is returned so that calling this
        as the last statement of a cell does not display the frame.
    """
    stamps = df['DateTime']

    # Vectorised accessors replace the per-row Python loops; the int64 cast
    # keeps the column dtype identical to what plain Python ints produce.
    df['Weekday'] = stamps.dt.weekday.astype('int64')
    df['Day'] = stamps.dt.day.astype('int64')
    df['Time'] = (
        (stamps.dt.hour * 60 + stamps.dt.minute) * 60 + stamps.dt.second
    ).astype('int64')

    # Taken per timestamp: `Timestamp.week` behaves the same across pandas
    # versions, whereas the Series-level week accessor has changed over time.
    df['Week'] = [stamp.week for stamp in stamps]


# Create the same DateTime-derived features for the train and the test data.
add_datetime_features(train)
add_datetime_features(test)

In [7]:
# Build the model inputs: drop the target from the training frame and convert
# 'DateTime' to Unix seconds in both frames. The test frame is copied first
# because datetounix1 may overwrite its argument's 'DateTime' column; with the
# copy, `test` keeps its datetime values and this cell can be re-run safely.
train_features = datetounix1(train.drop(['Vehicles'], axis=1))
test_features = datetounix1(test.copy())

# Predictors. NOTE: get_dummies only one-hot encodes object/category columns;
# numeric columns pass through unchanged.
X = pd.get_dummies(train_features)
X_valid = pd.get_dummies(test_features)

# The two frames are encoded independently, so make sure the validation
# predictors have exactly the training columns, in the same order. Columns
# missing from the test data are filled with 0; extras are dropped.
X_valid = X_valid.reindex(columns=X.columns, fill_value=0)

# Target 'Vehicles', kept as a single-column DataFrame.
y = train['Vehicles'].to_frame()

# Hold out 33% of the training rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=512)

# Report shape and columns of each predictor set:
#   X_train / X_test come from the train data, X_valid from the test data.
separator = '--' * 20
print(separator, '\n')
for label, frame in (("X_train", X_train), ("X_test", X_test), ("X_valid", X_valid)):
    print(label + ".shape : ", frame.shape)
    display(frame.columns)
    print(separator, '\n')

---------------------------------------- 

X_train.shape :  (32240, 6)


Index(['DateTime', 'Junction', 'Weekday', 'Day', 'Time', 'Week'], dtype='object')

---------------------------------------- 

X_test.shape :  (15880, 6)


Index(['DateTime', 'Junction', 'Weekday', 'Day', 'Time', 'Week'], dtype='object')

---------------------------------------- 

X_valid.shape :  (11808, 6)


Index(['DateTime', 'Junction', 'Weekday', 'Day', 'Time', 'Week'], dtype='object')

---------------------------------------- 



In [8]:
# Submission frame, keyed by the test IDs saved earlier.
final = pd.DataFrame({'ID': test_ID})

# Reference point for measuring how long training + prediction take.
start_time = time.time()

# 1. Create the LightGBM regressor and
# 2. fit it on the training split - X_train (predictors) & y_train (target).
#    `fit` returns the estimator itself, so the two steps can be chained.
clf = LGBMRegressor(
    boosting_type='gbdt',
    max_depth=6,
    learning_rate=0.015,
    n_estimators=80,  # reduced value; noted by the author as the best setting
    reg_alpha=0.0005,
    random_state=512,
).fit(X_train, y_train)

# 3. Predict the target "Vehicles" for the held-out split X_test.
y_hat = clf.predict(X_test)

print("The time taken to execute is %s seconds" % (time.time() - start_time))

The time taken to execute is 0.1308903694152832 seconds


In [9]:
# Mean squared error of the predictions on the held-out split.
mean_squared_error(y_true=y_test, y_pred=y_hat)

83.31952543311107

In [10]:
# Fill the submission frame with the model's predictions for the test data
# and preview the first rows.
final['Vehicles'] = clf.predict(X_valid)
display(final.head(5))

# Write the submission to "<today's date as YYYY-MM-DD>_FINAL.csv", without the index.
final.to_csv(datetime.today().strftime('%Y-%m-%d') + "_FINAL.csv", index=False)

Unnamed: 0,ID,Vehicles
0,20170701001,48.530574
1,20170701011,44.994199
2,20170701021,36.461834
3,20170701031,30.972165
4,20170701041,30.972165
