## SVM Model

In [1]:
print("Importing Libraries...")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle
import time
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import math
from sklearn.metrics import r2_score
from sklearn.utils import shuffle
import ast
print("Successfully imported!")

Importing Libraries...
Successfully imported!


### Prepare the training dataset for the model

In [2]:
# Load the dataset from which the training rows are later selected
# (via checkpoint5.loc[train_idx]). The path is relative to the working directory.
print("Reading Datafile...")
checkpoint5 = pd.read_csv('checkpoint-5.csv')
print("Success!")

Reading Datafile...
Success!


In [3]:
print("Reading index of train data...")
# The `with` block closes the file when it exits, so no explicit close() is needed.
with open("SVM Model/train_idx.txt") as file:
    train_idx_str = file.read()

# The file content is parsed as a Python literal (a sequence of row labels used
# below with checkpoint5.loc). literal_eval only accepts literals, so nothing in
# the file is executed.
train_idx = ast.literal_eval(train_idx_str)
print("Length of the train indexes is...", len(train_idx))
print(train_idx[0])

Reading index of train data...
Length of the train indexes is... 540000
17094116


In [4]:
print("Getting dataframe of train set...")
# Select the training rows by index label, then remove those same rows from
# checkpoint5 in place so that it keeps only the remaining rows.
# NOTE: this cell is not re-runnable. After the drop, the labels in train_idx are
# gone from checkpoint5, so a second .loc lookup fails (KeyError in current pandas).
train_set = checkpoint5.loc[train_idx]
checkpoint5.drop(train_idx, axis=0, inplace=True)
print("Seperated dataframe for both train and test")

Getting dataframe of train set...
Seperated dataframe for both train and test


In [5]:
print("Obtaining labels for train set...")
cols = 'fare_amount'
# Double brackets keep the target as a one-column DataFrame (2-D) rather than a Series.
y_train = train_set[[cols]]
print("Obtained labels!")
# Remove, in place and one at a time, the target column and every column that is
# not kept as a model input; what remains in train_set becomes X_train.
for _unused_col in ('time', 'fare_amount', 'pickup_longitude',
                    'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'):
    del train_set[_unused_col]
print("Train set is ready!")

Obtaining labels for train set...
Obtained labels!
Train set is ready!


In [6]:
# Plain name binding, not a copy: X_train and train_set refer to the same DataFrame object.
X_train = train_set

In [7]:
# Display the training features (the cell's last expression is rendered as its output).
X_train

Unnamed: 0,passenger_count,day,weekend,holiday,peak_hour,hotspot,dist
17094116,1,3,False,0,0,1,0.696993
53649197,1,5,True,0,0,0,1.594699
38088232,5,1,False,0,1,0,3.154141
29785639,1,2,False,0,0,0,2.620846
24305630,1,1,False,0,0,0,6.734819
...,...,...,...,...,...,...,...
12888383,1,6,True,0,0,1,4.731678
38936098,1,3,False,0,1,0,3.661563
189012,2,6,True,0,0,0,4.128879
37063756,5,3,False,0,1,0,18.904935


In [8]:
# Display the training labels (one 'fare_amount' column, same index labels as X_train).
y_train

Unnamed: 0,fare_amount
17094116,4.50
53649197,4.90
38088232,10.90
29785639,7.30
24305630,27.00
...,...
12888383,14.50
38936098,11.70
189012,6.10
37063756,57.54


### Test set preparation

In [9]:
# Load the test file and keep its 'key' column aside before that column is deleted
# from test_set in the next cell; the keys are re-attached to the predictions when
# the output CSV is built.
print("Reading Datafile...")
test_set = pd.read_csv('processed_test.csv')
print("Success!")
# squeeze() leaves a multi-row Series unchanged; it only collapses a length-1 Series to a scalar.
test_key_series = test_set['key'].squeeze()

Reading Datafile...
Success!


In [10]:
print("Removing extra columns...")
# Delete, in place, every column that is not a model input, leaving test_set
# with only the columns that are passed to the model as X_test.
for _unused_col in ('key', 'time', 'pickup_longitude', 'pickup_latitude',
                    'dropoff_longitude', 'dropoff_latitude'):
    del test_set[_unused_col]
print("Test set is ready!")

Removing extra columns...
Test set is ready!


In [11]:
# Plain name binding, not a copy: X_test and test_set refer to the same DataFrame object.
X_test = test_set

In [12]:
# Display the test features (the cell's last expression is rendered as its output).
X_test

Unnamed: 0,passenger_count,day,weekend,holiday,peak_hour,hotspot,dist
0,1,1,False,0,0,0,2.323260
1,1,1,False,0,0,0,2.425353
2,1,5,True,0,0,0,0.618628
3,1,5,True,0,0,0,1.961033
4,1,5,True,0,0,1,5.387301
...,...,...,...,...,...,...,...
9909,6,6,True,0,0,0,2.124874
9910,6,0,False,0,1,0,3.270969
9911,6,6,True,0,0,0,19.183941
9912,6,5,True,0,0,0,8.343486


#### Load Model

In [13]:
# SECURITY: pickle.load can execute arbitrary code while deserializing, so only load
# model files that come from a trusted source.
# NOTE(review): the loaded object is used below via regressor.predict(...) on features
# scaled by X_sc; presumably it was trained on identically scaled features. Confirm.
with open('SVM Model/Dist_SVM_Experiment_540000_v2.pkl', 'rb') as f:

    regressor = pickle.load(f)

In [14]:
print("Fitting...")
# Guard against re-running this cell. It overwrites X_train / y_train with their
# scaled versions, so a second run would fit the scalers on already-standardized
# data. X_sc.transform and y_sc.inverse_transform would then be close to identity
# and the predictions made afterwards would silently come out on the wrong scale.
if not (isinstance(X_train, pd.DataFrame) and isinstance(y_train, pd.DataFrame)):
    raise RuntimeError(
        "X_train / y_train are no longer the raw DataFrames (already scaled?). "
        "Re-run the data preparation cells before fitting the scalers again."
    )

# The scalers are fitted on the training data only. X_sc is reused to transform
# the test features, and y_sc to map predictions back to the original fare scale.
X_sc = StandardScaler()
y_sc = StandardScaler()
X_train = X_sc.fit_transform(X_train)
y_train = y_sc.fit_transform(y_train)

Fitting...


In [15]:
# Predict on the test features, then map the outputs back through y_sc.
print('Predicting...')
# t1/t2 bracket both the scaling of X_test and the predict call; they are not
# printed or otherwise used in the cells shown here.
t1 = time.time()
y_pred = regressor.predict(X_sc.transform(X_test))
t2 = time.time()
print("Inversely Transforming value data ")
# reshape(-1, 1) turns the predictions into a single column, the 2-D layout y_sc was fitted on.
y_pred = y_sc.inverse_transform(y_pred.reshape(-1, 1))
print('Predicted all!')
print(y_pred)

Predicting...
Inversely Transforming value data 
Predicted all!
[[ 8.43326717]
 [ 8.66309583]
 [ 4.49796604]
 ...
 [41.98811168]
 [21.51579096]
 [ 6.67586009]]


In [17]:
# Round every prediction to two decimals via "%.2f" formatting.
# np.ravel flattens the (n, 1) array so each formatted element is a scalar.
# Formatting a one-element array directly relies on implicit array-to-scalar
# conversion, which NumPy >= 1.25 deprecates.
new_y_pred = [float("%.2f" % value) for value in np.ravel(y_pred)]
y_pred_series = pd.Series(new_y_pred)
# Pair each rounded prediction with the key saved from the test file.
df = pd.DataFrame({'key': test_key_series, 'fare_amount': new_y_pred})
# The target directory must already exist; to_csv does not create it.
df.to_csv('WithDist_SVM/OnTest/Dist_predictions_SVR540000_TESTSET2.csv', index=False)

In [21]:
# Re-read the CSV written by the preceding cell (same path) into a new DataFrame.
print("Reading Datafile...")
prediction = pd.read_csv('WithDist_SVM/OnTest/Dist_predictions_SVR540000_TESTSET2.csv')
print("Success!")

Reading Datafile...
Success!


In [22]:
# Display the saved predictions as read back from disk ('key' and 'fare_amount' columns).
prediction

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,8.43
1,2015-01-27 13:08:24.0000003,8.66
2,2011-10-08 11:53:44.0000002,4.50
3,2012-12-01 21:12:12.0000002,7.50
4,2012-12-01 21:12:12.0000003,15.19
...,...,...
9909,2015-05-10 12:37:51.0000002,8.34
9910,2015-01-12 17:05:51.0000001,10.87
9911,2015-04-19 20:44:15.0000001,41.99
9912,2015-01-31 01:05:19.0000005,21.52
