In [1]:
import pandas as pd
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import torch
import time

In [2]:
df = pd.read_csv("NYCTaxiFares.csv")

In [3]:
df["fare_amount"].describe()

count    120000.000000
mean         10.040326
std           7.500134
min           2.500000
25%           5.700000
50%           7.700000
75%          11.300000
max          49.900000
Name: fare_amount, dtype: float64

In [4]:
def haversine_distance(df, lat1, long1, lat2, long2):
    """
    Calculates the haversine distance between 2 sets of GPS coordinates in df
    """
    r = 6371  # average radius of Earth in kilometers
       
    phi1 = np.radians(df[lat1])
    phi2 = np.radians(df[lat2])
    
    delta_phi = np.radians(df[lat2]-df[lat1])
    delta_lambda = np.radians(df[long2]-df[long1])
     
    a = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = (r * c) # in kilometers

    return d

In [5]:
df["dist_km"] = haversine_distance(df, 'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   pickup_datetime    120000 non-null  object 
 1   fare_amount        120000 non-null  float64
 2   fare_class         120000 non-null  int64  
 3   pickup_longitude   120000 non-null  float64
 4   pickup_latitude    120000 non-null  float64
 5   dropoff_longitude  120000 non-null  float64
 6   dropoff_latitude   120000 non-null  float64
 7   passenger_count    120000 non-null  int64  
 8   dist_km            120000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 8.2+ MB


In [7]:
df["pickup_datetime"] = pd.to_datetime(df["pickup_datetime"])

In [8]:
df["EDTdate"] = df["pickup_datetime"] - pd.Timedelta(hours=4)

In [9]:
df["hour"] = df["EDTdate"].dt.hour

In [10]:
df["AMorPM"] = np.where(df["hour"]<12,"am", "pm")

In [11]:
df["day_week"] =df["EDTdate"].dt.strftime("%a")

In [12]:
df.columns

Index(['pickup_datetime', 'fare_amount', 'fare_class', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'dist_km', 'EDTdate', 'hour', 'AMorPM', 'day_week'],
      dtype='object')

In [13]:
cat_cols = ["hour", "AMorPM", "day_week"]
cont_cols = ['pickup_longitude','pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'dist_km']

In [14]:
y_col = ["fare_amount"]

In [15]:
for cat in cat_cols:
    df[cat] = df[cat].astype("category")

In [16]:
df["AMorPM"].cat.categories

Index(['am', 'pm'], dtype='object')

In [17]:
hour = df["hour"].cat.codes.values
ampm = df["AMorPM"].cat.codes.values
day = df["day_week"].cat.codes.values

In [18]:
cats = torch.tensor(np.stack([df[cat].cat.codes.values for cat in cat_cols],1), dtype=torch.long) #the clean way

In [19]:
cont = torch.tensor(np.stack([df[cont].values for cont in cont_cols],1), dtype=torch.float)

In [20]:
y = torch.tensor(df[y_col].values, dtype = torch.float)

In [21]:
cats.shape

torch.Size([120000, 3])

In [22]:
cat_szs = [len(df[cat].cat.categories) for cat in cat_cols]

In [23]:
cat_szs

[24, 2, 7]

The following line is needed to calculate the type of embeddings needed for categorical data <br>
1- The size will be the number of categories in that categorical column  
2- The dimension will the shape of the embedding vector that would represent one categorical datapoint
    IT is typically best to keep the dimension between 50 and half of the number of categories, whichever is lesser

In [24]:
emb_szs = [(size, min(50, (size+1)//2)) for size in cat_szs]

In [25]:
emb_szs #so basically now for the hour categorical column each data point would get a 12D vecctor to be its representation

[(24, 12), (2, 1), (7, 4)]

In [26]:
selfembds = nn.ModuleList([nn.Embedding(ne,ed) for ne,ed in emb_szs]) #Creating an embedding layer using pytorch that would generate the emb dim sized vector

In [27]:
selfembds #using module list we have made it into a dictionary

ModuleList(
  (0): Embedding(24, 12)
  (1): Embedding(2, 1)
  (2): Embedding(7, 4)
)

In [28]:
catz = cats[:2]

So now we are passing each column of our categorical data to its appropriate embedding layer <br>

0th embedding layer will pass all the values from the hour category column <br>
1st embedding layer will pass all the values from the ampm category column, so on and so forth

In [29]:
embeddings = []
for i, e in enumerate(selfembds):
    embeddings.append(e(catz[:,i])) 

In [30]:
embeddingz = torch.cat(embeddings, dim=1)

In [31]:
cont.shape

torch.Size([120000, 6])

In [32]:
class TabularModel(nn.Module):
    """
    emb_szs = (no_of_categories, embedding dimension) to convert categorical to embeddings
    n_cont = number of continuous features
    out_sz = our desired output dimension
    layers = list of number of neurons per layer from 2nd to l-1
            ex : [100,200,300] 
            100 is the output of the fcn block and the input for the second fcn block
            200 is the input of the second fcn block and input for the third, so on and so forth
            This allows dynamic construction of the architecture based on user input
    """
    
    def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.5):
        super().__init__()

        self.embeds = nn.ModuleList([nn.Embedding(ne,ed) for ne,ed in emb_szs])
        self.embdrop = nn.Dropout(p)
        self.normcont = nn.BatchNorm1d(n_cont)

        layerlist = []

        n_cat = sum([nf for ne, nf in emb_szs])
        n_in = n_cat + n_cont

        for i in layers:
            layerlist.append(nn.Linear(n_in, i))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(p))
            n_in = i

        layerlist.append(nn.Linear(layers[-1], out_sz))
        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cat, x_cont):

        embeddings = []
        
        for i,e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:,i]))

        x = torch.cat(embeddings,1)
        x = self.embdrop(x)

        x_cont = self.normcont(x_cont)
        x = torch.cat([x, x_cont], 1)

        x = self.layers(x)

        return x

In [33]:
torch.manual_seed(33)
model = TabularModel(emb_szs, cont.shape[1], 1, [200,100], 0.5)

In [34]:
model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(24, 12)
    (1): Embedding(2, 1)
    (2): Embedding(7, 4)
  )
  (embdrop): Dropout(p=0.5, inplace=False)
  (normcont): BatchNorm1d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=23, out_features=200, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.5, inplace=False)
    (8): Linear(in_features=100, out_features=1, bias=True)
  )
)

In [35]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [36]:
batch_size = 60000
test_size = int(batch_size*0.2)

In [37]:
cat_train = cats[:batch_size-test_size]
cat_test = cats[batch_size-test_size:batch_size]
cont_train = cont[:batch_size-test_size]
cont_test = cont[batch_size-test_size:batch_size]

In [38]:
y_train = y[:batch_size-test_size]
y_test = y[batch_size-test_size:batch_size]

In [39]:
start_time = time.time()
epochs = 300
losses = []

for i in range(epochs):

    y_pred = model.forward(cat_train, cont_train)
    loss = torch.sqrt(criterion(y_pred, y_train))
    losses.append(loss.item())

    if i%10 == 0:
        print(f"Epoch: {i}, Loss: {loss}")

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f"Training Took {(time.time() - start_time) / 60} mins")

Epoch: 0, Loss: 12.589123725891113
Epoch: 10, Loss: 11.792752265930176
Epoch: 20, Loss: 11.22496223449707
Epoch: 30, Loss: 10.832974433898926
Epoch: 40, Loss: 10.534828186035156
Epoch: 50, Loss: 10.300067901611328
Epoch: 60, Loss: 10.087516784667969
Epoch: 70, Loss: 9.905826568603516
Epoch: 80, Loss: 9.689477920532227
Epoch: 90, Loss: 9.486383438110352
Epoch: 100, Loss: 9.237733840942383
Epoch: 110, Loss: 8.958498001098633
Epoch: 120, Loss: 8.639734268188477
Epoch: 130, Loss: 8.294915199279785
Epoch: 140, Loss: 7.885173797607422
Epoch: 150, Loss: 7.464734077453613
Epoch: 160, Loss: 7.020777225494385
Epoch: 170, Loss: 6.585127353668213
Epoch: 180, Loss: 6.103859901428223
Epoch: 190, Loss: 5.648401737213135
Epoch: 200, Loss: 5.218247413635254
Epoch: 210, Loss: 4.8539276123046875
Epoch: 220, Loss: 4.600926876068115
Epoch: 230, Loss: 4.393165111541748
Epoch: 240, Loss: 4.184859752655029
Epoch: 250, Loss: 4.173018455505371
Epoch: 260, Loss: 4.100866317749023
Epoch: 270, Loss: 4.035979747772

In [40]:
with torch.no_grad():
    y_val = model.forward(cat_test, cont_test)
    loss = torch.sqrt(criterion(y_val, y_test))

In [41]:
loss

tensor(3.8815)

In [42]:
print(f'{"PREDICTED":>12} {"ACTUAL":>8} {"DIFF":>8}')
for i in range(50):
    diff = np.abs(y_val[i].item()-y_test[i].item())
    print(f'{i+1:2}. {y_val[i].item():8.4f} {y_test[i].item():8.4f} {diff:8.4f}')

   PREDICTED   ACTUAL     DIFF
 1.   4.9072   2.9000   2.0072
 2.  15.5492   5.7000   9.8492
 3.   6.8949   7.7000   0.8051
 4.  15.7628  12.5000   3.2628
 5.   6.2467   4.1000   2.1467
 6.   3.5000   5.3000   1.8000
 7.   2.5522   3.7000   1.1478
 8.  21.8729  14.5000   7.3729
 9.   2.1319   5.7000   3.5681
10.  12.3366  10.1000   2.2366
11.   7.5209   4.5000   3.0209
12.   3.6354   6.1000   2.4646
13.   6.3583   6.9000   0.5417
14.   9.2390  14.1000   4.8610
15.   6.3309   4.5000   1.8309
16.  32.4334  34.1000   1.6666
17.   1.6804  12.5000  10.8196
18.   3.9392   4.1000   0.1608
19.   7.9634   8.5000   0.5366
20.   3.0906   5.3000   2.2094
21.  13.0110  11.3000   1.7110
22.  12.1937  10.5000   1.6937
23.  15.7102  15.3000   0.4102
24.  17.1444  14.9000   2.2444
25.  42.2721  49.5700   7.2979
26.   3.2670   5.3000   2.0330
27.   4.1070   3.7000   0.4070
28.   6.3457   6.5000   0.1543
29.  14.8165  14.1000   0.7165
30.   2.0575   4.9000   2.8425
31.   3.6641   3.7000   0.0359
32.  26.

In [43]:
torch.save(model, "taxifare.pt")