In [43]:
import pandas as pd
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import torch

In [2]:
df = pd.read_csv("NYCTaxiFares.csv")

In [3]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1


In [4]:
df["fare_amount"].describe()

count    120000.000000
mean         10.040326
std           7.500134
min           2.500000
25%           5.700000
50%           7.700000
75%          11.300000
max          49.900000
Name: fare_amount, dtype: float64

In [5]:
def haversine_distance(df, lat1, long1, lat2, long2):
    """
    Calculates the haversine distance between 2 sets of GPS coordinates in df
    """
    r = 6371  # average radius of Earth in kilometers
       
    phi1 = np.radians(df[lat1])
    phi2 = np.radians(df[lat2])
    
    delta_phi = np.radians(df[lat2]-df[lat1])
    delta_lambda = np.radians(df[long2]-df[long1])
     
    a = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = (r * c) # in kilometers

    return d

In [7]:
df["dist_km"] = haversine_distance(df, 'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')

In [11]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   pickup_datetime    120000 non-null  object 
 1   fare_amount        120000 non-null  float64
 2   fare_class         120000 non-null  int64  
 3   pickup_longitude   120000 non-null  float64
 4   pickup_latitude    120000 non-null  float64
 5   dropoff_longitude  120000 non-null  float64
 6   dropoff_latitude   120000 non-null  float64
 7   passenger_count    120000 non-null  int64  
 8   dist_km            120000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 8.2+ MB


In [13]:
df["pickup_datetime"] = pd.to_datetime(df["pickup_datetime"])

In [15]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km
0,2010-04-19 08:17:56+00:00,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312
1,2010-04-17 15:43:53+00:00,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307
2,2010-04-17 11:23:26+00:00,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763
3,2010-04-11 21:25:03+00:00,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129
4,2010-04-17 02:19:01+00:00,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321


In [16]:
df["EDTdate"] = df["pickup_datetime"] - pd.Timedelta(hours=4)

In [17]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km,EDTdate
0,2010-04-19 08:17:56+00:00,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312,2010-04-19 04:17:56+00:00
1,2010-04-17 15:43:53+00:00,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307,2010-04-17 11:43:53+00:00
2,2010-04-17 11:23:26+00:00,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763,2010-04-17 07:23:26+00:00
3,2010-04-11 21:25:03+00:00,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129,2010-04-11 17:25:03+00:00
4,2010-04-17 02:19:01+00:00,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321,2010-04-16 22:19:01+00:00


In [18]:
df["hour"] = df["EDTdate"].dt.hour

In [19]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km,EDTdate,hour
0,2010-04-19 08:17:56+00:00,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312,2010-04-19 04:17:56+00:00,4
1,2010-04-17 15:43:53+00:00,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307,2010-04-17 11:43:53+00:00,11
2,2010-04-17 11:23:26+00:00,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763,2010-04-17 07:23:26+00:00,7
3,2010-04-11 21:25:03+00:00,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129,2010-04-11 17:25:03+00:00,17
4,2010-04-17 02:19:01+00:00,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321,2010-04-16 22:19:01+00:00,22


In [20]:
df["AMorPM"] = np.where(df["hour"]<12,"am", "pm")

In [22]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km,EDTdate,hour,AMorPM
0,2010-04-19 08:17:56+00:00,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312,2010-04-19 04:17:56+00:00,4,am
1,2010-04-17 15:43:53+00:00,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307,2010-04-17 11:43:53+00:00,11,am
2,2010-04-17 11:23:26+00:00,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763,2010-04-17 07:23:26+00:00,7,am
3,2010-04-11 21:25:03+00:00,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129,2010-04-11 17:25:03+00:00,17,pm
4,2010-04-17 02:19:01+00:00,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321,2010-04-16 22:19:01+00:00,22,pm


In [23]:
df["day_week"] =df["EDTdate"].dt.strftime("%a")

In [24]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km,EDTdate,hour,AMorPM,day_week
0,2010-04-19 08:17:56+00:00,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312,2010-04-19 04:17:56+00:00,4,am,Mon
1,2010-04-17 15:43:53+00:00,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307,2010-04-17 11:43:53+00:00,11,am,Sat
2,2010-04-17 11:23:26+00:00,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763,2010-04-17 07:23:26+00:00,7,am,Sat
3,2010-04-11 21:25:03+00:00,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129,2010-04-11 17:25:03+00:00,17,pm,Sun
4,2010-04-17 02:19:01+00:00,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321,2010-04-16 22:19:01+00:00,22,pm,Fri


In [26]:
df.columns

Index(['pickup_datetime', 'fare_amount', 'fare_class', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'dist_km', 'EDTdate', 'hour', 'AMorPM', 'day_week'],
      dtype='object')

In [27]:
cat_cols = ["hour", "AMorPM", "day_week"]
cont_cols = ['pickup_longitude','pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'dist_km']

In [28]:
y_col = ["fare_amount"]

In [29]:
for cat in cat_cols:
    df[cat] = df[cat].astype("category")

In [56]:
df["AMorPM"].cat.categories

Index(['am', 'pm'], dtype='object')

In [35]:
hour = df["hour"].cat.codes.values
ampm = df["AMorPM"].cat.codes.values
day = df["day_week"].cat.codes.values

In [82]:
cats = torch.tensor(np.stack([df[cat].cat.codes.values for cat in cat_cols],1), dtype=torch.long) #the clean way

In [46]:
cont = torch.tensor(np.stack([df[cont].values for cont in cont_cols],1), dtype=torch.float)

In [47]:
y = torch.tensor(df[y_col].values, dtype = torch.float)

In [83]:
cats.shape

torch.Size([120000, 3])

In [84]:
cat_szs = [len(df[cat].cat.categories) for cat in cat_cols]

In [85]:
cat_szs

[24, 2, 7]

The following line is needed to calculate the type of embeddings needed for categorical data <br>
1- The size will be the number of categories in that categorical column  
2- The dimension will the shape of the embedding vector that would represent one categorical datapoint
    IT is typically best to keep the dimension between 50 and half of the number of categories, whichever is lesser

In [106]:
emb_szs = [(size, min(50, (size+1)//2)) for size in cat_szs]

In [107]:
emb_szs #so basically now for the hour categorical column each data point would get a 12D vecctor to be its representation

[(24, 12), (2, 1), (7, 4)]

In [108]:
selfembds = nn.ModuleList([nn.Embedding(ne,ed) for ne,ed in emb_szs]) #Creating an embedding layer using pytorch that would generate the emb dim sized vector

In [109]:
selfembds #using module list we have made it into a dictionary

ModuleList(
  (0): Embedding(24, 12)
  (1): Embedding(2, 1)
  (2): Embedding(7, 4)
)

In [90]:
catz = cats[:2]

So now we are passing each column of our categorical data to its appropriate embedding layer <br>

0th embedding layer will pass all the values from the hour category column <br>
1st embedding layer will pass all the values from the ampm category column, so on and so forth

In [115]:
embeddings = []
for i, e in enumerate(selfembds):
    embeddings.append(e(catz[:,i])) 

In [116]:
embeddingz = torch.cat(embeddings, dim=1)

In [117]:
embeddings

[tensor([[-0.0551,  0.2449, -0.1025,  0.6085,  1.1398,  0.5803,  0.6779,  1.4399,
          -0.7706,  2.0437,  1.0690,  1.1453],
         [-0.4048, -0.8287,  0.6378,  1.2623,  0.7730,  0.3743,  0.1104,  0.4551,
           0.9689, -0.5004, -0.4547,  0.6672]], grad_fn=<EmbeddingBackward0>),
 tensor([[-0.3029],
         [-0.3029]], grad_fn=<EmbeddingBackward0>),
 tensor([[ 1.0326,  1.5676, -0.0500,  2.2006],
         [ 0.6140,  0.8286,  1.8116, -1.5039]], grad_fn=<EmbeddingBackward0>)]

In [118]:
embeddingz

tensor([[-0.0551,  0.2449, -0.1025,  0.6085,  1.1398,  0.5803,  0.6779,  1.4399,
         -0.7706,  2.0437,  1.0690,  1.1453, -0.3029,  1.0326,  1.5676, -0.0500,
          2.2006],
        [-0.4048, -0.8287,  0.6378,  1.2623,  0.7730,  0.3743,  0.1104,  0.4551,
          0.9689, -0.5004, -0.4547,  0.6672, -0.3029,  0.6140,  0.8286,  1.8116,
         -1.5039]], grad_fn=<CatBackward0>)

In [119]:
embeddrop = nn.Dropout(0.4)

In [120]:
z = embeddrop(embeddingz)

In [121]:
z

tensor([[-0.0919,  0.0000, -0.0000,  1.0141,  0.0000,  0.9672,  1.1298,  2.3999,
         -1.2844,  3.4062,  1.7816,  1.9088, -0.0000,  0.0000,  0.0000, -0.0000,
          3.6677],
        [-0.0000, -0.0000,  1.0630,  0.0000,  0.0000,  0.0000,  0.1840,  0.0000,
          1.6149, -0.0000, -0.0000,  1.1120, -0.0000,  1.0233,  1.3810,  0.0000,
         -2.5065]], grad_fn=<MulBackward0>)

In [123]:
cont.shape

torch.Size([120000, 6])

In [None]:
class TabularModel(nn.Module):
    """
    emb_szs = (no_of_categories, embedding dimension) to convert categorical to embeddings
    n_cont = number of continuous features
    out_sz = our desired output dimension
    layers = list of number of neurons per layer from 2nd to l-1
            ex : [100,200,300] 
            100 is the output of the fcn block and the input for the second fcn block
            200 is the input of the second fcn block and input for the third, so on and so forth
            This allows dynamic construction of the architecture based on user input
    """
    
    def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.4):
        super().__init__()

        self.embeds = nn.ModuleList([nn.Embedding(ne,ed) for ne,ed in emb_szs])
        self.embdrop = nn.Dropout(p)
        self.normcont = nn.BatchNorm1d(n_cont)

        layerlist = []

        n_cat = sum([nf, for ne, nf in emb_szs])
        n_in = n_cat + n_cont

        for i in layers:
            layerlist.append(nn.Linear(n_in, i))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(p))
            n_in = i

        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cat, x_cont):

        embeddings = []
        for i,e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:i]))

        x = torch.cat(embeddings,1)
        x = self.embdrop(x)

        x_cont = self.normcont(x_cont)
        x = torch.cat([x, x_cont])

        x = self.layers(x)

        return x