Perform following steps
<ol>
    <li> Perform Feature Engineering</li>
    <ol>
        <li> calculate the distance from give latitude and longitude</li>
        <li> convert the pickup_datetime to date_time object</li>
        <li> convert the date time to newyork time </li>
        <li> get day of week, hour of the day, am/pm </li>
        
</ol>


In [2]:
import torch
import torch.nn as nn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import time

In [4]:
#took a very small section of data 120k rows from 5.5 millions rows

df = pd.read_csv('../Data/NYCTaxiFares.csv')
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1


In [5]:
# df["dropoff_longitude"]

Calculate the Haversine Distance: istance on a sphere between two sets of GPS coordinates

${\displaystyle d=2r\arcsin \left({\sqrt {\sin ^{2}\left({\frac {\varphi _{2}-\varphi _{1}}{2}}\right)+\cos(\varphi _{1})\:\cos(\varphi _{2})\:\sin ^{2}\left({\frac {\lambda _{2}-\lambda _{1}}{2}}\right)}}\right)}$

$\varphi$ is latitude

$\lambda$ is longitude

$r$ is the radius, here earth's radius that is appox 6371 km


In [6]:
r = 6371

In [7]:
##super slow version as the apply function runs over each row seperatly --8 sec!!!!
def haversine_distance(df):
    
    phi2 = np.radians(df["dropoff_latitude"])
    phi1 = np.radians(df["pickup_latitude"])
    del_phi = np.radians(df["dropoff_latitude"] - df["pickup_latitude"])
    del_lambda = np.radians(df["dropoff_longitude"] - df["pickup_longitude"])
    
    term1 = np.sin(del_phi/2)**2 + np.cos(phi1) * np.cos(phi2) \
            * np.sin(del_lambda /2)**2
#     print(term1)
    term2 = 2 * np.arctan2(np.sqrt(term1), np.sqrt(1-term1))
    result = (r * term2)
    return result
    
start_time = time.time()
df["distance"] = df.apply(haversine_distance,axis=1)
df.head()
print(f'\nDuration: {time.time() - start_time:.0f} seconds')


Duration: 11 seconds


In [8]:
#super fast as it is doing matrix operations in numpy -> 0.02 seconds!! 400 times faster
def haversine_distance(df,drop_lat,drop_long,pick_lat,pick_long):
    
    phi2 = np.radians(df[drop_lat])
    phi1 = np.radians(df[pick_lat])
    
    del_phi = np.radians(df[drop_lat] - df[pick_lat])
    
    del_lambda = np.radians(df[drop_long] - df[pick_long])
    
    term1 = np.sin(del_phi/2)**2 + np.cos(phi1) * np.cos(phi2) \
            * np.sin(del_lambda /2)**2
#     print(term1)
    term2 = 2 * np.arctan2(np.sqrt(term1) , np.sqrt(1-term1))
    result = (r * term2)
    return result

start_time = time.time()
df["distance"] = haversine_distance(df,"dropoff_latitude","dropoff_longitude"\
                                    ,"pickup_latitude","pickup_longitude")
df.head()
print(f'\nDuration: {time.time() - start_time:.10f} seconds')


Duration: 0.0462369919 seconds


In [9]:
# convert the pickup_datetime to date_time object
df["nyc_datetime"] = pd.to_datetime(df['pickup_datetime'].str[:-3]) - pd.Timedelta(hours=4)
print(type(df["nyc_datetime"][0]))
df.head()

<class 'pandas._libs.tslibs.timestamps.Timestamp'>


Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance,nyc_datetime
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312,2010-04-19 04:17:56
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307,2010-04-17 11:43:53
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763,2010-04-17 07:23:26
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129,2010-04-11 17:25:03
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321,2010-04-16 22:19:01


In [10]:
#get hours
df["Hour"] = df["nyc_datetime"].dt.hour
df.head(2)

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance,nyc_datetime,Hour
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312,2010-04-19 04:17:56,4
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307,2010-04-17 11:43:53,11


In [11]:
#get the am am feature

# df["AMorPM"] = np.where(df["Hour"]<12 ,"am", "pm")
#both are same
df["AMorPM"] = df["nyc_datetime"].dt.strftime("%p").str.lower()
df.head(5)

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance,nyc_datetime,Hour,AMorPM
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312,2010-04-19 04:17:56,4,am
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307,2010-04-17 11:43:53,11,am
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763,2010-04-17 07:23:26,7,am
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129,2010-04-11 17:25:03,17,pm
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321,2010-04-16 22:19:01,22,pm


In [12]:
# get the weekday feature

df["Weekday"] = df["nyc_datetime"].dt.strftime("%a")
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance,nyc_datetime,Hour,AMorPM,Weekday
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312,2010-04-19 04:17:56,4,am,Mon
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307,2010-04-17 11:43:53,11,am,Sat
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763,2010-04-17 07:23:26,7,am,Sat
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129,2010-04-11 17:25:03,17,pm,Sun
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321,2010-04-16 22:19:01,22,pm,Fri


### taking the hour, weekday and am/pm as categorical data

In [13]:
df.columns

Index(['pickup_datetime', 'fare_amount', 'fare_class', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'distance', 'nyc_datetime', 'Hour', 'AMorPM',
       'Weekday'],
      dtype='object')

In [14]:
categorical_cols = ['Hour', 'AMorPM','Weekday']
continuous_cols = ['pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'distance']
y_cols = ['fare_amount'] #final value to be predicted

### convert the categorical columns into categories

In [15]:
print(df['AMorPM'][:5],df['AMorPM'][:5].astype('category'))
df.head()

0    am
1    am
2    am
3    pm
4    pm
Name: AMorPM, dtype: object 0    am
1    am
2    am
3    pm
4    pm
Name: AMorPM, dtype: category
Categories (2, object): [am, pm]


Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance,nyc_datetime,Hour,AMorPM,Weekday
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312,2010-04-19 04:17:56,4,am,Mon
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307,2010-04-17 11:43:53,11,am,Sat
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763,2010-04-17 07:23:26,7,am,Sat
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129,2010-04-11 17:25:03,17,pm,Sun
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321,2010-04-16 22:19:01,22,pm,Fri


In [16]:
for cats in categorical_cols:
    df[cats]= df[cats].astype('category')

In [17]:
df["AMorPM"].head()

0    am
1    am
2    am
3    pm
4    pm
Name: AMorPM, dtype: category
Categories (2, object): [am, pm]

In [18]:
df["Weekday"].cat.categories ,df["Weekday"].head().cat.codes

(Index(['Fri', 'Mon', 'Sat', 'Sun', 'Thu', 'Tue', 'Wed'], dtype='object'),
 0    1
 1    2
 2    2
 3    3
 4    0
 dtype: int8)

### combine the codes columns into one input array using np.stack, only the values

In [19]:
hr=df["Hour"].cat.codes.values
ap =df["AMorPM"].cat.codes.values
week=df["Weekday"].cat.codes.values

print(hr[:5],week[:5],ap[:5])

cats = np.stack([hr,ap,week] , axis =1) #columnwise

print(cats[:5])

cats_row = np.stack([hr,ap,week] , axis =0) #rowwise

print(cats_row[:5])

[ 4 11  7 17 22] [1 2 2 3 0] [0 0 0 1 1]
[[ 4  0  1]
 [11  0  2]
 [ 7  0  2]
 [17  1  3]
 [22  1  0]]
[[ 4 11  7 ... 14  4 12]
 [ 0  0  0 ...  1  0  1]
 [ 1  2  2 ...  3  5  2]]


### Convert the numpy arrays into tensors

In [20]:
cats = torch.tensor(cats,dtype=torch.int64)
cats[:5]

tensor([[ 4,  0,  1],
        [11,  0,  2],
        [ 7,  0,  2],
        [17,  1,  3],
        [22,  1,  0]])

Also convert the continues variables into torch tensors
##### It is important to store the continuous features into Float (float32) tensors and not Double (float64) in order for the batch normalization to work properly 

In [21]:
conts = np.stack([df[col] for col in continuous_cols], axis=1)
conts = torch.tensor(conts, dtype=torch.float)

conts[:5]

tensor([[-73.9924,  40.7305, -73.9755,  40.7447,   1.0000,   2.1263],
        [-73.9901,  40.7406, -73.9742,  40.7441,   1.0000,   1.3923],
        [-73.9941,  40.7511, -73.9601,  40.7662,   2.0000,   3.3268],
        [-73.9905,  40.7564, -73.9712,  40.7482,   1.0000,   1.8641],
        [-73.9910,  40.7342, -73.9060,  40.7431,   1.0000,   7.2313]])

In [22]:
cats.type(),conts.type(),type(cats[0].type()),cats.dtype,conts.dtype

('torch.LongTensor', 'torch.FloatTensor', str, torch.int64, torch.float32)

get torch tensors for the fare amount 

In [23]:
y=torch.tensor(df["fare_amount"],dtype=torch.float).reshape(-1,1)
y.shape

torch.Size([120000, 1])

In [24]:
cats.shape,conts.shape,y.shape

(torch.Size([120000, 3]), torch.Size([120000, 6]), torch.Size([120000, 1]))

In [25]:
cats[:3],conts[:3],y[:3]

(tensor([[ 4,  0,  1],
         [11,  0,  2],
         [ 7,  0,  2]]),
 tensor([[-73.9924,  40.7305, -73.9755,  40.7447,   1.0000,   2.1263],
         [-73.9901,  40.7406, -73.9742,  40.7441,   1.0000,   1.3923],
         [-73.9941,  40.7511, -73.9601,  40.7662,   2.0000,   3.3268]]),
 tensor([[ 6.5000],
         [ 6.9000],
         [10.1000]]))

# Embedding the categorical data with tensor

Docstring of nn.Embedding module

nn.Embedding( <br>
&nbsp;&nbsp;&nbsp;&nbsp;num_embeddings: int,<br>
&nbsp;&nbsp;&nbsp;&nbsp;embedding_dim: int,<br>
&nbsp;&nbsp;&nbsp;&nbsp;padding_idx: Union[int, NoneType] = None,
&nbsp;&nbsp;&nbsp;&nbsp;max_norm: Union[float, NoneType] = None,
&nbsp;&nbsp;&nbsp;&nbsp;norm_type: float = 2.0,
&nbsp;&nbsp;&nbsp;&nbsp;scale_grad_by_freq: bool = False,
&nbsp;&nbsp;&nbsp;&nbsp;sparse: bool = False,
&nbsp;&nbsp;&nbsp;&nbsp;_weight: Union[torch.Tensor, NoneType] = None,
) -> None

Docstring:     
A simple lookup table that stores embeddings of a fixed dictionary and size.

This module is often used to store word embeddings and retrieve them using indices.
The input to the module is a list of indices, and the output is the corresponding
word embeddings.

In [26]:
e=nn.Embedding(24,12)
e(torch.tensor([0,1,3,5,2,23]))# more than 24 will give error as it will be out of index

tensor([[-0.8869,  1.0431, -0.3532,  1.3048, -0.6177,  0.5759,  0.8930, -0.3477,
          1.9138,  1.2875,  0.1557, -1.4329],
        [-0.7795, -0.8482, -1.1163,  0.0174, -1.8208,  0.9388, -0.4445,  0.4060,
          0.2779,  0.4974,  0.7186, -0.4680],
        [-0.4583, -0.2167, -0.2391,  1.3871,  0.7622, -0.6766, -0.8075,  0.6781,
          0.6786, -1.7861, -0.9126, -1.0419],
        [ 0.7706,  2.0860, -0.2332, -0.5657, -0.4858, -0.9111, -0.2755,  2.3138,
         -2.0265, -1.5767,  0.3435, -2.4025],
        [ 0.0271, -0.2511,  1.2960,  2.7405,  2.2799,  0.7166, -0.0561, -0.1545,
         -1.6475,  0.0965,  0.1379,  0.0145],
        [-1.1074,  0.2608, -1.2838, -0.0524, -0.6001,  0.4290, -1.1779, -2.0437,
          0.4106,  2.0010, -1.2907,  1.0392]], grad_fn=<EmbeddingBackward>)

now we need to define such embeddings and their dimesions for each of our categorical data

In [27]:
# the dimension size should not be more than 50 (ideally) ie 2^50 amount of information
print(df.columns)
print(df["Weekday"].cat.categories)

#get the number of embedding for each category
cat_size= [len(df[col].cat.categories) for col in categorical_cols]
cat_size

Index(['pickup_datetime', 'fare_amount', 'fare_class', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'distance', 'nyc_datetime', 'Hour', 'AMorPM',
       'Weekday'],
      dtype='object')
Index(['Fri', 'Mon', 'Sat', 'Sun', 'Thu', 'Tue', 'Wed'], dtype='object')


[24, 2, 7]

In [28]:
#get number of dimension for each category
emb_size = [(i , min(50 ,(i+1)//2)) for i in cat_size]
emb_size

[(24, 12), (2, 1), (7, 4)]

### example of how it is going to work inside model

In [29]:
selfembeds = nn.ModuleList([nn.Embedding(number_of_embedding,number_of_dimension) for \
                          number_of_embedding,number_of_dimension in emb_size])
selfembeds

ModuleList(
  (0): Embedding(24, 12)
  (1): Embedding(2, 1)
  (2): Embedding(7, 4)
)

In [30]:
list(enumerate(selfembeds)),selfembeds[0],selfembeds[0](torch.tensor([3]))

([(0, Embedding(24, 12)), (1, Embedding(2, 1)), (2, Embedding(7, 4))],
 Embedding(24, 12),
 tensor([[-0.1680, -1.7251, -0.3140,  1.0509, -1.4693, -0.2153,  1.7432,  0.5314,
          -0.0449, -0.4639,  0.8726,  0.0976]], grad_fn=<EmbeddingBackward>))

In [31]:
ct = cats[:1]
print(ct)
embeddingz=[]
for i ,embd in enumerate(selfembeds):
    embs = embd(ct[:,i])
    embeddingz.append(embs)
    print(i,ct[:,i],embs)

tensor([[4, 0, 1]])
0 tensor([4]) tensor([[-0.3626, -0.2221,  0.6802,  0.5701, -0.5164,  0.5442, -0.4835,  0.7680,
         -1.1295,  0.1069,  3.0723, -0.5322]], grad_fn=<EmbeddingBackward>)
1 tensor([0]) tensor([[-1.6872]], grad_fn=<EmbeddingBackward>)
2 tensor([1]) tensor([[-1.4180,  0.9825, -2.8505, -0.9005]], grad_fn=<EmbeddingBackward>)


now concatenate the different embeddings dimensions (12,1,4) into one (17)

In [32]:
z = torch.cat(embeddingz,axis=1)
z,z.shape

(tensor([[-0.3626, -0.2221,  0.6802,  0.5701, -0.5164,  0.5442, -0.4835,  0.7680,
          -1.1295,  0.1069,  3.0723, -0.5322, -1.6872, -1.4180,  0.9825, -2.8505,
          -0.9005]], grad_fn=<CatBackward>),
 torch.Size([1, 17]))

### set the drop outs



Init signature: nn.Dropout(p: float = 0.5, inplace: bool = False) -> None <br>
Docstring:     <br>
During training, randomly zeroes some of the elements of the input
tensor with probability :attr:`p` using samples from a Bernoulli
distribution. Each channel will be zeroed out independently on every forward
call.

This has proven to be an effective technique for regularization and
preventing the co-adaptation of neurons as described in the paper
`Improving neural networks by preventing co-adaptation of feature
detectors`_ .

In [33]:
dropouts =nn.Dropout(.4)

In [34]:
z,dropouts(z)

(tensor([[-0.3626, -0.2221,  0.6802,  0.5701, -0.5164,  0.5442, -0.4835,  0.7680,
          -1.1295,  0.1069,  3.0723, -0.5322, -1.6872, -1.4180,  0.9825, -2.8505,
          -0.9005]], grad_fn=<CatBackward>),
 tensor([[-0.0000, -0.0000,  0.0000,  0.9501, -0.8606,  0.9070, -0.0000,  1.2801,
          -1.8825,  0.1781,  5.1206, -0.8870, -2.8119, -2.3633,  0.0000, -0.0000,
          -0.0000]], grad_fn=<MulBackward0>))

In [35]:
nn.BatchNorm1d

torch.nn.modules.batchnorm.BatchNorm1d

nn.ReLU
nn.BatchNorm1d

$y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta$


$\gamma$ and $\beta$ are learnable parameter vectors
of size `C` (where `C` is the input size)

# Define model

In [36]:
class TabModel(nn.Module):
    super().__init__(self, emb_size, continuous_dim, output_sz, layers, p=0.5):
        """
        emb_size: List of tuples -> with unique number of categories and number of dimension
        for each categorical feature
        
        continuous_dim: int -> number of dimesion of the continuous features, all combined
        
        output_sz: int -> number of outputs, size of last layer,
        
        layer: list of int -> list of dimension of each layer, len of layers is the number of layers
        
        p: float -> dropout probability
        """
        #embedding for categorical data in forward pass
        self.embs = nn.ModuleList([nn.Embedding(n_unique_cats, n_dimension) for n_unique_cats, n_dimnesion in emb_size])
        
        #dimension of embeddings of categorical data
        n_cats_emb = np.sum([n_dimension for _,n_dimension in emb_size])
        
        #dropout layer for categorical data
        self.emb_drop = nn.Dropout(p)
        
        #normalization function for continuous variables - use this in forward pass
        self.cont_batchnorm_layer = nn.BatchNorm1d(continuous_dim)
        
        #dimension of input vector-> combined features 
        n_dim = n_cats_emb + continuous_dim
        
        #list with hold the layer in the NN
        layerlist = []
        
        #itererate over the layers list, itj item is the number of neurons in ith layes
        for i in layers:
            layerslist.append(nn.Linear(n_dim,i)) #linear layer
            layerslist.append(nn.ReLU(inplace=True)) #RelU
            layerslist.append(nn.BatchNorm1d(i)) #Batch Noramlization
            layerslist.append(nn.Dropout(p)) #Dropout layer with given p
            n_dim = i #set the input for next layer
    
        layerlist.append(nn.Linear(layers[-1],output_sz))#last layer
        self.layers = nn.Sequential(*layerlist)
            
        
        

SyntaxError: invalid syntax (3582675413.py, line 2)