In [1]:
import torch.nn as nn
import torch 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


# Feature Engineering 

### We want to convert complex values into logical features for our modle

In [2]:
df = pd.read_csv('Data/NYCTaxiFares.csv')

In [3]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1


In [4]:
df['fare_amount'].describe()

count    120000.000000
mean         10.040326
std           7.500134
min           2.500000
25%           5.700000
50%           7.700000
75%          11.300000
max          49.900000
Name: fare_amount, dtype: float64

## Haversine Formula - Determine Distance of two Geometric Data Points

In [5]:
def haversine_distance(df, lat1, long1, lat2, long2):
    """
    Calculates the haversine distance between 2 sets of GPS coordinates in df
    """
    r = 6371  # average radius of Earth in kilometers
       
    phi1 = np.radians(df[lat1])
    phi2 = np.radians(df[lat2])
    
    delta_phi = np.radians(df[lat2]-df[lat1])
    delta_lambda = np.radians(df[long2]-df[long1])
     
    a = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = (r * c) # in kilometers

    return d



In [6]:
df['dist_km'] = haversine_distance(df,'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')


In [7]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321


In [8]:
#df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])


In [9]:
df['EDTdate'] = pd.to_datetime(df['pickup_datetime'].str[:19]) - pd.Timedelta(hours=4)
df['Hour'] = df['EDTdate'].dt.hour
df['AMorPM'] = np.where(df['Hour']<12,'am','pm')
df['Weekday'] = df['EDTdate'].dt.strftime("%a")
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km,EDTdate,Hour,AMorPM,Weekday
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312,2010-04-19 04:17:56,4,am,Mon
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307,2010-04-17 11:43:53,11,am,Sat
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763,2010-04-17 07:23:26,7,am,Sat
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129,2010-04-11 17:25:03,17,pm,Sun
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321,2010-04-16 22:19:01,22,pm,Fri


In [10]:
"""
Categorical Variables
"""
cat_cols = ['Hour', 'AMorPM', 'Weekday'] # categorical columns
# every hour is a specific category in values from 1-24

cont_cols = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'dist_km']
y_col = ['fare_amount'] # this is the regression value


In [11]:
# category data type for converting categorical values into numerical codes (one_hot_encoding)
for cat in cat_cols: 
    df[cat] = df[cat].astype('category') # change the values into category
df.dtypes

pickup_datetime              object
fare_amount                 float64
fare_class                    int64
pickup_longitude            float64
pickup_latitude             float64
dropoff_longitude           float64
dropoff_latitude            float64
passenger_count               int64
dist_km                     float64
EDTdate              datetime64[ns]
Hour                       category
AMorPM                     category
Weekday                    category
dtype: object

In [12]:
df['Weekday'].head()

0    Mon
1    Sat
2    Sat
3    Sun
4    Fri
Name: Weekday, dtype: category
Categories (7, object): ['Fri', 'Mon', 'Sat', 'Sun', 'Thu', 'Tue', 'Wed']

In [13]:
# category codes vs categories.
df['AMorPM'].cat.categories
df['Weekday'].cat.categories.values #.values returns the values into numpy arrays. 

# important to note, we need to convert values into numpy arrays to conver
# them into Tensors


array(['Fri', 'Mon', 'Sat', 'Sun', 'Thu', 'Tue', 'Wed'], dtype=object)

In [14]:
hr = df['Hour'].cat.codes.values
ampm = df['AMorPM'].cat.codes.values
wkdy = df['Weekday'].cat.codes.values

cats = np.stack([hr, ampm, wkdy], 1)

cats[:5]

array([[ 4,  0,  1],
       [11,  0,  2],
       [ 7,  0,  2],
       [17,  1,  3],
       [22,  1,  0]], dtype=int8)

In [15]:
# Convert categorical variables to a tensor
cats = torch.tensor(cats, dtype=torch.int64)
# this syntax is ok, since the source data is an array, not an existing tensor


# for continuous variables, convert the values in the same way
# Convert continuous variables to a tensor
conts = np.stack([df[col].values for col in cont_cols], 1)
conts = torch.tensor(conts, dtype=torch.float)
conts[:5]

tensor([[-73.9924,  40.7305, -73.9755,  40.7447,   1.0000,   2.1263],
        [-73.9901,  40.7406, -73.9742,  40.7441,   1.0000,   1.3923],
        [-73.9941,  40.7511, -73.9601,  40.7662,   2.0000,   3.3268],
        [-73.9905,  40.7564, -73.9712,  40.7482,   1.0000,   1.8641],
        [-73.9910,  40.7342, -73.9060,  40.7431,   1.0000,   7.2313]])

In [16]:
# conver the labels into a Tensor
labels = torch.tensor(df[y_col].values, dtype=torch.float).reshape(-1,1)
labels

tensor([[ 6.5000],
        [ 6.9000],
        [10.1000],
        ...,
        [12.5000],
        [ 4.9000],
        [ 5.3000]])

In [17]:
cats.shape

torch.Size([120000, 3])

In [18]:
conts.shape

torch.Size([120000, 6])

In [19]:
labels.shape

torch.Size([120000, 1])

In [20]:
# setup the embedding signs for the categorical columns. Embedding is like a one_hot_encoding 
# of vocabulary 
cat_szs = [len(df[col].cat.categories) for col in cat_cols]
cat_szs

[24, 2, 7]

In [21]:
# general rule of thumb 

# max out around 50
# divide num unique entries in each column / 2

emb_szs = [(size,min(50, (size+1)//2)) for size in cat_szs]
emb_szs

# tuples are: 
# number of categories and the embedding size

[(24, 12), (2, 1), (7, 4)]

# Model Definition for Tabular Data

### Categorical Values
### Continuous Values

In [22]:
catz = cats[:4]
catz

tensor([[ 4,  0,  1],
        [11,  0,  2],
        [ 7,  0,  2],
        [17,  1,  3]])

In [23]:
selfembeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
# embedding layers - embedding number and the dimension

# each embedding layer corresponds to a single category in our data. 

In [24]:
selfembeds
# one for ampm
# one for hours
# one for weekdays

ModuleList(
  (0): Embedding(24, 12)
  (1): Embedding(2, 1)
  (2): Embedding(7, 4)
)

In [25]:
# forward method (cats)

embeddingz = []

for i, e in enumerate(selfembeds):
    embeddingz.append(e(catz[:,i]))

In [26]:
embeddingz

[tensor([[ 2.0870e+00,  8.1412e-01,  1.9246e+00,  1.0000e+00,  4.5205e-01,
          -4.2729e-01,  2.3938e-01, -4.5779e-01,  2.1893e-01,  2.4322e-01,
           1.2960e+00, -1.0634e-01],
         [-3.7938e-01,  1.1685e+00,  4.3006e-01,  5.8350e-01, -2.3617e-01,
          -1.4044e-03, -5.0160e-02,  7.4304e-01,  1.1700e-01,  9.2119e-01,
           2.6190e-01,  9.1331e-01],
         [-2.2519e-01, -4.5393e-01,  4.9689e-01,  1.6435e-01,  1.3056e+00,
          -1.3011e-01,  1.4864e+00,  3.9380e-01, -4.3279e-01,  1.2297e-01,
           2.5501e-01, -1.8031e-01],
         [-7.5458e-01,  1.0869e+00,  9.3584e-01, -2.6021e-01, -5.0652e-01,
           4.7013e-01, -2.5099e+00,  2.1694e+00,  1.7101e-01, -6.5003e-03,
           8.0582e-01, -2.8851e-01]], grad_fn=<EmbeddingBackward0>),
 tensor([[1.3928],
         [1.3928],
         [1.3928],
         [0.4002]], grad_fn=<EmbeddingBackward0>),
 tensor([[ 0.2796, -0.8993,  0.7063,  0.3630],
         [ 0.0320, -0.7722, -1.1410,  2.0380],
         [ 0.0320,

In [27]:
z = torch.cat(embeddingz, 1)

In [28]:
z

tensor([[ 2.0870e+00,  8.1412e-01,  1.9246e+00,  1.0000e+00,  4.5205e-01,
         -4.2729e-01,  2.3938e-01, -4.5779e-01,  2.1893e-01,  2.4322e-01,
          1.2960e+00, -1.0634e-01,  1.3928e+00,  2.7959e-01, -8.9925e-01,
          7.0634e-01,  3.6303e-01],
        [-3.7938e-01,  1.1685e+00,  4.3006e-01,  5.8350e-01, -2.3617e-01,
         -1.4044e-03, -5.0160e-02,  7.4304e-01,  1.1700e-01,  9.2119e-01,
          2.6190e-01,  9.1331e-01,  1.3928e+00,  3.1999e-02, -7.7218e-01,
         -1.1410e+00,  2.0380e+00],
        [-2.2519e-01, -4.5393e-01,  4.9689e-01,  1.6435e-01,  1.3056e+00,
         -1.3011e-01,  1.4864e+00,  3.9380e-01, -4.3279e-01,  1.2297e-01,
          2.5501e-01, -1.8031e-01,  1.3928e+00,  3.1999e-02, -7.7218e-01,
         -1.1410e+00,  2.0380e+00],
        [-7.5458e-01,  1.0869e+00,  9.3584e-01, -2.6021e-01, -5.0652e-01,
          4.7013e-01, -2.5099e+00,  2.1694e+00,  1.7101e-01, -6.5003e-03,
          8.0582e-01, -2.8851e-01,  4.0021e-01,  7.6894e-01, -1.5453e+00,
    

In [29]:
selfembdrop = nn.Dropout(0.4) # dropout switches off nodes by chance
z = selfembdrop(z)
# dropout prob for helping against overfitting 

In [30]:
z

tensor([[ 3.4784e+00,  1.3569e+00,  3.2077e+00,  1.6667e+00,  7.5342e-01,
         -7.1215e-01,  3.9897e-01, -0.0000e+00,  3.6489e-01,  4.0537e-01,
          2.1600e+00, -0.0000e+00,  0.0000e+00,  4.6599e-01, -1.4988e+00,
          0.0000e+00,  6.0505e-01],
        [-6.3229e-01,  1.9474e+00,  7.1677e-01,  0.0000e+00, -3.9361e-01,
         -2.3407e-03, -0.0000e+00,  0.0000e+00,  0.0000e+00,  1.5353e+00,
          4.3651e-01,  0.0000e+00,  0.0000e+00,  0.0000e+00, -0.0000e+00,
         -0.0000e+00,  3.3967e+00],
        [-0.0000e+00, -7.5654e-01,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         -2.1685e-01,  0.0000e+00,  6.5634e-01, -7.2131e-01,  0.0000e+00,
          0.0000e+00, -3.0052e-01,  0.0000e+00,  5.3331e-02, -0.0000e+00,
         -1.9016e+00,  0.0000e+00],
        [-0.0000e+00,  0.0000e+00,  1.5597e+00, -0.0000e+00, -0.0000e+00,
          0.0000e+00, -4.1831e+00,  3.6157e+00,  2.8502e-01, -0.0000e+00,
          0.0000e+00, -4.8085e-01,  0.0000e+00,  1.2816e+00, -0.0000e+00,
    

# Model Class Definition

In [31]:
# class for our model
class TabularModel(nn.Module):

    def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        
        layerlist = []
        n_emb = sum((nf for ni,nf in emb_szs))
        n_in = n_emb + n_cont
        
        for i in layers:
            layerlist.append(nn.Linear(n_in,i)) 
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(p))
            n_in = i
        layerlist.append(nn.Linear(layers[-1],out_sz))
            
        self.layers = nn.Sequential(*layerlist)
    
    def forward(self, x_cat, x_cont):
        embeddings = []
        for i,e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:,i]))
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)
        
        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        x = self.layers(x)
        return x

In [32]:
torch.manual_seed(33)
print(emb_szs)
alpha = 0.001
model = TabularModel(emb_szs, conts.shape[1], 2, [200,100], p=0.4) # out_sz = 2
criterion = nn.MSELoss() # np.sqrt(MSE) -> RMSE
optimizer = torch.optim.Adam(model.parameters(), lr=alpha)

model

[(24, 12), (2, 1), (7, 4)]


TabularModel(
  (embeds): ModuleList(
    (0): Embedding(24, 12)
    (1): Embedding(2, 1)
    (2): Embedding(7, 4)
  )
  (emb_drop): Dropout(p=0.4, inplace=False)
  (bn_cont): BatchNorm1d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=23, out_features=200, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=100, out_features=2, bias=True)
  )
)

In [33]:
batch_size = 60000
test_size = 12000

cat_train = cats[:batch_size-test_size]
cat_test = cats[batch_size-test_size:batch_size]
con_train = conts[:batch_size-test_size]
con_test = conts[batch_size-test_size:batch_size]
y_train = labels[:batch_size-test_size]
y_test = labels[batch_size-test_size:batch_size]

In [34]:
import time
start_time = time.time()

epochs = 300
losses = []

for i in range(epochs):
    i+=1
    y_pred = model(cat_train, con_train)
    loss = criterion(y_pred, y_train)
    losses.append(loss)
    
    # a neat trick to save screen space:
    if i%25 == 1:
        print(f'epoch: {i:3}  loss: {loss.item():10.8f}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'epoch: {i:3}  loss: {loss.item():10.8f}') # print the last line
print(f'\nDuration: {time.time() - start_time:.0f} seconds') # print the time elapsed

  return F.mse_loss(input, target, reduction=self.reduction)


epoch:   1  loss: 158.10052490
epoch:  26  loss: 123.30981445
epoch:  51  loss: 106.23386383
epoch:  76  loss: 96.63500214


KeyboardInterrupt: 