In [None]:
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

In [None]:
# Load the NYC taxi fares dataset (path is relative to this notebook) and preview the first rows.
df = pd.read_csv('../data/ann_files/NYCTaxiFares.csv')
df.head()

In [None]:
# Summary statistics of fare_amount, the column later used as the regression target.
df['fare_amount'].describe()

In [None]:
# Because the longitude and latitude values change only slightly over a trip, the raw coordinates alone are hard for the
# model to use; deriving the travelled distance from them gives a more informative feature (this is called feature engineering).
def haversine_distance(df, lat1, long1, lat2, long2, radius=6371):
    """
    Calculates the haversine (great-circle) distance between 2 sets of GPS coordinates in df.

    Parameters
    ----------
    df : DataFrame holding the coordinate columns, in degrees.
    lat1, long1 : column names of the start latitude / longitude.
    lat2, long2 : column names of the end latitude / longitude.
    radius : sphere radius; the result is expressed in the same unit.
        Defaults to 6371, the average radius of Earth in kilometers.

    Returns
    -------
    The row-wise distance (kilometers with the default radius).
    """
    phi1 = np.radians(df[lat1])
    phi2 = np.radians(df[lat2])

    delta_phi = np.radians(df[lat2] - df[lat1])
    delta_lambda = np.radians(df[long2] - df[long1])

    a = np.sin(delta_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt(1 - a) return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius * c

In [None]:
# Add the pickup -> dropoff great-circle distance as a new continuous feature.
df['dist_km'] = haversine_distance(
    df,
    lat1='pickup_latitude',
    long1='pickup_longitude',
    lat2='dropoff_latitude',
    long2='dropoff_longitude',
)
df.head()

In [None]:
# Inspect the column dtypes, in particular that of pickup_datetime.
df.info()

In [None]:
# In its raw form pickup_datetime is not a meaningful input for the network, so it needs feature engineering as well.
# First: parse pickup_datetime (currently stored as strings) into a real datetime column so that date/time parts can be extracted.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

In [None]:
# Check the dtypes again; pickup_datetime should now be reported as a datetime column.
df.info()

In [None]:
# Second: convert the UTC pickup time to New York local time.
# tz_convert applies the correct offset for every timestamp (EDT = UTC-4 during daylight-saving time,
# EST = UTC-5 otherwise) instead of assuming a fixed 4-hour shift, and labels the result with the right time zone.
pickup_utc = df['pickup_datetime']
if pickup_utc.dt.tz is None:
    # Timestamps parsed without zone information are taken to be UTC, as stated for this dataset.
    pickup_utc = pickup_utc.dt.tz_localize('UTC')
df['EDTdate'] = pickup_utc.dt.tz_convert('America/New_York')

In [None]:
# Third: derive categorical features from the local pickup time.
local_pickup = df['EDTdate'].dt
df['Hour'] = local_pickup.hour                            # hour of the day
df['AMorPM'] = np.where(df['Hour']<12, 'am', 'pm')        # half of the day
df['Weekday'] = local_pickup.strftime("%a")               # abbreviated weekday name
df.head()

In [None]:
# All features are now prepared; some are categorical and some are continuous.
# List the available columns before splitting them into those two groups.
df.columns

In [None]:
# Columns fed through embedding layers.
cat_cols = [
    'Hour',
    'AMorPM',
    'Weekday',
]
# Columns fed to the network as numeric values.
cont_cols = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
    'dist_km',
]
# Regression target.
y_col = ['fare_amount']

In [None]:
# Column dtypes before the categorical columns are converted.
df.dtypes

In [None]:
# Convert every categorical feature column to pandas' 'category' dtype in one call.
df = df.astype({column_name: 'category' for column_name in cat_cols})

In [None]:
# Column dtypes after the columns in cat_cols were converted to 'category'.
df.dtypes

In [None]:
# Display the Hour column, now of category dtype.
df['Hour']

In [None]:
# Display the AMorPM column, now of category dtype.
df['AMorPM']

In [None]:
# Display the Weekday column, now of category dtype.
df['Weekday']

In [None]:
# The distinct category labels pandas stores for AMorPM.
df['AMorPM'].cat.categories

In [None]:
# The integer code of each row's AMorPM value, i.e. its position in .cat.categories.
df['AMorPM'].cat.codes

In [None]:
# The distinct category labels pandas stores for Weekday.
df['Weekday'].cat.categories

In [None]:
# The integer code of each row's Weekday value, i.e. its position in .cat.categories.
df['Weekday'].cat.codes

In [None]:
# Integer category codes of each categorical column, as NumPy arrays.
hr, am_pm, week_day = (
    df[column_name].cat.codes.values
    for column_name in ('Hour', 'AMorPM', 'Weekday')
)

In [None]:
# One row per record, one column per categorical feature.
cats = np.column_stack((hr, am_pm, week_day))

In [None]:
# One row per record, one column per continuous feature.
# A comprehension is used here; `cats` could be built the same way but was written out step by step for readability.
conts = np.column_stack([df[column_name].values for column_name in cont_cols])

In [None]:
# Convert the NumPy arrays to tensors: int64 indices for the embeddings, float32 for everything else.
cats = torch.tensor(cats).long()
conts = torch.tensor(conts).float()
y = torch.tensor(df[y_col].values).float()

In [None]:
# Define the embedding specification for each categorical column.
# Rule of thumb: the embedding dimension is half the number of unique categories (rounded up), capped at 50.
cat_szs = []
emb_szs = []
for column_name in cat_cols:
    n_categories = len(df[column_name].cat.categories)
    cat_szs.append(n_categories)
    emb_szs.append((n_categories, min(50, (n_categories + 1) // 2)))
emb_szs

In [None]:
########## The following cells only illustrate how the embedding layers work

In [None]:
# Take the first two rows of the categorical tensor as a small sample.
sub_cat = cats[:2]
sub_cat

In [None]:
# One embedding table per categorical column, sized according to emb_szs.
self_embeds = nn.ModuleList()
for n_categories, emb_dim in emb_szs:
    self_embeds.append(nn.Embedding(num_embeddings=n_categories, embedding_dim=emb_dim))
self_embeds

In [None]:
# Look up column i of the sample in the i-th embedding table.
embeddingz = [embed(sub_cat[:, col_idx]) for col_idx, embed in enumerate(self_embeds)]

In [None]:
# One tensor per categorical column, each holding the embedding vectors of the sample rows.
embeddingz

In [None]:
# Join the per-column embeddings side by side so that each row becomes a single vector.
concated_embeddings = torch.cat(embeddingz, dim=1)
concated_embeddings

In [None]:
# Dropout layer that zeroes each element with probability 0.4 while in training mode.
self_drops = nn.Dropout(p=0.4)

In [None]:
# Apply dropout to the concatenated embeddings; in training mode some entries are zeroed
# and the remaining ones are scaled up by 1/(1-p).
concated_embeddings_dropout = self_drops(concated_embeddings)
concated_embeddings_dropout

In [None]:
##########

In [None]:
class TabularModel(nn.Module):
    """Feed-forward network for tabular data with categorical and continuous inputs.

    Each categorical column goes through its own embedding table; the embeddings are
    concatenated, passed through dropout, joined with the batch-normalised continuous
    features and fed to a stack of Linear -> ReLU -> BatchNorm1d -> Dropout blocks,
    followed by a final Linear output layer.

    Parameters
    ----------
    emb_szs : list of (num_embeddings, embedding_dim) tuples, one per categorical column.
    n_cont : number of continuous input features.
    out_sz : number of output units.
    layers : sizes of the hidden layers; may be empty, in which case the inputs are
        connected directly to the output layer.
    p : dropout probability used after the embeddings and after every hidden block.
    """

    def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(num_embeddings, embedding_dim)
                                     for num_embeddings, embedding_dim in emb_szs])
        self.embed_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        # Width of the vector entering the first layer: all embedding dims + continuous features.
        num_in = sum(embedding_dim for _, embedding_dim in emb_szs) + n_cont

        layerslist = []
        for width in layers:
            layerslist.append(nn.Linear(num_in, width))
            layerslist.append(nn.ReLU(inplace=True))
            layerslist.append(nn.BatchNorm1d(width))
            layerslist.append(nn.Dropout(p))
            num_in = width

        # Use the running width rather than layers[-1] so that an empty `layers`
        # list does not raise an IndexError.
        layerslist.append(nn.Linear(num_in, out_sz))

        self.layers = nn.Sequential(*layerslist)

    def forward(self, x_cat, x_cont):
        """Return predictions for a batch.

        x_cat is indexed as x_cat[:, i] to get the integer codes of the i-th categorical
        column; x_cont holds the continuous features, one row per sample.
        """
        embeddings = [embed(x_cat[:, i]) for i, embed in enumerate(self.embeds)]
        x_cat = torch.cat(embeddings, dim=1)
        x_cat = self.embed_drop(x_cat)

        x_cont = self.bn_cont(x_cont)

        x = torch.cat([x_cat, x_cont], dim=1)
        return self.layers(x)

In [None]:
# Fix the seed so the weight initialisation is reproducible, then build the network.
torch.manual_seed(33)
model = TabularModel(
    emb_szs=emb_szs,
    n_cont=conts.shape[1],
    out_sz=1,
    layers=[200, 100],
    p=0.4,
)
model

In [None]:
# Mean-squared-error loss (its square root is taken in the training loop) and an Adam optimizer.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Only the first `batch_size` rows are used, simply to shorten training time.
# Note: this is not a mini-batch size in the gradient-descent sense; the whole training split is processed in one pass.
batch_size = 60_000
test_fraction = 0.2
test_size = int(batch_size * test_fraction)

In [None]:
# DATA IS SHUFFLED ALREADY, so the split is a plain positional slice.
train_end = batch_size - test_size
cat_train, cat_test = cats[:train_end], cats[train_end:batch_size]
cont_train, cont_test = conts[:train_end], conts[train_end:batch_size]

In [None]:
# Split the target at the same row boundary as the features.
train_end = batch_size - test_size
y_train, y_test = y[:train_end], y[train_end:batch_size]

In [None]:
# Full-batch training: every epoch is one forward/backward pass over the whole training split.
start_time = time.time()
epochs = 300
losses = []

# Make sure Dropout/BatchNorm are in training mode, even if an evaluation cell was run before this one.
model.train()

for epoch in range(1, epochs + 1):
    y_pred = model(cat_train, cont_train)
    loss = torch.sqrt(criterion(y_pred, y_train))     # this means RMSE

    # Record a plain float: appending the tensor itself would keep a reference to every
    # epoch's autograd graph and cannot be plotted directly by matplotlib.
    loss_value = loss.item()
    losses.append(loss_value)

    if epoch % 10 == 1:
        print(f'epoch: {epoch}, loss is {loss_value:.8f}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

duration = time.time() - start_time
print(f'training took {duration/60} minutes')

In [None]:
# Training curve: RMSE per epoch.
# float() turns every entry into a plain number, so this works whether `losses` holds Python floats
# or 0-dim tensors that still require grad (which matplotlib cannot convert to an array).
loss_values = [float(entry) for entry in losses]
fig, ax = plt.subplots()
ax.plot(range(1, len(loss_values) + 1), loss_values)
ax.set_xlabel('epoch')
ax.set_ylabel('RMSE loss')
ax.set_title('Training loss')
plt.show()

In [None]:
# Evaluate on the held-out split.
# The model must be in eval mode here: otherwise Dropout keeps zeroing activations and BatchNorm
# normalises with the statistics of the test batch, which distorts the validation RMSE.
was_training = model.training
model.eval()
with torch.no_grad():
    y_val = model(cat_test, cont_test)
    loss = torch.sqrt(criterion(y_val, y_test))
# Restore the previous mode so that re-running the training cell afterwards behaves as before.
model.train(was_training)
loss

In [None]:
# Compare the first ten predictions with the true fares.
for i in range(10):
    predicted = y_val[i].item()
    actual = y_test[i].item()
    diff = np.abs(predicted - actual)
    print(f'{i}) predicted: {predicted: 8.2f},      true: {actual: 8.2f}    diff: {diff: 8.2f}')

In [None]:
# Persist only the learned parameters (state_dict), not the whole model object.
# NOTE(review): assumes the ../models directory already exists — confirm before running.
torch.save(model.state_dict(), '../models/my_taxi_model_1.pt')