In [None]:
import io
import urllib.request

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

In [None]:
df = pd.read_csv('../input/new-york-city-taxi-fare-prediction/train.csv', nrows=999999)

In [None]:
df

In [None]:
df['fare_amount'].describe()

In [None]:
# Haversine formula: determining the distance between 2 points given their latitude and longitude

In [None]:
from numpy import radians, cos, sin, arcsin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    NOTE: the argument order is longitude first, then latitude, for each
    point. Passing (lat, lon) pairs silently produces wrong distances.

    Parameters
    ----------
    lon1, lat1 : scalar, array-like or pandas Series
        Longitude and latitude of the first point(s), in decimal degrees.
    lon2, lat2 : scalar, array-like or pandas Series
        Longitude and latitude of the second point(s), in decimal degrees.

    Returns
    -------
    numpy.ndarray (NumPy scalar for scalar inputs)
        Distance in kilometres, element-wise over the inputs.
    """
    earth_radius_km = 6371

    # Convert decimal degrees to radians. np.asarray accepts Series, arrays,
    # lists and plain numbers alike, so inputs no longer need a `.values`
    # attribute; for a Series the result is the same ndarray as before.
    lon1, lat1, lon2, lat2 = (
        np.radians(np.asarray(coord)) for coord in (lon1, lat1, lon2, lat2)
    )

    # Haversine formula.
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * (np.cos(lat2) * np.sin(dlon / 2) ** 2)
    central_angle = 2 * np.arcsin(np.sqrt(a))

    return central_angle * earth_radius_km
def distance(s_lat, s_lng, e_lat, e_lng, radius_km=6371.0):
    """
    Great-circle (haversine) distance between a start and an end point.

    Parameters
    ----------
    s_lat, s_lng : scalar, array-like or pandas Series
        Latitude and longitude of the start point(s), in decimal degrees.
    e_lat, e_lng : scalar, array-like or pandas Series
        Latitude and longitude of the end point(s), in decimal degrees.
    radius_km : float, optional
        Sphere radius in kilometres. Defaults to 6371, the same value the
        other haversine helpers in this notebook use, so that the distance
        feature is computed identically at training and inference time.

    Returns
    -------
    Distance in kilometres, element-wise over the inputs (a Series when the
    inputs are Series).
    """
    # Convert all four coordinates to radians the same way.
    s_lat = np.deg2rad(s_lat)
    s_lng = np.deg2rad(s_lng)
    e_lat = np.deg2rad(e_lat)
    e_lng = np.deg2rad(e_lng)

    d = np.sin((e_lat - s_lat)/2)**2 + np.cos(s_lat)*np.cos(e_lat) * np.sin((e_lng - s_lng)/2)**2

    return 2 * radius_km * np.arcsin(np.sqrt(d))

# from haversine import haversine


In [None]:
# haversine() expects (lon, lat, lon, lat) whereas distance() expects
# (lat, lng, lat, lng); pass the columns in the order each function declares.
df['dist_kmm'] = haversine(df['pickup_longitude'], df['pickup_latitude'], df['dropoff_longitude'], df['dropoff_latitude'])
df['dist_km'] = distance(df['pickup_latitude'], df['pickup_longitude'], df['dropoff_latitude'], df['dropoff_longitude'])

In [None]:
df.head()

In [None]:
df.info()

In [None]:
def plot_on_map(df, BB, nyc_map, s=10, alpha=0.2):
    """Scatter pickup and dropoff coordinates side by side over a map image.

    Parameters
    ----------
    df : DataFrame with pickup_/dropoff_ longitude and latitude columns.
    BB : sequence of four numbers used as (x-min, x-max, y-min, y-max); it
        sets the axis limits and the extent of the background image.
    nyc_map : image array drawn behind the points.
    s, alpha : marker size and transparency passed to ``scatter``.
    """
    fig, axs = plt.subplots(1, 2, figsize=(16,10))

    # One entry per panel: (x column, y column, panel title).
    panels = (
        ('pickup_longitude', 'pickup_latitude', 'Pickup locations'),
        ('dropoff_longitude', 'dropoff_latitude', 'Dropoff locations'),
    )
    for ax, (lon_col, lat_col, title) in zip(axs, panels):
        ax.scatter(df[lon_col], df[lat_col], zorder=1, alpha=alpha, c='r', s=s)
        ax.set_xlim((BB[0], BB[1]))
        ax.set_ylim((BB[2], BB[3]))
        ax.set_title(title)
        # zorder=0 keeps the map underneath the scatter points.
        ax.imshow(nyc_map, zorder=0, extent=BB)
BB = (-74.5, -72.8, 40.5, 41.8)
# plt.imread() rejects URLs in current Matplotlib releases, so download the
# bytes ourselves and hand imread a seekable in-memory PNG instead.
with urllib.request.urlopen('https://aiblog.nl/download/nyc_-74.5_-72.8_40.5_41.8.png') as response:
    nyc_map = plt.imread(io.BytesIO(response.read()), format='png')
plot_on_map(df, BB, nyc_map, s=1, alpha=0.3)

In [None]:
# convert datetime string to datetime
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

In [None]:
df.info()

In [None]:
one_time = df['pickup_datetime'][0]

In [None]:
one_time.hour

In [None]:
# Shift the pickup timestamp back by a fixed 4 hours; the Hour/AMPM/Weekday/DoW
# features below are all derived from this shifted column.
# NOTE(review): presumably pickup_datetime is UTC and -4h is meant as New York
# daylight time (EDT); a constant offset would be one hour off during the
# winter (EST) months — confirm, or consider a timezone-aware conversion.
df['EDTdate'] = df['pickup_datetime'] - pd.Timedelta(hours=4)

In [None]:
df['Hour'] = df['EDTdate'].dt.hour

In [None]:
df['AMPM'] = np.where(df['Hour']<12, 'am', 'pm')

In [None]:
df.head()

In [None]:
df['Weekday'] = df['EDTdate'].dt.strftime("%a")
df['DoW'] = df['EDTdate'].dt.dayofweek

In [None]:
df.head()

In [None]:
df.columns

In [None]:
cat_cols = ['Hour', 'AMPM', 'Weekday', 'DoW']
cont_cols = ['pickup_longitude','pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'dist_km']

In [None]:
y_col = ['fare_amount']

In [None]:
# Categorical to numeric

In [None]:
df.dtypes

In [None]:
# Cast every categorical feature to pandas' 'category' dtype so that the
# integer codes (.cat.codes) and the number of categories (.cat.categories)
# can be read off in the cells below.
for cat in cat_cols:
    df[cat] = df[cat].astype('category')

In [None]:
df.dtypes

In [None]:
df['Weekday']

In [None]:
df['AMPM'].cat.categories

In [None]:
df['AMPM'].cat.codes.values

In [None]:
hr = df['Hour'].cat.codes.values
ampm = df['AMPM'].cat.codes.values
wd = df['Weekday'].cat.codes.values
dw = df['DoW'].cat.codes.values

In [None]:
dw

In [None]:
cats = np.stack([hr, ampm, wd, dw], axis=1)

In [None]:
cats

In [None]:
# categorical NumPy array to tensor
cats = torch.tensor(cats, dtype=torch.int64)

In [None]:
# continuous features to tensor
conts = np.stack([df[col].values for col in cont_cols], axis=1)
conts = torch.tensor(conts, dtype=torch.float)
conts

In [None]:
# label to tensor
y = torch.tensor(df[y_col].values, dtype=torch.float).reshape(-1, 1)

In [None]:
cats.shape

In [None]:
conts.shape

In [None]:
y.shape

In [None]:
cats_size = [len(df[col].cat.categories) for col in cat_cols]

In [None]:
cats_size

In [None]:
embedding_size = [(size, min(50, (size+1)//2)) for size in cats_size]

In [None]:
embedding_size

In [None]:
# Tabular model - using embedding layer

In [None]:
catz = cats[:2]

In [None]:
catz

In [None]:
selfembeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embedding_size])

In [None]:
selfembeds

In [None]:
# forward
embedding_z = []

for i, e in enumerate(selfembeds):
    embedding_z.append(e(catz[:,i]))

In [None]:
embedding_z

In [None]:
z = torch.cat(embedding_z, 1)

In [None]:
z

In [None]:
selfembeddingdrop = nn.Dropout(0.4)

In [None]:
z = selfembeddingdrop(z)

In [None]:
z

In [None]:
class TabularModel(nn.Module):
    """Feed-forward network for tabular data with embedded categorical inputs.

    The number and width of the hidden layers is given by ``layers``, so the
    depth of the network can be chosen at construction time.

    Parameters
    ----------
    emb_size : list of (num_categories, embedding_dim) tuples
        One entry per categorical column, in column order.
    n_cont : int
        Number of continuous input features.
    out_size : int
        Number of output units.
    layers : list of int
        Width of each hidden layer; may be empty for a single linear layer.
    p : float
        Dropout probability used on the embeddings and after each hidden layer.
    """

    def __init__(self, emb_size, n_cont, out_size, layers, p=0.5):
        super().__init__()
        # Build the embeddings from the constructor argument, not from a
        # notebook-level global, so the class works with any embedding spec.
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_size])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        # Input width = concatenated embedding widths + continuous features.
        n_in = sum(nf for _, nf in emb_size) + n_cont

        layer_list = []
        for width in layers:
            layer_list.append(nn.Linear(n_in, width))
            layer_list.append(nn.ReLU(inplace=True))
            layer_list.append(nn.BatchNorm1d(width))
            layer_list.append(nn.Dropout(p))
            n_in = width

        # n_in now holds the width feeding the output layer; unlike layers[-1]
        # this is also correct when `layers` is empty.
        layer_list.append(nn.Linear(n_in, out_size))
        self.layers = nn.Sequential(*layer_list)

    def forward(self, x_cat, x_cont):
        """Run a batch through the network.

        x_cat is indexed as x_cat[:, i] for the i-th embedding, and x_cont is
        batch-normalised and concatenated with the embeddings along dim 1.
        """
        embeddings = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeds)]

        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)

        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        return self.layers(x)

In [None]:
torch.manual_seed(33)
model = TabularModel(embedding_size, conts.shape[1], 1, [200, 100], p=0.4)
# for classification problem, use class size 2 instead of 1

In [None]:
model

In [None]:
criterion = nn.MSELoss()
# for classification problem: use nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
# train_test_split
batch_size = 60000
test_size = int(batch_size*0.2)

In [None]:
# shuffled 
cat_train = cats[:batch_size-test_size] 
cat_test = cats[batch_size - test_size:batch_size]

con_train = conts[:batch_size-test_size]
con_test = conts[batch_size - test_size:batch_size]

In [None]:
y_train = y[:batch_size-test_size]
y_test = y[batch_size - test_size:batch_size]

In [None]:
len(cat_train)

In [None]:
len(con_train)

In [None]:
len(y_train)

In [None]:
len(cat_test)

In [None]:
import time
start_time = time.time()

epochs = 200

# Per-epoch RMSE stored as plain Python floats: keeping the loss tensors would
# hold every epoch's autograd graph alive and cannot be plotted directly.
losses = []

# Make sure Dropout/BatchNorm are in training mode even if this cell is re-run
# after the model was switched to eval mode elsewhere.
model.train()

for i in range(1, epochs + 1):
    y_pred = model(cat_train, con_train)
    # RMSE: square root of the MSE criterion.
    loss = torch.sqrt(criterion(y_pred, y_train))
    losses.append(loss.item())
    if i%25 == 1:
        print(f"epoch:{i} loss: {loss}")
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
duration = time.time() - start_time
print(f"training time: {duration/60}min")

In [None]:
plt.plot(losses)

In [None]:
# Switch to evaluation mode first: no_grad() only disables gradient tracking,
# it does not turn off Dropout or make BatchNorm use its running statistics.
model.eval()
with torch.no_grad():
    y_val = model(cat_test, con_test)
    loss = torch.sqrt(criterion(y_val,y_test))

In [None]:
loss

In [None]:
for i in range(10):
    diff = np.abs(y_val[i].item()-y_test[i].item())
    print(f"{i}predicted {y_val[i].item():8.2f} True:{y_test[i].item():8.2f} DIFF: {diff:8.2f}")

In [None]:
torch.save(model.state_dict(), 'taxi_model_kaggle_pytorch.pt')

In [None]:
# Loading saved model 
import torch
import torch.nn as nn
import numpy as np
import pandas as pd

def haversine_distance(df, lat1, long1, lat2, long2):
    """Haversine distance in km between two coordinate pairs stored in ``df``.

    ``lat1``/``long1`` and ``lat2``/``long2`` are the names of the columns
    holding the first and second point's latitude/longitude in decimal
    degrees. Returns one distance per row.
    """
    earth_radius_km = 6371

    lat_a = np.radians(df[lat1])
    lat_b = np.radians(df[lat2])
    half_dlat = np.radians(df[lat2]-df[lat1]) / 2
    half_dlon = np.radians(df[long2]-df[long1]) / 2

    hav = np.sin(half_dlat)**2 + np.cos(lat_a) * np.cos(lat_b) * np.sin(half_dlon)**2
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1-hav))
    return earth_radius_km * central_angle

class TabularModel(nn.Module):
    """Tabular network: embedded categoricals + batch-normed continuous inputs.

    Parameters
    ----------
    emb_szs : list of (num_categories, embedding_dim) tuples, one per
        categorical column.
    n_cont : number of continuous features.
    out_sz : number of output units.
    layers : hidden layer widths; may be empty.
    p : dropout probability.
    """

    def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        # Width of the concatenated [embeddings, continuous] input vector.
        n_in = sum(nf for _, nf in emb_szs) + n_cont

        layerlist = []
        for width in layers:
            layerlist += [
                nn.Linear(n_in, width),
                nn.ReLU(inplace=True),
                nn.BatchNorm1d(width),
                nn.Dropout(p),
            ]
            n_in = width
        # Use the tracked width rather than layers[-1] so that an empty
        # `layers` list yields a single linear layer instead of an IndexError.
        layerlist.append(nn.Linear(n_in, out_sz))
        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cat, x_cont):
        """Embed column i of x_cat with the i-th embedding, append the
        batch-normalised x_cont, and apply the layer stack."""
        embeddings = [e(x_cat[:, i]) for i, e in enumerate(self.embeds)]
        x = self.emb_drop(torch.cat(embeddings, 1))
        x = torch.cat([x, self.bn_cont(x_cont)], 1)
        return self.layers(x)

In [None]:
# Rebuild the architecture that was trained and saved above, then restore its
# weights. There must be one (num_categories, embedding_dim) pair per
# categorical column used in training (Hour, AMPM, Weekday, DoW); with only
# three pairs the saved state_dict would not fit the network.
# NOTE(review): assumes all 24 hours and all 7 weekdays occur in the training
# rows, i.e. that this equals the notebook's `embedding_size` — confirm.
emb_szs = [(24, 12), (2, 1), (7, 4), (7, 4)]
model2 = TabularModel(emb_szs, 6, 1, [200,100], p=0.4)
model2.load_state_dict(torch.load('taxi_model_kaggle_pytorch.pt', map_location='cpu'))
# Inference only: disable Dropout and use BatchNorm running statistics.
model2.eval();

In [None]:
def test_data(mdl): # pass in the name of the new model
    """Prompt for one trip on stdin, print and return the predicted fare.

    The features are built the same way and in the same column order as the
    training tensors in this notebook (categoricals: Hour, AM/PM, Weekday,
    DoW; continuous: pickup lon/lat, dropoff lon/lat, passenger count,
    distance). The entered date/time is used as-is, with no timezone shift.

    Parameters
    ----------
    mdl : nn.Module
        Model called as ``mdl(x_cat, x_cont)``.

    Returns
    -------
    float
        The predicted fare amount.
    """
    # INPUT NEW DATA
    plat = float(input('What is the pickup latitude?  '))
    plong = float(input('What is the pickup longitude? '))
    dlat = float(input('What is the dropoff latitude?  '))
    dlong = float(input('What is the dropoff longitude? '))
    psngr = int(input('How many passengers? '))
    dt = input('What is the pickup date and time?\nFormat as YYYY-MM-DD HH:MM:SS     ')

    # PREPROCESS THE DATA
    dfx_dict = {'pickup_latitude':plat,'pickup_longitude':plong,'dropoff_latitude':dlat,
         'dropoff_longitude':dlong,'passenger_count':psngr,'EDTdate':dt}
    dfx = pd.DataFrame(dfx_dict, index=[0])
    dfx['dist_km'] = haversine_distance(dfx,'pickup_latitude', 'pickup_longitude',
                                        'dropoff_latitude', 'dropoff_longitude')
    dfx['EDTdate'] = pd.to_datetime(dfx['EDTdate'])

    # We can skip the .astype(category) step since our fields are small,
    # and encode them right away
    dfx['Hour'] = dfx['EDTdate'].dt.hour
    dfx['AMorPM'] = np.where(dfx['Hour']<12,0,1)
    # Weekday codes follow the alphabetical order of the abbreviations.
    weekday_codes = {'Fri': 0, 'Mon': 1, 'Sat': 2, 'Sun': 3, 'Thu': 4, 'Tue': 5, 'Wed': 6}
    dfx['Weekday'] = dfx['EDTdate'].dt.strftime("%a").map(weekday_codes).astype('int64')
    # The training tensors carry day-of-week as a fourth categorical column.
    dfx['DoW'] = dfx['EDTdate'].dt.dayofweek

    # CREATE CAT AND CONT TENSORS
    # Column order must match the order used to build the training tensors;
    # the model only sees positions, not names.
    cat_cols = ['Hour', 'AMorPM', 'Weekday', 'DoW']
    cont_cols = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
                 'dropoff_latitude', 'passenger_count', 'dist_km']
    xcats = np.stack([dfx[col].values for col in cat_cols], 1)
    xcats = torch.tensor(xcats, dtype=torch.int64)
    xconts = np.stack([dfx[col].values for col in cont_cols], 1)
    xconts = torch.tensor(xconts, dtype=torch.float)

    # PASS NEW DATA THROUGH THE MODEL WITHOUT PERFORMING A BACKPROP
    # Evaluation mode is required: in training mode BatchNorm1d rejects a
    # batch of one row and Dropout would randomise the prediction. The
    # caller's train/eval state is restored afterwards.
    was_training = mdl.training
    mdl.eval()
    try:
        with torch.no_grad():
            z = mdl(xcats, xconts)
    finally:
        mdl.train(was_training)

    fare = z.item()
    print(f'\nThe predicted fare amount is ${fare:.2f}')
    return fare

In [None]:
z = test_data(model)