In [None]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn


In [None]:
# Load the NYC taxi fares CSV into a DataFrame.
# The path is an absolute location under /content/drive/MyDrive, so this cell
# only resolves where that directory exists.
# NOTE(review): presumably Google Drive has to be mounted first; no mount cell is visible here — confirm.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Dataset/NYCTaxiFares.csv')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df['fare_amount'].describe()

## Feature engineering 

In [None]:
# Calculate the great-circle (haversine) distance between two GPS coordinates

In [None]:
def haversine_distance(df, lat1, long1, lat2, long2, r=6371):
  """Great-circle distance between two coordinate columns (haversine formula).

  Args:
    df: table indexable by column name (e.g. a pandas DataFrame) whose
      coordinate columns hold degrees.
    lat1, long1: column names of the first point's latitude / longitude.
    lat2, long2: column names of the second point's latitude / longitude.
    r: sphere radius; the result is expressed in the same unit. Defaults to
      6371 (mean Earth radius in km), the value that used to be hard-coded.

  Returns:
    Element-wise distances, one per row of `df`.
  """
  phi1 = np.radians(df[lat1])
  phi2 = np.radians(df[lat2])

  delta_phi = np.radians(df[lat2] - df[lat1])
  delta_lambda = np.radians(df[long2] - df[long1])

  a = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda/2)**2
  # Floating-point rounding (or out-of-range coordinates) can push `a`
  # marginally outside [0, 1], which would turn the square roots below into
  # NaN. Clamp first; genuine NaN inputs still propagate as NaN.
  a = np.minimum(1.0, np.maximum(0.0, a))
  c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
  return r * c

In [None]:
df['dist_km'] = haversine_distance(df, 'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
#Converting string to date-time object
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

In [None]:
df.info()

In [None]:
df['pickup_datetime'][0]

In [None]:
# Shift the pickup timestamps back by a fixed 4 hours and store them as 'EDTdate'.
# NOTE(review): presumably 'pickup_datetime' is in UTC and every row falls inside
# daylight-saving time (EDT = UTC-4); a constant offset would be one hour off for
# winter (EST) dates — confirm against the data.
df['EDTdate'] = df['pickup_datetime'] - pd.Timedelta(hours = 4)

In [None]:
df['Hour'] = df['EDTdate'].dt.hour

In [None]:
#AM or PM
df['AMorPM'] = np.where(df['Hour'] < 12, 'am' , 'pm')

In [None]:
#Name of the day
df['weekday'] = df['EDTdate'].dt.strftime("%a")

In [None]:
df.head()

In [None]:
# Separate the feature columns into categorical and continuous groups
cat_cols = ['Hour', 'AMorPM', 'weekday']
cont_cols = [
  'pickup_longitude',
  'pickup_latitude',
  'dropoff_longitude',
  'dropoff_latitude',
  'passenger_count',
  'dist_km',
]

In [None]:
y_col = ['fare_amount']

In [None]:
# Convert each categorical feature column to pandas' 'category' dtype in place,
# so that integer codes can later be read from .cat.codes
for cat in cat_cols:
  df[cat] = df[cat].astype('category')

In [None]:
df.dtypes

In [None]:
df['Hour'].head()

In [None]:
df['AMorPM'].head()

In [None]:
df['weekday'].head()

In [None]:
#Categories have codes
#as AM=0 and PM = 1
# 24hr got 24 categories as well as 7 week days got 7 categories
df['weekday'].cat.codes.values #toNumpy array

In [None]:
hr = df['Hour'].cat.codes.values
ampm = df['AMorPM'].cat.codes.values
wkdy = df['weekday'].cat.codes.values

In [None]:
cats = np.stack([hr,ampm, wkdy], axis = 1)

In [None]:
cats

In [None]:
#Better alternative
#cats = np.stack([df[col].cat.codes.values for col in cat_cols],1)

In [None]:
cats = torch.tensor(cats, dtype=torch.int64)

In [None]:
conts = np.stack([df[col].values for col in cont_cols], axis=1)

In [None]:
conts

In [None]:
# Build the float32 tensor with the torch.tensor factory instead of the legacy
# torch.FloatTensor constructor, matching how `cats` and `y` are created.
conts = torch.tensor(conts, dtype=torch.float)

In [None]:
y = torch.tensor(df[y_col].values, dtype=torch.float)

In [None]:
conts.shape

In [None]:
cats.shape

In [None]:
#Embedding
cat_szs = [len(df[col].cat.categories) for col in cat_cols]

In [None]:
cat_szs

In [None]:
emb_szs = [ (size, min(50,(size+1)//2)) for size in cat_szs]

In [None]:
emb_szs

In [None]:
catz = cats[:2]

In [None]:
catz

In [None]:
selfembeds = nn.ModuleList([nn.Embedding(ni,nf) for ni, nf in emb_szs])

In [None]:
selfembeds

In [None]:
# Forward-method walkthrough on the sample rows in `catz`:
# the i-th embedding layer has to look up the i-th categorical column
# (previously every layer was fed column 1), as TabularModel.forward does.
embeddingz = []
for i, e in enumerate(selfembeds):
  embeddingz.append(e(catz[:, i]))

In [None]:
embeddingz

In [None]:
z = torch.cat(embeddingz, 1)

In [None]:
z

In [None]:
selfembdrop = nn.Dropout(0.4)

In [None]:
z = selfembdrop(z)

In [None]:
z

In [None]:
class TabularModel(nn.Module):
  """Feed-forward network over embedded categorical and batch-normalised continuous features.

  Args:
    emb_szs: list of (num_categories, embedding_dim) pairs, one per
      categorical column, in column order.
    n_cont: number of continuous input features.
    out_sz: number of output units.
    layers: sizes of the hidden layers; may be empty, in which case the
      concatenated inputs feed the output layer directly.
    p: dropout probability applied to the embeddings and after each hidden layer.
  """

  def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.5):
    super().__init__()

    self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
    self.emb_drop = nn.Dropout(p)
    # Despite its name, this attribute is the batch-norm layer for the
    # continuous inputs; the name is kept so existing state_dict keys stay valid.
    self.n_cont = nn.BatchNorm1d(n_cont)

    n_emb = sum(nf for ni, nf in emb_szs)
    n_in = n_emb + n_cont

    # Each hidden layer is Linear -> ReLU -> BatchNorm -> Dropout.
    layerlist = []
    for i in layers:
      layerlist.append(nn.Linear(n_in, i))
      layerlist.append(nn.ReLU(inplace=True))
      layerlist.append(nn.BatchNorm1d(i))
      layerlist.append(nn.Dropout(p))
      n_in = i

    # n_in is the width feeding the output layer: layers[-1] when hidden layers
    # exist, otherwise the raw input width (indexing layers[-1] would raise on
    # an empty list).
    layerlist.append(nn.Linear(n_in, out_sz))
    self.layers = nn.Sequential(*layerlist)

  def forward(self, x_cat, x_cont):
    """Run the network.

    Args:
      x_cat: integer tensor indexed as x_cat[:, i], column i holding the
        category codes for the i-th embedding.
      x_cont: tensor of continuous features, passed through batch-norm.

    Returns:
      Output of the final linear layer.
    """
    embeddings = []

    for i, e in enumerate(self.embeds):
      embeddings.append(e(x_cat[:, i]))

    x = torch.cat(embeddings, 1)
    x = self.emb_drop(x)

    x_cont = self.n_cont(x_cont)
    x = torch.cat([x, x_cont], 1)
    x = self.layers(x)
    return x
    

In [None]:
torch.manual_seed(33)

In [None]:
model = TabularModel(emb_szs, conts.shape[1] , 1 , [200,100], p =0.4)

In [None]:
model

In [None]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

In [None]:
batch_size = 30000
test_size = int(batch_size * 0.2)

In [None]:
# Data already shuffled, so contiguous slices are used:
# rows [0, train_end) for training and [train_end, batch_size) held out for testing
train_end = batch_size - test_size
cat_train, cat_test = cats[:train_end], cats[train_end:batch_size]
con_train, con_test = conts[:train_end], conts[train_end:batch_size]

In [None]:
# Targets: rows [0, train_end) for training, [train_end, batch_size) for testing
train_end = batch_size - test_size
y_train, y_test = y[:train_end], y[train_end:batch_size]

In [None]:
print(len(cat_train))
print(len(con_train))
print(len(cat_test))
print(len(y_train))
print(len(y_test))



In [None]:
# Full-batch training: each epoch is one forward/backward pass over the whole
# training slice, optimising RMSE (square root of the MSE criterion).
start_time = time.time()

epochs = 500

losses = []

# Explicitly select training mode so dropout / batch-norm behave correctly even
# if the model was switched to eval mode before this cell is (re-)run.
model.train()

for i in range(1, epochs + 1):
  y_pred = model(cat_train, con_train)
  loss = torch.sqrt(criterion(y_pred, y_train))
  # Record a plain float: appending the tensor itself would keep every epoch's
  # autograd graph alive and leave a list that cannot be plotted directly.
  losses.append(loss.item())

  if i%10==1:
    print(f'epoch {i} loss is {loss}')

  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

duration = time.time() - start_time
print(f'Training took {duration/60} mins')

In [None]:
# Plot the per-epoch training RMSE.
# float() accepts plain numbers as well as single-element tensors (including
# ones that still require grad), so the curve draws whichever form `losses`
# holds; the x-axis is sized from the data actually recorded.
plt.plot(range(len(losses)), [float(value) for value in losses])
plt.ylabel('RMSE Loss')
plt.xlabel('epoch');

In [None]:
# TO EVALUATE THE ENTIRE TEST SET
# Switch to inference mode first: otherwise dropout keeps zeroing activations and
# batch-norm normalises with the test batch's own statistics, skewing the RMSE.
model.eval()
with torch.no_grad():
    y_val = model(cat_test, con_test)
    loss = torch.sqrt(criterion(y_val, y_test))
print(f'RMSE: {loss:.8f}')

In [None]:
# Side-by-side view of the first (up to 50) predictions against the true fares
print(f'{"PREDICTED":>12} {"ACTUAL":>8} {"DIFF":>8}')
for i in range(min(50, len(y_val))):  # don't index past a held-out set smaller than 50 rows
    predicted = y_val[i].item()
    actual = y_test[i].item()
    diff = abs(predicted - actual)
    print(f'{i+1:2}. {predicted:8.4f} {actual:8.4f} {diff:8.4f}')