In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
import numpy as np
import pandas as pd
import datetime as dt
import time
import math
import matplotlib.pyplot as plt
import seaborn as sns
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# Data Loading

In [None]:
# Read only the first rows of the training CSV to keep the notebook fast.
train_csv_path = '../input/new-york-city-taxi-fare-prediction/train.csv'
rows_to_read = 300000
df = pd.read_csv(train_csv_path, nrows=rows_to_read)
df.head()

# Shuffling

In [None]:
# Shuffle every row and renumber the index. The seed is fixed because the
# train/test split further down is positional (first 80% / last 20%), so an
# unseeded shuffle would give a different split - and different results - on every run.
df=df.sample(frac=1,random_state=42).reset_index(drop=True)
df.head()

In [None]:
# Show the per-column missing-value counts before dropping. The count is not the
# last expression of the cell, so it must be printed explicitly to be visible.
print(df.isna().sum())
# Discard every row that has at least one missing value.
df=df.dropna()
    

In [None]:
# Per-column count of missing values, shown as the cell output.
df.isna().sum()

In [None]:
# pickup_datetime is recorded in UTC. Convert it to New York local time with a
# real time-zone conversion instead of a fixed 4-hour shift: the offset is UTC-4
# only during daylight saving time and UTC-5 otherwise, so a constant shift
# mislabels the hour for part of the year. The conversion is also idempotent,
# whereas subtracting a Timedelta moved the timestamps again on every re-run of the cell.
df['pickup_datetime']=pd.to_datetime(df['pickup_datetime'],utc=True).dt.tz_convert('America/New_York')
# Local hour of day (0-23) and abbreviated weekday name, used later as categorical features.
df['Hour']=df['pickup_datetime'].dt.hour
df['Weekday']=df['pickup_datetime'].dt.strftime("%a")

df.head()

In [None]:
def distance(df,lat,long,lat1,long1,radius=6371.0):
    """Haversine (great-circle) distance between two points stored column-wise.

    Parameters
    ----------
    df : mapping of column name -> array-like
        Anything indexable by the column names below (e.g. a pandas DataFrame
        or a dict of NumPy arrays). Coordinates are given in degrees.
    lat, long : str
        Names of the latitude and longitude columns of the first point.
    lat1, long1 : str
        Names of the latitude and longitude columns of the second point.
    radius : float, default 6371.0
        Sphere radius; the result is expressed in the same unit
        (6371 is the mean Earth radius in kilometres).

    Returns
    -------
    Element-wise distance, of the same array type as the input columns.
    """
    phi_a=np.radians(df[lat])
    phi_b=np.radians(df[lat1])
    half_dphi=np.radians(df[lat1]-df[lat])/2
    half_dlambda=np.radians(df[long1]-df[long])/2
    # Haversine term: mathematically within [0, 1].
    a=np.sin(half_dphi)**2 + np.cos(phi_a)*np.cos(phi_b)*np.sin(half_dlambda)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make the square roots below return NaN.
    a=np.minimum(np.maximum(a,0.0),1.0)
    central_angle=2*np.arctan2(np.sqrt(a),np.sqrt(1-a))
    return radius*central_angle

# Distance Between Coordinates

In [None]:
# Trip length between the pickup and drop-off coordinates, computed by distance().
pickup_cols=('pickup_latitude','pickup_longitude')
dropoff_cols=('dropoff_latitude','dropoff_longitude')
df['distance']=distance(df,*pickup_cols,*dropoff_cols)

In [None]:
# Summary statistics of the computed trip distances.
df['distance'].describe()

In [None]:
# Tag each trip as 'am' (Hour < 12) or 'pm' (Hour >= 12).
# Built in a single vectorised assignment: the previous chained form
# df['Am-Pm'][mask] = 'pm' writes through an intermediate object, which raises
# SettingWithCopyWarning and silently does nothing under pandas copy-on-write.
df['Am-Pm']=np.where(df['Hour']>=12,'pm','am')
df.head()

# Data Understanding

In [None]:
# The data contains some rows whose fare_amount is below 0; keep only the
# non-negative fares, both in the working frame and in the exploration copy df1.
non_negative_fare=df['fare_amount']>=0
df1=df[non_negative_fare]
df=df[non_negative_fare]
df1.shape

In [None]:
# Histogram of the fare amounts using 20 bins.
fig,ax=plt.subplots()
ax.hist(df['fare_amount'],bins=20)
plt.show()


In [None]:
# Summary statistics of the fare amounts that remain after filtering.
df['fare_amount'].describe()

In [None]:
# Candidate categorical features (col1) plus the target column (col);
# df1 is narrowed down to just those columns for the exploration below.
col1=['Hour','Weekday','Am-Pm']
col=col1+['fare_amount']
df1=df1[col]
df1.head()
print(col1)

# Categorical Variables Identified

In [None]:
# For every candidate categorical feature, tabulate and plot the mean fare of
# each of its values (values sorted ascending, means rounded to 2 decimals).
# groupby does this in one pass instead of re-filtering the whole frame once per value.
for i in col1:
    ls=(df1.groupby(i)['fare_amount']
           .mean()
           .round(2)
           .rename_axis(None)   # keep the index unnamed so the printed table is unchanged
           .to_frame(name=i+"_avg_fare_amount"))
    ls.plot()
    print(ls)

In [None]:
# Categorical features (fed to embeddings), continuous features, and the target.
cat_col=["Hour","Weekday","Am-Pm"]
con_col=['pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'distance','passenger_count' ]
# .copy() makes these frames independent of df, so the dtype conversion applied
# to df_cat afterwards does not trigger SettingWithCopyWarning.
df_cat=df[cat_col].copy()
df_con=df[con_col].copy()
y_col = ['fare_amount']

In [None]:
# Convert every categorical feature column to the pandas 'category' dtype.
# A single frame-level astype builds a new frame instead of assigning columns
# into a slice of df, which avoids SettingWithCopyWarning and is safe to re-run.
df_cat=df_cat.astype('category')

df_cat.info()

In [None]:
# Integer category codes of each categorical column, stacked column-wise
# into an int64 tensor with one row per trip.
code_arrays=[df_cat[name].cat.codes.values for name in df_cat.columns]
cat=torch.tensor(np.stack(code_arrays,axis=1),dtype=torch.int64)

In [None]:
# Continuous feature columns stacked column-wise and wrapped in a torch.Tensor.
value_arrays=[df_con[name].values for name in df_con.columns]
con=torch.Tensor(np.stack(value_arrays,axis=1))
print(con.dtype)

In [None]:
# Regression target as a column vector: one row per trip, a single column.
y=torch.Tensor(df[y_col].values).reshape(-1,1)
print(y.dtype)


In [None]:
# Embedding sizes for the categorical variables: one (number of distinct values,
# embedding width) pair per column, the width being half the cardinality rounded up.
cardinalities=[df_cat[name].nunique() for name in df_cat.columns]
embcat=[(n_values,(n_values+1)//2) for n_values in cardinalities]
print(embcat)

# NN Model with Flexible Layers

In [None]:
# Dropout is used to reduce overfitting
class Tabular1(nn.Module):
    """Feed-forward network for tabular data with categorical and continuous inputs.

    Each categorical column goes through its own embedding, the continuous
    columns go through batch normalisation, and the concatenation of both is
    fed to a stack of Linear -> ReLU -> BatchNorm1d -> Dropout blocks followed
    by a final Linear output layer.
    """
    def __init__(self,con_n,out_sz,layers,embcat,p=0.5):
        """Build the network.

        Parameters
        ----------
        con_n : int
            Number of continuous input features.
        out_sz : int
            Number of output units.
        layers : sequence of int
            Width of each hidden block, in order. May be empty, in which case
            the inputs are connected directly to the output layer.
        embcat : sequence of (int, int)
            One (number of categories, embedding width) pair per categorical column.
        p : float, default 0.5
            Dropout probability, used on the embeddings and after every hidden block.
        """
        super().__init__()
        self.drop=nn.Dropout(p)
        self.emb=nn.ModuleList([nn.Embedding(i,o) for (i,o) in embcat])
        self.cont=nn.BatchNorm1d(con_n)
        # Width of the concatenated input: all embedding widths plus the continuous features.
        totaln=sum(o for (_,o) in embcat)+con_n
        layerlist=[]
        for width in layers:
            layerlist.append(nn.Linear(totaln,width))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(width))
            layerlist.append(nn.Dropout(p))
            totaln=width
        # totaln is the width of the last hidden block, or of the raw input when
        # `layers` is empty; indexing layers[-1] here would raise IndexError in that case.
        layerlist.append(nn.Linear(totaln,out_sz))
        self.layers=nn.Sequential(*layerlist)
    def forward(self,xcat,xcon):
        """Run a forward pass.

        Parameters
        ----------
        xcat : integer tensor
            Category codes; column i is looked up in the i-th embedding.
        xcon : float tensor
            Continuous features, one column per feature.

        Returns
        -------
        Tensor with `out_sz` columns and one row per input row.
        """
        embedded=[e(xcat[:,i]) for i,e in enumerate(self.emb)]
        xcat=self.drop(torch.cat(embedded,1))
        xcon=self.cont(xcon)
        x=torch.cat([xcat,xcon],1)
        return self.layers(x)

In [None]:
# Seed torch's global RNG so weight initialisation, dropout masks and the
# DataLoader shuffling that follow are reproducible from a fresh kernel.
torch.manual_seed(42)
# One output unit, hidden blocks of 200 and 100 units, dropout probability 0.4.
model=Tabular1(con.shape[1],1,[200,100],embcat,0.4)

In [None]:
# Display the module structure of the network as the cell output.
model

In [None]:
# Number of elements in each parameter tensor, followed by the grand total.
param_sizes=[tensor.numel() for tensor in model.parameters()]
for size in param_sizes:
    print(size)
total=sum(param_sizes)
print(f'Total: {total}')

In [None]:
# Mean-squared-error criterion; the training loop wraps it in torch.sqrt,
# so the quantity actually optimised and reported is the RMSE.
criteria=nn.MSELoss()
# Adam over all model parameters with a learning rate of 0.005.
optimizer=torch.optim.Adam(model.parameters(), lr=0.005)

In [None]:
# Join categorical codes and continuous features into one matrix (so a single
# dataset can batch them together), then split positionally: first 80% train, rest test.
n_cat_cols=len(df_cat.columns)
cat_con=torch.cat([cat,con],1)
print(cat_con.shape)
train_size=int(cat_con.shape[0]*0.8)
cat_con_tr,cat_con_te=cat_con[:train_size],cat_con[train_size:]
# The leading n_cat_cols columns are the category codes (cast back to long);
# the remaining columns are the continuous features.
X_cat_tr=cat_con_tr[:,:n_cat_cols].type(torch.long)
X_con_tr=cat_con_tr[:,n_cat_cols:]
X_cat_te=cat_con_te[:,:n_cat_cols].type(torch.long)
X_con_te=cat_con_te[:,n_cat_cols:]
ytr,yte=y[:train_size],y[train_size:]
for summary in (cat_con_te.shape,X_cat_tr.dtype,ytr.shape,X_cat_tr.shape):
    print(summary)

# Creating Batches

In [None]:
# Pair the combined training feature matrix with its targets so both are batched together.
train=TensorDataset(cat_con_tr,ytr)
print(train[0])
# Mini-batches of 20000 rows, reshuffled on every pass over the loader.
rows_per_batch=20000
train_data=DataLoader(train,batch_size=rows_per_batch,shuffle=True)

# Training

In [None]:
# Train for a fixed number of epochs, recording the RMSE on the full training
# set and on the held-out set at the end of every epoch.
start=time.time()
epochs=50
n_cat=len(df_cat.columns)   # leading columns of each batch hold the category codes
losses1=[]      # per-epoch RMSE on the training set
val_losses=[]   # per-epoch RMSE on the held-out set
for i in range(1,epochs+1):
    # Training mode: dropout active, BatchNorm uses (and updates from) batch statistics.
    model.train()
    for X_train,y_train in train_data:
        X_train_cat=X_train[:,:n_cat].long()
        X_train_con=X_train[:,n_cat:]
        y_pred=model(X_train_cat,X_train_con)
        loss=torch.sqrt(criteria(y_pred,y_train))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Evaluation mode for the metrics: without it dropout randomly perturbs the
    # predictions and BatchNorm would update its running statistics from the
    # held-out rows. no_grad avoids building an autograd graph for the full datasets.
    model.eval()
    with torch.no_grad():
        train_loss=torch.sqrt(criteria(model(X_cat_tr,X_con_tr),ytr)).item()
        val_loss=torch.sqrt(criteria(model(X_cat_te,X_con_te),yte)).item()
    losses1.append(train_loss)
    val_losses.append(val_loss)
    if i%10==0:
        print(f'Epoch:{i} Loss:{train_loss:10.4f}')
print(f'Time in mins: {((time.time()-start)/60):10.4f}')

In [None]:
# Training and validation loss curves, one point per epoch.
fig,ax=plt.subplots()
ax.plot(losses1,label='training loss')
ax.plot(val_losses,label='validation loss')
ax.set_title('Loss at the end of each epoch')
ax.legend();

# Output of NN

In [None]:
# Final RMSE on the held-out set. The model must be in evaluation mode here:
# otherwise dropout stays active and BatchNorm normalises with the statistics of
# the evaluated batch, which makes the reported score noisy and non-deterministic.
model.eval()
with torch.no_grad():
    y_val = model(X_cat_te,X_con_te)
    loss = torch.sqrt(criteria(y_val,yte))
print(f'RMSE: {loss.item():.8f}')

Training for more epochs and with more layers can be explored. Feel free to modify the code to build better models.
Please leave a like if you learned something. It would motivate me a lot.

In [None]:
# Side-by-side listing of predicted vs actual fares for the first held-out rows.
# Slicing caps the listing at 100 rows without raising IndexError when the
# held-out set is smaller than that.
preview_pred=y_val[:100].flatten().tolist()
preview_true=yte[:100].flatten().tolist()
for pred,actual in zip(preview_pred,preview_true):
    print (f'Predicted: {pred:6.2f}   Actual: {actual:6.2f}   Difference: {abs(pred-actual):6.2f}')