In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import torch
import torch.nn as nn

In [None]:
# Only the first 20,000 rows are kept by the slicing cell below, so read just
# those rows instead of parsing the whole file into memory first.
df_train = pd.read_csv('../input/new-york-city-taxi-fare-prediction/train.csv', nrows=20000)

In [None]:
df_train.head()

In [None]:
# Keep the first 20,000 rows as an independent copy: later cells add columns to
# this frame, and assigning into a plain slice can trigger pandas'
# SettingWithCopyWarning.
df_train = df_train.iloc[:20000].copy()

In [None]:
df_train.shape

In [None]:
def haversine_distance(df, lat1, long1, lat2, long2, r=6371):
    """Row-wise great-circle (haversine) distance between two coordinate pairs.

    Parameters
    ----------
    df : object indexable by column name (e.g. a pandas DataFrame)
        Holds the coordinates, expressed in degrees.
    lat1, long1 : column names of the first point's latitude and longitude.
    lat2, long2 : column names of the second point's latitude and longitude.
    r : sphere radius; the result is expressed in the same unit.
        Defaults to 6371, the value originally hard-coded here.

    Returns
    -------
    The element-wise distances, one per row of ``df``.
    """
    phi1 = np.radians(df[lat1])
    phi2 = np.radians(df[lat2])

    delta_phi = np.radians(df[lat2] - df[lat1])
    delta_lambda = np.radians(df[long2] - df[long1])

    a = np.sin(delta_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    # Mathematically `a` lies in [0, 1], but floating-point rounding can push it
    # marginally outside that range, which would turn the square roots below
    # into NaN. Clamp it first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return r * c

In [None]:
# Haversine distance between each trip's pickup and dropoff coordinates.
df_train['distance_kms'] = haversine_distance(
    df_train,
    lat1='pickup_latitude',
    long1='pickup_longitude',
    lat2='dropoff_latitude',
    long2='dropoff_longitude',
)

In [None]:
df_train.head()

In [None]:
df_train.info()

In [None]:
# Parse the pickup timestamps so the .dt accessor can be used on them below.
df_train = df_train.assign(pickup_datetime=pd.to_datetime(df_train['pickup_datetime']))

In [None]:
# Shift the pickup timestamps back by a fixed 4 hours to approximate New York local time.
# NOTE(review): a constant -4h offset only matches daylight-saving time; if the
# timestamps are UTC, winter rows would need -5h (e.g. via
# .dt.tz_convert('America/New_York')). Any change here must be mirrored in the
# df_test preprocessing so train and test features stay consistent.
df_train['NY_time'] = df_train['pickup_datetime'] - pd.Timedelta(hours=4)

In [None]:
# Hour of day of the shifted pickup time.
df_train = df_train.assign(Hour=df_train['NY_time'].dt.hour)

In [None]:
# Label hours 0-12 as 'am' and 13-23 as 'pm'.
# NOTE(review): hour 12 (noon) is labelled 'am' by this rule; if that is changed,
# change the identical rule in the df_test preprocessing as well.
df_train['AMPM'] = np.where(df_train['Hour'] <= 12, 'am', 'pm')

In [None]:
# Day of the week of the shifted pickup time, as pandas' integer dayofweek code.
df_train = df_train.assign(Dayofweek=df_train['NY_time'].dt.dayofweek)

In [None]:
# Calendar month of the shifted pickup time.
df_train = df_train.assign(Month=df_train['NY_time'].dt.month)

In [None]:
df_train.head()

In [None]:
# Columns that are integer-encoded and passed through embedding layers.
cat_cols = [
    'Hour',
    'AMPM',
    'Dayofweek',
    'Month',
]
# Columns passed to the network as continuous inputs.
cont_cols = [
    'pickup_latitude',
    'pickup_longitude',
    'dropoff_latitude',
    'dropoff_longitude',
    'passenger_count',
    'distance_kms',
]

In [None]:
df_train.dtypes

In [None]:
# Cast every categorical feature to pandas' category dtype in a single call;
# the integer codes are read from these columns further down.
df_train = df_train.astype({cat: 'category' for cat in cat_cols})

In [None]:
# One row per trip, one column per categorical feature (integer category codes).
cats = np.column_stack([df_train[name].cat.codes.values for name in cat_cols])

In [None]:
# One row per trip, one column per continuous feature.
conts = np.column_stack([df_train[name].values for name in cont_cols])

In [None]:
# Rebind both names from NumPy arrays to tensors: int64 category codes (used to
# index the nn.Embedding layers in TabularModel.forward) and float continuous
# features.
cats = torch.tensor(cats, dtype=torch.int64)
conts = torch.tensor(conts, dtype=torch.float)

In [None]:
conts

In [None]:
# Regression target as a float column vector of shape (n_rows, 1), matching the
# model's (n_rows, out_sz) output when out_sz is 1.
y = torch.tensor(df_train['fare_amount'].values, dtype=torch.float).unsqueeze(1)

In [None]:
# Number of distinct levels of each categorical feature, in cat_cols order.
cat_size = [len(df_train[name].dtype.categories) for name in cat_cols]

In [None]:
cat_size

In [None]:
# (number of levels, embedding width) per categorical feature; the width is half
# the number of levels, rounded up, and capped at 50.
embedding_size = [(n_levels, min(50, (n_levels + 1) // 2)) for n_levels in cat_size]

In [None]:
embedding_size

In [None]:
class TabularModel(nn.Module):
    """Feed-forward network over embedded categorical and continuous features.

    Each categorical feature gets its own embedding. The concatenated embeddings
    (after dropout) are joined with the batch-normalised continuous features and
    passed through a stack of Linear -> ReLU -> BatchNorm1d -> Dropout blocks,
    followed by a final Linear layer.
    """

    def __init__(self, embedding_size, n_cont, out_sz, layers, p=0.5):
        """Build the network.

        Parameters
        ----------
        embedding_size : iterable of (num_embeddings, embedding_dim) pairs,
            one per categorical feature.
        n_cont : number of continuous input features.
        out_sz : number of output units.
        layers : sequence of hidden-layer widths; may be empty, in which case
            the inputs feed the output layer directly.
        p : dropout probability for the embedding dropout and every hidden block.
        """
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embedding_size])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        n_emb = sum(nf for _, nf in embedding_size)
        n_in = n_emb + n_cont

        layerlist = []
        for width in layers:
            layerlist.append(nn.Linear(n_in, width))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(width))
            layerlist.append(nn.Dropout(p))
            n_in = width
        # n_in now holds the width feeding the output layer. Unlike layers[-1],
        # it is also valid when `layers` is empty.
        layerlist.append(nn.Linear(n_in, out_sz))

        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cat, x_cont):
        """Return the network output for a batch.

        x_cat is indexed as x_cat[:, i] to feed the i-th embedding, and x_cont is
        passed through BatchNorm1d(n_cont).
        """
        embeddings = [embed(x_cat[:, i]) for i, embed in enumerate(self.embeds)]
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)

        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        return self.layers(x)

In [None]:
# Seed first so the randomly initialised weights are the same on every run.
torch.manual_seed(33)
model = TabularModel(
    embedding_size,
    n_cont=conts.shape[1],
    out_sz=1,
    layers=[200, 100, 50, 20, 10],
)

In [None]:
model

In [None]:
# Mean squared error; the training and evaluation cells take its square root
# to report RMSE.
criterion = nn.MSELoss(reduction='mean')
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Hold out the last 20% of the rows, in their existing order, for evaluation.
batch_size = len(df_train)
test_size = int(batch_size * .2)

cat_train, cat_test = cats[:batch_size - test_size], cats[batch_size - test_size:batch_size]
con_train, con_test = conts[:batch_size - test_size], conts[batch_size - test_size:batch_size]
y_train, y_test = y[:batch_size - test_size], y[batch_size - test_size:batch_size]

In [None]:
import time
start_time = time.time()

epochs = 300
losses = []  # per-epoch training RMSE, stored as plain Python floats

# Make sure dropout and batch-norm are in training mode, even if this cell is
# re-run after the model was switched to eval mode.
model.train()

for i in range(1, epochs + 1):
    y_pred = model(cat_train, con_train)
    loss = torch.sqrt(criterion(y_pred, y_train)) # RMSE
    # Store the scalar value rather than the tensor: appending the tensor keeps
    # a reference to its autograd graph for every epoch.
    losses.append(loss.item())

    # Report epochs 1, 26, 51, ... only, to save screen space.
    if i%25 == 1:
        print(f'epoch: {i:3}  loss: {loss.item():10.8f}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'epoch: {i:3}  loss: {loss.item():10.8f}') # print the last line
print(f'\nDuration: {time.time() - start_time:.0f} seconds') # print the time elapsed

In [None]:
# TO EVALUATE THE ENTIRE TEST SET
# Switch to eval mode first: in training mode the Dropout layers randomly zero
# activations and BatchNorm normalises with (and updates its running statistics
# from) the evaluation batch, so the reported RMSE would be distorted.
model.eval()
with torch.no_grad():
    y_val = model(cat_test, con_test)
    loss = torch.sqrt(criterion(y_val, y_test))
print(f'RMSE: {loss:.8f}')

In [None]:
# Side-by-side table of the first 50 held-out predictions and their true fares.
print(f'{"PREDICTED":>13} {"ACTUAL":>7} {"DIFF":>7}')
for idx in range(50):
    predicted = y_val[idx].item()
    actual = y_test[idx].item()
    diff = np.abs(predicted - actual)
    print(f'{idx+1:2}. {predicted:8.4f} {actual:8.4f} {diff:8.4f}')

In [None]:
# Persist the weights only when the loss history shows a complete training run
# (one recorded loss per epoch); otherwise warn instead of saving.
if len(losses) != epochs:
    print('Model has not been trained. Consider loading a trained model instead.')
else:
    torch.save(model.state_dict(), 'TaxiFareRegrModel.pt')

In [None]:
# Load the competition's test file. The original cell re-read train.csv here, so
# predictions were generated for training rows and overwrote their fare_amount.
df_test = pd.read_csv('../input/new-york-city-taxi-fare-prediction/test.csv')

In [None]:
df_test.head()

In [None]:
# Same haversine distance feature as computed for df_train.
df_test['distance_kms'] = haversine_distance(
    df_test,
    lat1='pickup_latitude',
    long1='pickup_longitude',
    lat2='dropoff_latitude',
    long2='dropoff_longitude',
)

In [None]:
# Parse the pickup timestamps so the .dt accessor can be used on them below
# (same step as applied to df_train).
df_test['pickup_datetime'] = pd.to_datetime(df_test['pickup_datetime'])

In [None]:
# Shift the pickup timestamps back by a fixed 4 hours to approximate New York local time.
# NOTE(review): a constant -4h offset only matches daylight-saving time; if the
# timestamps are UTC, winter rows would need -5h. Keep this identical to the
# df_train preprocessing so both frames produce the same features.
df_test['NY_time'] = df_test['pickup_datetime'] - pd.Timedelta(hours=4)

In [None]:
# Hour of day of the shifted pickup time (same feature as on df_train).
df_test['Hour']= df_test['NY_time'].dt.hour

In [None]:
# Label hours 0-12 as 'am' and 13-23 as 'pm', exactly as done for df_train.
# NOTE(review): hour 12 (noon) is labelled 'am' by this rule; if that is changed,
# change the identical rule in the df_train preprocessing as well.
df_test['AMPM'] = np.where(df_test['Hour'] <= 12, 'am', 'pm')

In [None]:
# Day of the week of the shifted pickup time, as pandas' integer dayofweek code.
df_test['Dayofweek'] = df_test['NY_time'].dt.dayofweek

In [None]:
# Calendar month of the shifted pickup time.
df_test['Month'] = df_test['NY_time'].dt.month

In [None]:
df_test.head()

In [None]:
df_test.shape

In [None]:
# Reuse the categorical dtypes already established on df_train so every value
# maps to the same integer code the embeddings were trained on. Inferring the
# categories from df_test alone shifts the codes whenever a level is missing
# from this frame. Values never seen in df_train become NaN (code -1).
for cat in cat_cols:
    df_test[cat] = df_test[cat].astype(df_train[cat].dtype)

In [None]:
# Integer category codes for df_test, one column per categorical feature.
# NOTE(review): this rebinds `cats`, which previously held the training tensor.
cats = np.column_stack([df_test[name].cat.codes.values for name in cat_cols])

In [None]:
# Continuous features for df_test, one column per entry of cont_cols.
# NOTE(review): this rebinds `conts`, which previously held the training tensor.
conts = np.column_stack([df_test[name].values for name in cont_cols])

In [None]:
# Convert the stacked df_test feature arrays to tensors with the same dtypes as
# the training tensors (int64 category codes, float continuous values).
cats_test = torch.tensor(cats, dtype=torch.int64)
conts_test = torch.tensor(conts, dtype=torch.float)

In [None]:
# Inference only: put Dropout/BatchNorm into eval mode so predictions are
# deterministic and use the learned running statistics, and skip autograd
# bookkeeping, which is not needed here.
model.eval()
with torch.no_grad():
    y_pred = model(cats_test, conts_test)

In [None]:
y_pred

In [None]:
# Store the predictions as the fare column; the (n_rows, 1) model output is
# flattened to one value per row before it is handed to pandas.
df_test['fare_amount'] = y_pred.detach().squeeze(1).numpy()

In [None]:
df_test.head()

In [None]:
# Keep only the row identifier and the predicted fare.
df_submission = df_test.loc[:, ['key', 'fare_amount']]

In [None]:
df_submission.head()