In [1]:
import torch
import torch.nn as nn

import pandas as pd
import numpy as np

In [2]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
df = pd.read_csv("/content/drive/MyDrive/Jose Potilla | Pytorch/Data/NYCTaxiFares.csv")

In [4]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1


## FEATURE ENGINEERING

In [5]:
def haversine_distance(df, lat1, long1, lat2, long2):
    """
    Calculates the haversine distance between 2 sets of GPS coordinates in df
    """
    r = 6371  # average radius of Earth in kilometers

    phi1 = np.radians(df[lat1])
    phi2 = np.radians(df[lat2])

    delta_phi = np.radians(df[lat2]-df[lat1])
    delta_lambda = np.radians(df[long2]-df[long1])

    a = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = (r * c) # in kilometers

    return d

In [6]:
df['dist_km'] = haversine_distance(df, 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude','dropoff_latitude')
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,1.925522
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.76537
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.818373
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,2.158661
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,9.457764


In [7]:
df.columns

Index(['pickup_datetime', 'fare_amount', 'fare_class', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'dist_km'],
      dtype='object')

In [8]:
df['pickup_datetime'].describe()

count                      120000
unique                     120000
top       2010-04-19 08:17:56 UTC
freq                            1
Name: pickup_datetime, dtype: object

Task : We need to Convert this 'pickup_datetime' from Object to Datetime



In [9]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'] )

In [10]:
df['pickup_datetime'].describe()

  df['pickup_datetime'].describe()


count                        120000
unique                       120000
top       2010-04-19 08:17:56+00:00
freq                              1
first     2010-04-11 04:00:10+00:00
last      2010-04-25 03:59:42+00:00
Name: pickup_datetime, dtype: object

In [11]:
# Sample example

my_time = df['pickup_datetime'][0]

In [12]:
my_time

Timestamp('2010-04-19 08:17:56+0000', tz='UTC')

In [13]:
my_time.day_name()

'Monday'

In [14]:
my_time.dayofweek

0

In [15]:
# As Given Date time is in UST, we need to convert it to EST - as the data is from NewYork

df['EDTdate'] = df['pickup_datetime'] - pd.Timedelta(hours = 4)

In [16]:
# Check the Hour

df['Hour'] = df['EDTdate'].dt.hour

In [17]:
# Check AM and PM

df['AMorPM'] = np.where(df['Hour']>12, 'pm', 'am')

In [18]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km,EDTdate,Hour,AMorPM
0,2010-04-19 08:17:56+00:00,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,1.925522,2010-04-19 04:17:56+00:00,4,am
1,2010-04-17 15:43:53+00:00,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.76537,2010-04-17 11:43:53+00:00,11,am
2,2010-04-17 11:23:26+00:00,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.818373,2010-04-17 07:23:26+00:00,7,am
3,2010-04-11 21:25:03+00:00,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,2.158661,2010-04-11 17:25:03+00:00,17,pm
4,2010-04-17 02:19:01+00:00,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,9.457764,2010-04-16 22:19:01+00:00,22,pm


In [19]:
df['Weekday'] = df['EDTdate'].dt.day_name()

In [20]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km,EDTdate,Hour,AMorPM,Weekday
0,2010-04-19 08:17:56+00:00,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,1.925522,2010-04-19 04:17:56+00:00,4,am,Monday
1,2010-04-17 15:43:53+00:00,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.76537,2010-04-17 11:43:53+00:00,11,am,Saturday
2,2010-04-17 11:23:26+00:00,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.818373,2010-04-17 07:23:26+00:00,7,am,Saturday
3,2010-04-11 21:25:03+00:00,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,2.158661,2010-04-11 17:25:03+00:00,17,pm,Sunday
4,2010-04-17 02:19:01+00:00,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,9.457764,2010-04-16 22:19:01+00:00,22,pm,Friday


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   pickup_datetime    120000 non-null  datetime64[ns, UTC]
 1   fare_amount        120000 non-null  float64            
 2   fare_class         120000 non-null  int64              
 3   pickup_longitude   120000 non-null  float64            
 4   pickup_latitude    120000 non-null  float64            
 5   dropoff_longitude  120000 non-null  float64            
 6   dropoff_latitude   120000 non-null  float64            
 7   passenger_count    120000 non-null  int64              
 8   dist_km            120000 non-null  float64            
 9   EDTdate            120000 non-null  datetime64[ns, UTC]
 10  Hour               120000 non-null  int64              
 11  AMorPM             120000 non-null  object             
 12  Weekday            120000 non-

In [22]:


len(df['Weekday'].unique())

7

In [23]:
columns = df.columns

In [24]:
for i in columns:
  print(f" Lenght of column {i} : {len(df[i].unique())}")

 Lenght of column pickup_datetime : 120000
 Lenght of column fare_amount : 414
 Lenght of column fare_class : 2
 Lenght of column pickup_longitude : 55583
 Lenght of column pickup_latitude : 63302
 Lenght of column dropoff_longitude : 59064
 Lenght of column dropoff_latitude : 67362
 Lenght of column passenger_count : 5
 Lenght of column dist_km : 119999
 Lenght of column EDTdate : 120000
 Lenght of column Hour : 24
 Lenght of column AMorPM : 2
 Lenght of column Weekday : 7


In [25]:
df.columns

Index(['pickup_datetime', 'fare_amount', 'fare_class', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'dist_km', 'EDTdate', 'Hour', 'AMorPM', 'Weekday'],
      dtype='object')

In [26]:
cat_cols = ['Hour', 'AMorPM', 'Weekday']
con_cols = ['pickup_longitude','pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'dist_km', 'EDTdate']

In [27]:
y_col = ['fare_amount']

In [28]:
df[['Hour', 'AMorPM', 'Weekday']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Hour     120000 non-null  int64 
 1   AMorPM   120000 non-null  object
 2   Weekday  120000 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.7+ MB


In [29]:
# Task ['Hour', 'AMorPM', 'Weekday'] change the Dtype to Category

for col in cat_cols:
  df[col] = df[col].astype('category')

In [30]:
df[['Hour', 'AMorPM', 'Weekday']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype   
---  ------   --------------   -----   
 0   Hour     120000 non-null  category
 1   AMorPM   120000 non-null  category
 2   Weekday  120000 non-null  category
dtypes: category(3)
memory usage: 352.9 KB


In [31]:
df['Hour'].head()

0     4
1    11
2     7
3    17
4    22
Name: Hour, dtype: category
Categories (24, int64): [0, 1, 2, 3, ..., 20, 21, 22, 23]

In [32]:
df['AMorPM'].head()

0    am
1    am
2    am
3    pm
4    pm
Name: AMorPM, dtype: category
Categories (2, object): ['am', 'pm']

In [33]:
df['Weekday'].head()

0      Monday
1    Saturday
2    Saturday
3      Sunday
4      Friday
Name: Weekday, dtype: category
Categories (7, object): ['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday',
                         'Wednesday']

In [34]:
df['AMorPM'].cat.categories

Index(['am', 'pm'], dtype='object')

In [35]:
df['AMorPM'] = df['AMorPM'].cat.codes

In [36]:
df['Weekday'] = df['Weekday'].cat.codes

In [37]:
df['Hour'] = df['Hour'].cat.codes

In [38]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km,EDTdate,Hour,AMorPM,Weekday
0,2010-04-19 08:17:56+00:00,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,1.925522,2010-04-19 04:17:56+00:00,4,0,1
1,2010-04-17 15:43:53+00:00,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.76537,2010-04-17 11:43:53+00:00,11,0,2
2,2010-04-17 11:23:26+00:00,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.818373,2010-04-17 07:23:26+00:00,7,0,2
3,2010-04-11 21:25:03+00:00,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,2.158661,2010-04-11 17:25:03+00:00,17,1,3
4,2010-04-17 02:19:01+00:00,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,9.457764,2010-04-16 22:19:01+00:00,22,1,0


In [39]:
# Find out the Dimension of all Categorical Features - at a time

categorical_dimension = [len(df[i].unique()) for i in df[cat_cols]]
categorical_dimension

[24, 2, 7]

In [40]:
# Find out the Embedding Dimension

embedding_dim = [(x, int(min(50, (x+1)/2))) for x in categorical_dimension]
embedding_dim

[(24, 12), (2, 1), (7, 4)]

In [41]:
embedded_representation = nn.ModuleList([nn.Embedding(inp, out) for inp,out in embedding_dim])
embedded_representation

ModuleList(
  (0): Embedding(24, 12)
  (1): Embedding(2, 1)
  (2): Embedding(7, 4)
)

In [42]:
cat_values = np.stack([df[i].values for i in cat_cols], axis = 1)
cat_values = torch.tensor(cat_values, dtype = torch.int64)
cat_values


tensor([[ 4,  0,  1],
        [11,  0,  2],
        [ 7,  0,  2],
        ...,
        [14,  1,  3],
        [ 4,  0,  5],
        [12,  0,  2]])

In [43]:
cat_values.dtype

torch.int64

In [53]:
con_cols = ['pickup_longitude','pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'dist_km','EDTdate']

In [54]:
df[['pickup_longitude','pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'dist_km', 'EDTdate']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   pickup_longitude   120000 non-null  float64            
 1   pickup_latitude    120000 non-null  float64            
 2   dropoff_longitude  120000 non-null  float64            
 3   dropoff_latitude   120000 non-null  float64            
 4   passenger_count    120000 non-null  int64              
 5   dist_km            120000 non-null  float64            
 6   EDTdate            120000 non-null  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), float64(5), int64(1)
memory usage: 6.4 MB


In [62]:
# Convert continuous variables to a tensor
con_values = np.stack([df[i].values for i in con_cols], axis = 1)
con_values = torch.tensor(con_values, dtype = torch.float64)
con_values

TypeError: ignored

In [56]:

pd.set_option('display.max_rows', 500)
embedding_val = []

for i, e in enumerate(embedded_representation):
  embedding_val.append(e(cat_values[:, i]))

In [57]:
z = torch.cat(embedding_val, axis = 1)
z

tensor([[ 1.1732, -0.0157,  0.1775,  ...,  0.0475,  0.5322, -0.4747],
        [-1.6641, -0.9713, -0.1145,  ...,  0.5570, -0.7649, -0.0707],
        [-0.5919, -0.3928, -0.4082,  ...,  0.5570, -0.7649, -0.0707],
        ...,
        [ 0.3242, -1.0560, -0.5091,  ...,  1.8765, -0.6201, -1.6900],
        [ 1.1732, -0.0157,  0.1775,  ..., -0.1266, -0.3033,  1.4203],
        [-1.6318, -0.1479,  0.3576,  ...,  0.5570, -0.7649, -0.0707]],
       grad_fn=<CatBackward0>)

In [58]:
dropout = nn.Dropout(.4)

In [59]:
final_embedded = dropout(z)
final_embedded

tensor([[ 1.9554, -0.0000,  0.2958,  ...,  0.0000,  0.8871, -0.7912],
        [-0.0000, -1.6188, -0.0000,  ...,  0.0000, -0.0000, -0.1179],
        [-0.0000, -0.6546, -0.6804,  ...,  0.9283, -1.2749, -0.1179],
        ...,
        [ 0.5403, -0.0000, -0.0000,  ...,  0.0000, -1.0335, -0.0000],
        [ 1.9554, -0.0261,  0.0000,  ..., -0.0000, -0.0000,  0.0000],
        [-0.0000, -0.2465,  0.5960,  ...,  0.9283, -1.2749, -0.1179]],
       grad_fn=<MulBackward0>)

In [60]:
class TabularModel(nn.Module):

    def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        layerlist = []
        n_emb = sum((nf for ni,nf in emb_szs))
        n_in = n_emb + n_cont

        for i in layers:
            layerlist.append(nn.Linear(n_in,i))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(p))
            n_in = i
        layerlist.append(nn.Linear(layers[-1],out_sz))

        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cat, x_cont):
        embeddings = []
        for i,e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:,i]))
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)

        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        x = self.layers(x)
        return x

In [61]:
torch.manual_seed(4)
model = TabularModel(embedding_dim, con_values.shape, 1, [200, 100], p=0.4)

TypeError: ignored