In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

print("""
# #################################################################################################
#  Full ANN Code Along - Regression Part One(1)
# #################################################################################################
""")


# #################################################################################################
#  Full ANN Code Along - Regression Part One(1)
# #################################################################################################



In [4]:
df = pd.read_csv('../../../notebooks/Data/NYCTaxiFares.csv')
print(df.head())
print(df['fare_amount'].describe())

           pickup_datetime  fare_amount  fare_class  pickup_longitude  \
0  2010-04-19 08:17:56 UTC          6.5           0        -73.992365   
1  2010-04-17 15:43:53 UTC          6.9           0        -73.990078   
2  2010-04-17 11:23:26 UTC         10.1           1        -73.994149   
3  2010-04-11 21:25:03 UTC          8.9           0        -73.990485   
4  2010-04-17 02:19:01 UTC         19.7           1        -73.990976   

   pickup_latitude  dropoff_longitude  dropoff_latitude  passenger_count  
0        40.730521         -73.975499         40.744746                1  
1        40.740558         -73.974232         40.744114                1  
2        40.751118         -73.960064         40.766235                2  
3        40.756422         -73.971205         40.748192                1  
4        40.734202         -73.905956         40.743115                1  
count    120000.000000
mean         10.040326
std           7.500134
min           2.500000
25%           5.700

In [5]:
def haversine_distance(df, src_lat, src_long, tar_lat, tar_long):
    """
    Calculates the haversine distance between 2 sets of GPS coordinates in df
    """
    r = 6371  # average radius of Earth in kilometers

    phi1 = np.radians(df[src_lat])
    phi2 = np.radians(df[tar_lat])

    delta_phi = phi2 - phi1
    delta_lambda = np.radians(df[tar_long] - df[src_long])

    a = np.sin(delta_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    d = (r * c)  # in kilometers
    return d

In [6]:
# FEATURE ENGINEERING - Taking a existing features already have, create new more useful features
# than original features.

# Engineering distance between two points with co-ordinates.
df['dist_km'] = haversine_distance(df, 'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
                                   'dropoff_longitude')
print(df['dist_km'].head())

print(df.info())

0    2.126312
1    1.392307
2    3.326763
3    1.864129
4    7.231321
Name: dist_km, dtype: float64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   pickup_datetime    120000 non-null  object 
 1   fare_amount        120000 non-null  float64
 2   fare_class         120000 non-null  int64  
 3   pickup_longitude   120000 non-null  float64
 4   pickup_latitude    120000 non-null  float64
 5   dropoff_longitude  120000 non-null  float64
 6   dropoff_latitude   120000 non-null  float64
 7   passenger_count    120000 non-null  int64  
 8   dist_km            120000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 8.2+ MB
None


In [7]:
# Engineering datetime object(string) to be DateTime object.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   pickup_datetime    120000 non-null  datetime64[ns, UTC]
 1   fare_amount        120000 non-null  float64            
 2   fare_class         120000 non-null  int64              
 3   pickup_longitude   120000 non-null  float64            
 4   pickup_latitude    120000 non-null  float64            
 5   dropoff_longitude  120000 non-null  float64            
 6   dropoff_latitude   120000 non-null  float64            
 7   passenger_count    120000 non-null  int64              
 8   dist_km            120000 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(6), int64(2)
memory usage: 8.2 MB
None
            pickup_datetime  fare_amount  fare_class  pickup_longitude  \
0 2010-04-19 08:17:56+00:00          6.5           0        -73.99

In [9]:
my_time = df['pickup_datetime'][0]
print(my_time)
print(my_time.hour)

2010-04-19 08:17:56+00:00
8


In [39]:
df['EDTDate'] = df['pickup_datetime'].dt.tz_convert('US/Eastern')
df['Hour'] = df['EDTDate'].dt.hour
df['AMorPM'] = np.where(df['Hour'] >= 12, 'pm', 'am')
df['Weekday'] = df['EDTDate'].dt.strftime('%a')

print(df.head())

            pickup_datetime  fare_amount  fare_class  pickup_longitude  \
0 2010-04-19 08:17:56+00:00          6.5           0        -73.992365   
1 2010-04-17 15:43:53+00:00          6.9           0        -73.990078   
2 2010-04-17 11:23:26+00:00         10.1           1        -73.994149   
3 2010-04-11 21:25:03+00:00          8.9           0        -73.990485   
4 2010-04-17 02:19:01+00:00         19.7           1        -73.990976   

   pickup_latitude  dropoff_longitude  dropoff_latitude  passenger_count  \
0        40.730521         -73.975499         40.744746                1   
1        40.740558         -73.974232         40.744114                1   
2        40.751118         -73.960064         40.766235                2   
3        40.756422         -73.971205         40.748192                1   
4        40.734202         -73.905956         40.743115                1   

    dist_km                   EDTDate  Hour AMorPM Weekday  
0  2.126312 2010-04-19 04:17:56-04:00

In [40]:
print("""
# #################################################################################################
#  Full ANN Code Along - Regression Part Two(2) (Categorical and Continuous Features)
# #################################################################################################
""")


# #################################################################################################
#  Full ANN Code Along - Regression Part Two(2) (Categorical and Continuous Features)
# #################################################################################################



In [44]:
categorical_cols = ['Hour', 'AMorPM', 'Weekday']
continuous_cols = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
                   'dropoff_latitude', 'passenger_count', 'dist_km']
y_col = ['fare_amount']     # Hence regression problem.

In [52]:
# Change the categorical columns's data type('dtype') to 'category' type, so neural network can understand
# assigning a number.
for col in categorical_cols:
    df[col] = df[col].astype('category')

In [53]:
df['Hour'].head()

0     4
1    11
2     7
3    17
4    22
Name: Hour, dtype: category
Categories (24, int64): [0, 1, 2, 3, ..., 20, 21, 22, 23]

In [54]:
df['AMorPM'].head()

0    am
1    am
2    am
3    pm
4    pm
Name: AMorPM, dtype: category
Categories (2, object): [am, pm]

In [55]:
df['Weekday'].head()

0    Mon
1    Sat
2    Sat
3    Sun
4    Fri
Name: Weekday, dtype: category
Categories (7, object): [Fri, Mon, Sat, Sun, Thu, Tue, Wed]

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype                     
---  ------             --------------   -----                     
 0   pickup_datetime    120000 non-null  datetime64[ns, UTC]       
 1   fare_amount        120000 non-null  float64                   
 2   fare_class         120000 non-null  int64                     
 3   pickup_longitude   120000 non-null  float64                   
 4   pickup_latitude    120000 non-null  float64                   
 5   dropoff_longitude  120000 non-null  float64                   
 6   dropoff_latitude   120000 non-null  float64                   
 7   passenger_count    120000 non-null  int64                     
 8   dist_km            120000 non-null  float64                   
 9   EDTDate            120000 non-null  datetime64[ns, US/Eastern]
 10  Hour               120000 non-null  category                  
 11  

In [69]:
# We can access 'category' type object details using '.cat' as in '.dt' for 'DateTime' objects.
print(df['AMorPM'].cat.categories)
print(df['AMorPM'].cat.codes)

vals = df['AMorPM'].cat.codes.values
print(type(vals))
print(vals)

Index(['am', 'pm'], dtype='object')
0         0
1         0
2         0
3         1
4         1
         ..
119995    0
119996    0
119997    1
119998    0
119999    1
Length: 120000, dtype: int8
<class 'numpy.ndarray'>
[0 0 0 ... 1 0 1]


In [72]:
hr = df['Hour'].cat.codes.values
ampm = df['AMorPM'].cat.codes.values
wkd = df['Weekday'].cat.codes.values

cat = np.stack([hr, ampm, wkd], axis=1) # Join each array as a column thus, axis=1.
print(cat)

[[ 4  0  1]
 [11  0  2]
 [ 7  0  2]
 ...
 [14  1  3]
 [ 4  0  5]
 [12  1  2]]


In [75]:
# Use 'list' comprehension to derive values and stack them up to single numpy array.
cat = np.stack([df[col].cat.codes.values for col in categorical_cols], axis=1)
print(cat)

# OR skip change data type for loop and embed it into the list comprehension as well.

cat = np.stack([df[col].astype('category').cat.codes.values for col in categorical_cols], axis=1)
print(cat)

[[ 4  0  1]
 [11  0  2]
 [ 7  0  2]
 ...
 [14  1  3]
 [ 4  0  5]
 [12  1  2]]
[[ 4  0  1]
 [11  0  2]
 [ 7  0  2]
 ...
 [14  1  3]
 [ 4  0  5]
 [12  1  2]]


In [82]:
# Convert numpy array to 'tensor'.
cat = torch.tensor(cat, dtype=torch.int64)
print(cat)

tensor([[ 4,  0,  1],
        [11,  0,  2],
        [ 7,  0,  2],
        ...,
        [14,  1,  3],
        [ 4,  0,  5],
        [12,  1,  2]])


  cat = torch.tensor(cat, dtype=torch.int64)


In [84]:
# Continuous columns - Simply map them into a numpy since they are already numerical values and neurons basicaly
# understand them.

cont = np.stack([df[col].values for col in continuous_cols], axis=1)
print(cont)

# To a 'tensor'.
cont = torch.tensor(cont, dtype=torch.float)
print(cont)

[[-73.992365    40.730521   -73.975499    40.744746     1.
    2.12631159]
 [-73.990078    40.740558   -73.974232    40.744114     1.
    1.39230687]
 [-73.994149    40.751118   -73.960064    40.766235     2.
    3.32676344]
 ...
 [-73.988574    40.749772   -74.011541    40.707799     3.
    5.05252282]
 [-74.004449    40.724529   -73.992697    40.730765     1.
    1.20892296]
 [-73.955415    40.77192    -73.967623    40.763015     3.
    1.42739869]]
tensor([[-73.9924,  40.7305, -73.9755,  40.7447,   1.0000,   2.1263],
        [-73.9901,  40.7406, -73.9742,  40.7441,   1.0000,   1.3923],
        [-73.9941,  40.7511, -73.9601,  40.7662,   2.0000,   3.3268],
        ...,
        [-73.9886,  40.7498, -74.0115,  40.7078,   3.0000,   5.0525],
        [-74.0044,  40.7245, -73.9927,  40.7308,   1.0000,   1.2089],
        [-73.9554,  40.7719, -73.9676,  40.7630,   3.0000,   1.4274]])


In [88]:
# CREATE the labels using 'fare_amount' columns, hence need to predict 'fare_amount' base on categorical and continuous
# columns values after training.
y = torch.tensor(df['fare_amount'].values).reshape(-1, 1)
print(y)

tensor([[ 6.5000],
        [ 6.9000],
        [10.1000],
        ...,
        [12.5000],
        [ 4.9000],
        [ 5.3000]], dtype=torch.float64)


In [89]:
# After all the data prepared.
print(cat.shape)
print(cont.shape)
print(y.shape)

torch.Size([120000, 3])
torch.Size([120000, 6])
torch.Size([120000, 1])


In [91]:
# #################################################################################################
# SET EMBEDDING Sizes
# #################################################################################################
# 1. Step - Take category sizes.
cat_szs = [len(df[col].cat.categories) for col in categorical_cols]
print(cat_szs)

# 2. Step - Take embedded sizes.
embedded_szs = [(szs, min(50, (szs + 1) // 2)) for szs in cat_szs]
print(embedded_szs)

[24, 2, 7]
[(24, 12), (2, 1), (7, 4)]


In [92]:
print("""
# #################################################################################################
#  Full ANN Code Along - Regression Part Three(3) (Tabular Model)
# #################################################################################################
""")


# #################################################################################################
#  Full ANN Code Along - Regression Part Three(3) (Tabular Model)
# #################################################################################################

