In [1]:
#Objective: predicting the trip duration accurately

Get relevant imports

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
class c:
    PURPLE = '\033[95m'
    BLUE = '\033[94m'
    CYAN = '\033[96m'
    GREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    END = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

Extract data we've been given and divide into reasonable chunks

In [4]:
chunk_size = 50000 
file_path = 'training_dataset/training_dataset.csv'

chunks = []

for chunk in pd.read_csv(file_path, chunksize=chunk_size):
    chunks.append(chunk)

data_frame = pd.concat(chunks, ignore_index=True)


Process the data. Remove whatever that's uncessary and check for missing values. Either remove the missing or replace them. 

But first, check what we're missing and working with.

In [3]:
# Check for missing values
missing_values = data_frame.isnull().sum()
print("Number of missing values are: ", missing_values) 

print("-------------------------------------------------")

# Show summary statistics
print(data_frame.info())

Number of missing values are:  ID                             0
vendorid                       0
tpep_pickup_datetime           0
tpep_dropoff_datetime          0
passenger_count          3155336
trip_distance                  0
ratecodeid               3155336
store_and_fwd_flag       3155336
pulocationid                   0
dolocationid                   0
payment_type                   0
fare_amount                    0
extra                          0
mta_tax                        0
tip_amount                     0
tolls_amount                   0
improvement_surcharge          0
total_amount                   0
congestion_surcharge     3155336
airport_fee              3155336
duration                       0
dtype: int64
-------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32701098 entries, 0 to 32701097
Data columns (total 21 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   ID                   

We're missing a lot of data from these columns: 

1. passenger_count          3155336

2. ratecodeid               3155336

3. store_and_fwd_flag       3155336

4. congestion_surcharge     3155336

5. airport_fee              3155336


The first four columns are useless for the objective at hand. We'll drop them. 

Whilst we're at it we will remove all other columns that are not useful.

In [None]:
# YOUR CODE HERE
# data_frame.drop(columns=["ratecodeid", "payment_type", "congestion_surcharge", "passenger_count", 
#                          "ID", "airport_fee", "vendorid", "extra", "mta_tax", "tip_amount", "tolls_amount", 
#                          "improvement_surcharge", "fare_amount"], inplace=True)
data_frame.fillna(0, inplace=True)


Missing values have been addressed as well as useless columns. Now, we'll process the data that is relevant. 

In [9]:
print(data_frame.describe())

       trip_distance  pulocationid  dolocationid  total_amount      duration
count   3.270110e+07  3.270110e+07  3.270110e+07  3.270110e+07  3.270110e+07
mean    4.908534e+00  1.642556e+02  1.634615e+02  2.785063e+01  1.045067e+03
std     4.088654e+02  6.433638e+01  6.960137e+01  8.670093e+01  2.081190e+03
min     0.000000e+00  1.000000e+00  1.000000e+00 -2.265450e+03  0.000000e+00
25%     1.020000e+00  1.320000e+02  1.130000e+02  1.575000e+01  4.690000e+02
50%     1.760000e+00  1.610000e+02  1.620000e+02  2.100000e+01  7.770000e+02
75%     3.360000e+00  2.330000e+02  2.340000e+02  3.050000e+01  1.261000e+03
max     3.986086e+05  2.650000e+02  2.650000e+02  3.355509e+05  5.854240e+05


Check for weird values \
Process data \
Train-test-split \
Choose relevant models
- K-neighbour
- Random forest
- SVC
- ... 

Cross validation \
Draw on graph

In [9]:
# TODO Check for weird values and eliminate
column_names = ['trip_distance', 'pulocationid', 'dolocationid', 'total_amount', 'duration']

for column in column_names:
    print(f"{c.BOLD}{column}:{c.END}")
    print(f"{data_frame[column].value_counts()}")
    print()

[1mtrip_distance:[0m
trip_distance
0.00        619128
0.90        414888
1.00        412615
0.80        405658
1.10        401780
             ...  
57.69            1
87.99            1
85.83            1
4003.25          1
96873.59         1
Name: count, Length: 8239, dtype: int64

[1mpulocationid:[0m
pulocationid
132    1583292
161    1520484
237    1517936
236    1373254
162    1128587
        ...   
199         10
5            6
105          4
99           2
110          2
Name: count, Length: 263, dtype: int64

[1mdolocationid:[0m
dolocationid
236    1423669
237    1369419
161    1223021
230    1039033
170     967010
        ...   
44         143
2           66
99          10
105          5
110          2
Name: count, Length: 262, dtype: int64

[1mtotal_amount:[0m
total_amount
 16.80     424441
 12.60     389769
 21.00     366010
 15.12     240108
 15.96     238903
            ...  
 442.63         1
 273.06         1
-125.48         1
 272.22         1
-79.92          1


Here we can see that all headers are of type int64, which is good because we don't have to convert the values from numerical to categorical. \
It is visible that some trips are extremely long, such as the 96873 (assmued miles) which is unrealistic for a taxi trip in NYC. We can eliminate this, together with any other unrealistic values. \
We can identify these values using some simple statistical analysis.
[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]


In [19]:
distance_Q1 = data_frame['trip_distance'].quantile(0.25)
distance_Q3 = data_frame['trip_distance'].quantile(0.75)
distance_IQR = distance_Q3 - distance_Q1

distance_lower_bound = distance_Q1 - 1.5 * distance_IQR
distance_upper_bound = distance_Q3 + 1.5 * distance_IQR

distance_outliers = data_frame[(data_frame['trip_distance'] < distance_lower_bound) | (data_frame['trip_distance'] > distance_upper_bound)]
print('number of distance outliers:', len(distance_outliers))

df_clean = data_frame[(data_frame['trip_distance'] >= distance_lower_bound) & (data_frame['trip_distance'] <= distance_upper_bound)]

number of distance outliers: 4184315


Now outliers for distance have been removed. Next we should look for outliers in duration. This is done in the same way to be consistent in the calculation.

In [20]:
duration_Q1 = df_clean['duration'].quantile(0.25)
duration_Q3 = df_clean['duration'].quantile(0.75)
duration_IQR = duration_Q3 - duration_Q1

duration_lower_bound = duration_Q1 - 1.5 * duration_IQR
duration_upper_bound = duration_Q3 + 1.5 * duration_IQR

duration_outliers = df_clean[(df_clean['duration'] < duration_lower_bound) | (df_clean['duration'] > duration_upper_bound)]
print('number of duration outliers:', len(duration_outliers))

df_clean = df_clean[(df_clean['duration'] >= duration_lower_bound) & (df_clean['duration'] <= duration_upper_bound)]

number of duration outliers: 826846


Now that distance and time outliers have been dealt with, next step is to train the models. 

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

y = df_clean['duration']
X = df_clean[['trip_distance']]

# Feature engineering to account for pickup time
df_clean['pickup_hour'] = pd.to_datetime(df_clean['tpep_pickup_datetime']).dt.hour

# TODO: 
# SettingWithCopyWarning: 
# A value is trying to be set on a copy of a slice from a DataFrame.
# Try using .loc[row_indexer,col_indexer] = value instead
# See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X['pickup_hour'] = df_clean['pickup_hour']

# Since we want to use pickup and dropoff locations to account for higher-traffic areas we need to engineer that too. 
# For this we will use one-hot encoding of the location IDs which will help the model learn the effect of each pickup and dropoff zone.
df_clean = pd.get_dummies(df_clean, columns=['pulocationid', 'dolocationid'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

classifiers = {
    'SVC': SVC(random_state=42),
    'SVC-linear': LinearSVC(random_state=42, max_iter=2000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'k-NN': KNeighborsClassifier(n_neighbors=5),
    'Random Forest': RandomForestClassifier(random_state=42)
}

cv_results = {}
for name, model in classifiers.items():
    scores = cross_val_score(model, X, y, cv=10)
    cv_results[name] = np.mean(scores)

print('Model performance:')
for model, accuracy in cv_results.items():
    print(f"{model}: {accuracy: .4f}")

best_model = max(cv_results, key=cv_results.get)
print(f"\nBest model: {best_model} with accuracy {cv_results[best_model]:.4f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['pickup_hour'] = df_clean['pickup_hour']
