In [14]:
# feature_engg.ipynb

import pandas as pd
import numpy as np

# -----------------------------
# Load cleaned dataset
# -----------------------------
# The two trip timestamps are parsed while reading, so the `.dt` accessor
# can be used on them in the feature cells that follow.
date_cols = ["tpep_pickup_datetime", "tpep_dropoff_datetime"]
df = pd.read_csv(
    "data/tripfare_cleaned.csv",
    parse_dates=date_cols,
    low_memory=False,  # parse the file in one pass instead of internal chunks (avoids mixed-type inference)
)

# Report the row/column count, then show the first rows as the cell output
print("Initial shape:", df.shape)
df.head()

Initial shape: (208024, 22)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,trip_duration_min,trip_distance,pickup_hour,am_pm
0,1,2016-03-01 00:00:00,2016-03-01 00:07:55,1,-73.976746,40.765152,1,N,-74.004265,40.746128,...,0.5,0.5,2.05,0.0,0.3,12.35,7.916667,3.138096,0,AM
1,1,2016-03-01 00:00:00,2016-03-01 00:11:06,1,-73.983482,40.767925,1,N,-74.005943,40.733166,...,0.5,0.5,3.05,0.0,0.3,15.35,11.1,4.303331,0,AM
2,2,2016-03-01 00:00:00,2016-03-01 00:31:06,2,-73.782021,40.64481,1,N,-73.974541,40.67577,...,0.5,0.5,8.0,0.0,0.3,63.8,31.1,16.600142,0,AM
3,1,2016-03-01 00:00:01,2016-03-01 00:16:04,1,-73.788773,40.647758,1,N,-73.829208,40.712345,...,0.5,0.5,0.0,0.0,0.3,21.8,16.05,7.950066,0,AM
4,1,2016-03-01 00:00:01,2016-03-01 00:05:00,1,-73.958221,40.764641,1,N,-73.967896,40.762901,...,0.5,0.5,2.0,0.0,0.3,8.8,4.983333,0.837395,0,AM


In [15]:
# -----------------------------
# Derived Features
# -----------------------------
# Adds calendar/time-of-day flags and fare ratios as new columns on `df`.
# All flags are stored as 0/1 integers. The hour-based flags use vectorized
# comparisons on the whole column instead of a per-row Python lambda.

# a. Pickup Day & Weekend
df['pickup_day'] = df['tpep_pickup_datetime'].dt.day_name()
df['is_weekend'] = df['pickup_day'].isin(['Saturday', 'Sunday']).astype(int)

# b. Night Trip Flag: pickup hour is 22, 23, 0, 1, 2, 3 or 4
#    (10:00 PM up to, but not including, 5:00 AM)
df['pickup_hour'] = df['tpep_pickup_datetime'].dt.hour
pickup_hr = df['pickup_hour']
df['is_night'] = ((pickup_hr >= 22) | (pickup_hr < 5)).astype(int)

# c. Rush Hour Flag: pickup hour in 7..10 or 16..19, both ends inclusive,
#    i.e. 7:00-10:59 AM and 4:00-7:59 PM. Unlike is_night, the closing hour
#    itself is part of the window.
#    NOTE(review): if the windows were meant to stop at 10:00 AM / 7:00 PM sharp,
#    the upper bounds should be 9 and 18 - confirm the intended definition.
df['is_rush_hour'] = (pickup_hr.between(7, 10) | pickup_hr.between(16, 19)).astype(int)

# d. Fare per km / per min
#    A denominator of exactly 0 is replaced by 1 so the division never yields inf;
#    for those rows the ratio is therefore equal to fare_amount itself.
#    NOTE(review): consider whether such rows should be NaN or dropped instead.
df['fare_per_km'] = df['fare_amount'] / df['trip_distance'].replace(0, 1)
df['fare_per_min'] = df['fare_amount'] / df['trip_duration_min'].replace(0, 1)

In [16]:
# -----------------------------
# Encode only categorical variables for linear models
# -----------------------------
# One-hot encode non-ordinal categorical columns used in linear models.
# drop_first=True drops the first level of each variable, so k categories
# become k-1 indicator columns.

categorical_vars = ['am_pm', 'RatecodeID', 'payment_type']

# get_dummies removes the source columns from the frame it returns, so running this
# cell a second time would otherwise raise KeyError. Only encode what is still present.
pending = [col for col in categorical_vars if col in df.columns]

# A column that is neither present nor already expanded into "<col>_*" indicators is a
# genuine problem (typo / missing from the CSV) and must still fail loudly.
unresolved = [
    col for col in categorical_vars
    if col not in pending
    and not any(str(name).startswith(f"{col}_") for name in df.columns)
]
if unresolved:
    raise KeyError(f"Categorical columns not found in df: {unresolved}")

if pending:
    df = pd.get_dummies(df, columns=pending, drop_first=True)

In [17]:
# Show the first rows of the newly engineered columns, then the frame's overall shape
engineered_cols = [
    'pickup_day', 'is_weekend', 'pickup_hour', 'is_night',
    'is_rush_hour', 'fare_per_km', 'fare_per_min',
]
print(df[engineered_cols].head())
print("Shape after feature engineering:", df.shape)


  pickup_day  is_weekend  pickup_hour  is_night  is_rush_hour  fare_per_km  \
0    Tuesday           0            0         1             0     2.867981   
1    Tuesday           0            0         1             0     2.556159   
2    Tuesday           0            0         1             0     3.283104   
3    Tuesday           0            0         1             0     2.578595   
4    Tuesday           0            0         1             0     6.567990   

   fare_per_min  
0      1.136842  
1      0.990991  
2      1.752412  
3      1.277259  
4      1.103679  
Shape after feature engineering: (208024, 34)


In [18]:
# -----------------------------
# Save feature-engineered dataset
# -----------------------------
# Write the frame to CSV without the row index, then confirm the destination.

df.to_csv(path_or_buf="data/tripfare_feature_engg.csv", index=False)
print("Feature-engineered dataset saved as 'data/tripfare_feature_engg.csv'")


Feature-engineered dataset saved as 'data/tripfare_feature_engg.csv'
