In [None]:
import pandas as pd

# Load the cleaned datasets (paths are relative to this notebook's directory)
df_last12 = pd.read_csv('../../data/processed/cleaned_last12.csv')
df_longterm = pd.read_csv('../../data/processed/cleaned_longterm.csv')

# List each frame's columns so the loaded schema is visible right after loading.
# The recorded output of this cell shows these two lines, so the statements that
# produce them are kept here to make the cell reproducible on a fresh kernel.
print("last12 columns:", df_last12.columns.tolist())
print("longterm columns:", df_longterm.columns.tolist())

last12 columns: ['ride_id', 'rideable_type', 'started_at', 'ended_at', 'start_station_name', 'start_station_id', 'end_station_name', 'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng', 'member_casual']
longterm columns: ['ride_id', 'rideable_type', 'started_at', 'ended_at', 'start_station_name', 'start_station_id', 'end_station_name', 'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng', 'member_casual']


In [2]:
# Peek at the top five rows of the last-12-months frame as loaded
df_last12.head(n=5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,E92D44EC0F271F6D,electric_bike,2024-05-01 00:01:58,2024-05-01 00:05:45,Dearborn Pkwy & Delaware Pl,TA1307000128,Clark St & Schiller St,TA1309000024,41.899015,-87.629916,41.907993,-87.631501,member
1,55FA739D575E7AE3,classic_bike,2024-05-01 00:02:35,2024-05-01 00:05:54,Dearborn Pkwy & Delaware Pl,TA1307000128,Clark St & Schiller St,TA1309000024,41.898969,-87.629912,41.907993,-87.631501,member
2,7A296E31619D21E9,classic_bike,2024-05-01 00:04:31,2024-05-01 00:20:13,Broadway & Barry Ave,13137,Broadway & Wilson Ave,13074,41.937582,-87.644098,41.965221,-87.658139,member
3,2FB602B952B83DB7,electric_bike,2024-05-01 00:04:34,2024-05-01 00:12:17,Broadway & Granville Ave,15571,Sheridan Rd & Loyola Ave,RP-009,41.99477,-87.660287,42.001044,-87.661198,casual
4,2D37389234DB639E,electric_bike,2024-05-01 00:05:13,2024-05-01 00:05:48,Western Ave & Ardmore Ave,464,Western Ave & Ardmore Ave,464,41.986607,-87.689669,41.986764,-87.68988,casual


In [3]:
# Peek at the top five rows of the long-term frame as loaded
df_longterm.head(n=5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,0D139A3203274B87,classic_bike,2021-01-01 00:02:24,2021-01-01 00:08:39,State St & 33rd St,13216,MLK Jr Dr & 29th St,TA1307000139,41.834734,-87.625813,41.842052,-87.617,member
1,C7AE8E9CDB197A8E,classic_bike,2021-01-01 00:06:55,2021-01-01 00:26:36,Lakeview Ave & Fullerton Pkwy,TA1309000019,Ritchie Ct & Banks St,KA1504000134,41.925858,-87.638973,41.906866,-87.626217,member
2,3097EF26414C7016,classic_bike,2021-01-01 00:12:21,2021-01-01 00:12:33,Montrose Harbor,TA1308000012,Montrose Harbor,TA1308000012,41.963982,-87.638181,41.963982,-87.638181,member
3,938D5D1998A5470E,classic_bike,2021-01-01 00:12:27,2021-01-01 00:12:30,Montrose Harbor,TA1308000012,Montrose Harbor,TA1308000012,41.963982,-87.638181,41.963982,-87.638181,casual
4,6604F61AE4B14BC1,electric_bike,2021-01-01 00:12:49,2021-01-01 00:43:59,Western Ave & Howard St,527,Campbell Ave & Fullerton Ave,15648,42.018858,-87.690022,41.92468,-87.689328,member


### 🟨 Step 1: Check for Errors (Nulls, Duplicates, Unexpected Values)

In [5]:
# Count missing entries per column in each dataset
null_counts_last12 = df_last12.isna().sum()
null_counts_longterm = df_longterm.isna().sum()

print("Null values in last12:\n", null_counts_last12)
print("\nNull values in longterm:\n", null_counts_longterm)

Null values in last12:
 ride_id               0
rideable_type         0
started_at            0
ended_at              0
start_station_name    0
start_station_id      0
end_station_name      0
end_station_id        0
start_lat             0
start_lng             0
end_lat               0
end_lng               0
member_casual         0
dtype: int64

Null values in longterm:
 ride_id                 0
rideable_type           0
started_at              0
ended_at                0
start_station_name      0
start_station_id      120
end_station_name        0
end_station_id        130
start_lat               0
start_lng               0
end_lat               116
end_lng               116
member_casual           0
dtype: int64


In [6]:
# Check for duplicated ride_id (should be zero after cleaning)
print("\nDuplicates in last12:", df_last12.duplicated(subset='ride_id').sum())
print("Duplicates in longterm:", df_longterm.duplicated(subset='ride_id').sum())

# Distinct values of each categorical column, per dataset.
# The label is built from the column name so it always matches the column
# being inspected (the rideable_type lines were previously mislabeled as
# "member_casual values").
for column in ('member_casual', 'rideable_type'):
    print(f"\n{column} values (last12):", df_last12[column].unique())
    print(f"{column} values (longterm):", df_longterm[column].unique())


Duplicates in last12: 0
Duplicates in longterm: 0

member_casual values (last12): ['member' 'casual']
member_casual values (longterm): ['member' 'casual']

member_casual values (last12): ['electric_bike' 'classic_bike' 'electric_scooter']
member_casual values (longterm): ['classic_bike' 'electric_bike' 'docked_bike']


### ✅ Data Quality Summary

* **Null values**:

  * `last12`: **0 nulls** in every column — the dataset is fully complete.
  * `longterm`: nulls only in:

    * `start_station_id` (120 rows),
    * `end_station_id` (130 rows),
    * `end_lat`, `end_lng` (116 rows)
      → These fields are **not essential** for core analysis and can be retained as-is.

* **Duplicates**:

  * No duplicate `ride_id` values in either dataset.

* **`member_casual` values**:

  * ✅ Valid categories: only `'member'` and `'casual'` in both datasets.

* **`rideable_type` values**:

  * `last12`: `'electric_bike'`, `'classic_bike'`, `'electric_scooter'`
  * `longterm`: `'classic_bike'`, `'electric_bike'`, `'docked_bike'`
    → Minor variation due to changes in naming or vehicle types over time. All are valid.

---

📌 **Conclusion:**
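Both datasets are consistent and ready for feature engineering. The only remaining nulls are in `longterm`, confined to the non-essential station-ID and end-coordinate fields, so no action is needed for nulls or categories. Proceed to calculating `ride_length`, `day_of_week`, and `hour_of_day`.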


### 🟨 Step 2: Feature Engineering — Add ride_length, day_of_week, hour_of_day

In [8]:
def _add_ride_features(rides):
    """Parse the ride timestamps and append the derived columns, in place.

    Converts `started_at` and `ended_at` to datetimes (format='mixed' lets
    pandas infer the format per value, so timestamps with and without a
    fractional-seconds suffix such as .000 both parse), then adds:

    - ride_length: ended_at minus started_at, expressed in minutes
    - day_of_week: weekday name of started_at
    - hour_of_day: hour component of started_at
    """
    for timestamp_col in ('started_at', 'ended_at'):
        rides[timestamp_col] = pd.to_datetime(rides[timestamp_col], format='mixed')

    elapsed = rides['ended_at'] - rides['started_at']
    rides['ride_length'] = elapsed.dt.total_seconds() / 60  # seconds -> minutes
    rides['day_of_week'] = rides['started_at'].dt.day_name()
    rides['hour_of_day'] = rides['started_at'].dt.hour


# Apply the same transformation to both datasets
_add_ride_features(df_last12)
_add_ride_features(df_longterm)


In [9]:
# Top five rows of df_last12, now including ride_length, day_of_week and hour_of_day
df_last12.head(n=5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,ride_length,day_of_week,hour_of_day
0,E92D44EC0F271F6D,electric_bike,2024-05-01 00:01:58,2024-05-01 00:05:45,Dearborn Pkwy & Delaware Pl,TA1307000128,Clark St & Schiller St,TA1309000024,41.899015,-87.629916,41.907993,-87.631501,member,3.783333,Wednesday,0
1,55FA739D575E7AE3,classic_bike,2024-05-01 00:02:35,2024-05-01 00:05:54,Dearborn Pkwy & Delaware Pl,TA1307000128,Clark St & Schiller St,TA1309000024,41.898969,-87.629912,41.907993,-87.631501,member,3.316667,Wednesday,0
2,7A296E31619D21E9,classic_bike,2024-05-01 00:04:31,2024-05-01 00:20:13,Broadway & Barry Ave,13137,Broadway & Wilson Ave,13074,41.937582,-87.644098,41.965221,-87.658139,member,15.7,Wednesday,0
3,2FB602B952B83DB7,electric_bike,2024-05-01 00:04:34,2024-05-01 00:12:17,Broadway & Granville Ave,15571,Sheridan Rd & Loyola Ave,RP-009,41.99477,-87.660287,42.001044,-87.661198,casual,7.716667,Wednesday,0
4,2D37389234DB639E,electric_bike,2024-05-01 00:05:13,2024-05-01 00:05:48,Western Ave & Ardmore Ave,464,Western Ave & Ardmore Ave,464,41.986607,-87.689669,41.986764,-87.68988,casual,0.583333,Wednesday,0


In [10]:
# Top five rows of df_longterm, now including ride_length, day_of_week and hour_of_day
df_longterm.head(n=5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,ride_length,day_of_week,hour_of_day
0,0D139A3203274B87,classic_bike,2021-01-01 00:02:24,2021-01-01 00:08:39,State St & 33rd St,13216,MLK Jr Dr & 29th St,TA1307000139,41.834734,-87.625813,41.842052,-87.617,member,6.25,Friday,0
1,C7AE8E9CDB197A8E,classic_bike,2021-01-01 00:06:55,2021-01-01 00:26:36,Lakeview Ave & Fullerton Pkwy,TA1309000019,Ritchie Ct & Banks St,KA1504000134,41.925858,-87.638973,41.906866,-87.626217,member,19.683333,Friday,0
2,3097EF26414C7016,classic_bike,2021-01-01 00:12:21,2021-01-01 00:12:33,Montrose Harbor,TA1308000012,Montrose Harbor,TA1308000012,41.963982,-87.638181,41.963982,-87.638181,member,0.2,Friday,0
3,938D5D1998A5470E,classic_bike,2021-01-01 00:12:27,2021-01-01 00:12:30,Montrose Harbor,TA1308000012,Montrose Harbor,TA1308000012,41.963982,-87.638181,41.963982,-87.638181,casual,0.05,Friday,0
4,6604F61AE4B14BC1,electric_bike,2021-01-01 00:12:49,2021-01-01 00:43:59,Western Ave & Howard St,527,Campbell Ave & Fullerton Ave,15648,42.018858,-87.690022,41.92468,-87.689328,member,31.166667,Friday,0


### 🟨 Step 3: Save Transformed Datasets

In [None]:
# Write both transformed frames to disk; index=False keeps the row index
# out of the CSV files.
outputs = [
    (df_last12, '../../data/processed/processed_last12.csv'),
    (df_longterm, '../../data/processed/processed_longterm.csv'),
]
for frame, destination in outputs:
    frame.to_csv(destination, index=False)

# Confirmation message listing the files just written
print("✅ Saved:")
print("- processed_last12.csv")
print("- processed_longterm.csv")

✅ Saved:
- processed_last12.csv
- processed_longterm.csv
