# Data Cleaning

In [1]:
import pandas as pd
from src.paths import ALTERED_DATA_TYPES, TEMPORARY_DATA, CLEANED_DATA

In [2]:
# Read the trips_2023 parquet file from the "2020 - 2023" folder under ALTERED_DATA_TYPES.
trips_2023 = pd.read_parquet(
    path=ALTERED_DATA_TYPES / "2020 - 2023/trips_2023.parquet"
)

In [3]:
# Number of missing values in each column of the freshly loaded frame.
trips_2023.isnull().sum()

ride_id                    0
rideable_type              0
started_at                 0
ended_at                   0
start_station_name    875716
start_station_id      875848
end_station_name      929202
end_station_id        929343
start_lat                  0
start_lng                  0
end_lat                 6990
end_lng                 6990
member_casual              0
dtype: int64

### Removing unnecessary features and renaming the remaining ones

In [4]:
# Discard the identifier/category columns, then rename every remaining column.
# Both steps are chained and the result is bound back to `trips_2023`.
trips_2023 = (
    trips_2023
    .drop(
        columns=[
            "ride_id",
            "rideable_type",
            "member_casual",
            "start_station_id",
            "end_station_id",
        ]
    )
    .rename(
        columns={
            "started_at": "start_time",
            "ended_at": "stop_time",
            "start_station_name": "from_station_name",
            "end_station_name": "to_station_name",
            "start_lat": "start_latitude",
            "start_lng": "start_longitude",
            "end_lat": "stop_latitude",
            "end_lng": "stop_longitude",
        }
    )
)

#### Checking out the missing values

In [5]:
# Missing values per column after the drop/rename step.
trips_2023.isnull().sum()

start_time                0
stop_time                 0
from_station_name    875716
to_station_name      929202
start_latitude            0
start_longitude           0
stop_latitude          6990
stop_longitude         6990
dtype: int64

###### For each year, there are a few thousand trips whose destination coordinates are unknown (6,990 in 2023); the rows displayed below are also missing the destination station name.

In [6]:
# Display the trips that have no destination latitude recorded.
trips_2023.loc[trips_2023["stop_latitude"].isna()]

Unnamed: 0,start_time,stop_time,from_station_name,to_station_name,start_latitude,start_longitude,stop_latitude,stop_longitude
69506,2023-01-01 04:45:39,2023-01-02 05:45:28,State St & Van Buren St,,41.877181,-87.627844,,
69523,2023-01-09 14:20:41,2023-01-15 04:19:09,Indiana Ave & Roosevelt Rd,,41.867888,-87.623041,,
69531,2023-01-30 13:24:22,2023-01-31 14:24:09,State St & 19th St,,41.856594,-87.627542,,
69952,2023-01-31 09:21:10,2023-02-01 10:21:02,Halsted St & Maxwell St,,41.864883,-87.647071,,
70167,2023-01-07 12:52:32,2023-01-08 06:47:21,New St & Illinois St,,41.890847,-87.618617,,
...,...,...,...,...,...,...,...,...
210132,2023-12-08 13:16:13,2023-12-09 14:15:52,900 W Harrison St,,41.874754,-87.649807,,
210171,2023-12-21 21:13:52,2023-12-22 22:13:48,Damen Ave & Madison St,,41.881370,-87.674930,,
210172,2023-12-21 21:14:03,2023-12-22 22:13:59,Damen Ave & Madison St,,41.881370,-87.674930,,
210173,2023-12-21 21:13:48,2023-12-22 22:13:40,Damen Ave & Madison St,,41.881370,-87.674930,,


In [7]:
# Remove both station-name columns; only timestamps and coordinates remain.
trips_2023 = trips_2023.drop(columns=["from_station_name", "to_station_name"])

In [8]:
# DataFrame.dropna() returns a new frame and leaves the original untouched.
# The result therefore has to be bound back to `trips_2023`; otherwise the
# rows with missing destination coordinates would still be present when the
# frame is de-duplicated and written to disk further down.
trips_2023 = trips_2023.dropna()

# Display the frame with the incomplete rows removed.
trips_2023

Unnamed: 0,start_time,stop_time,start_latitude,start_longitude,stop_latitude,stop_longitude
0,2023-01-21 20:05:42,2023-01-21 20:16:33,41.924074,-87.646278,41.930000,-87.640000
1,2023-01-10 15:37:36,2023-01-10 15:46:05,41.799568,-87.594747,41.809835,-87.599383
2,2023-01-02 07:51:57,2023-01-02 08:05:11,42.008571,-87.690483,42.039742,-87.699413
3,2023-01-22 10:52:58,2023-01-22 11:01:44,41.799568,-87.594747,41.809835,-87.599383
4,2023-01-12 13:58:01,2023-01-12 14:13:20,41.799568,-87.594747,41.809835,-87.599383
...,...,...,...,...,...,...
224068,2023-12-07 13:15:24,2023-12-07 13:17:37,41.874702,-87.649804,41.874640,-87.657030
224069,2023-12-08 18:42:21,2023-12-08 18:45:56,41.874754,-87.649807,41.874640,-87.657030
224070,2023-12-05 14:09:11,2023-12-05 14:13:01,41.874754,-87.649807,41.874640,-87.657030
224071,2023-12-02 21:36:07,2023-12-02 21:53:45,41.881396,-87.674984,41.885492,-87.652289


##### Duplicate Values

In [9]:
# Keep only the first occurrence of each fully identical row.
trips_2023 = trips_2023.drop_duplicates()

## Final Data

In [10]:
# Write the frame to final.parquet inside CLEANED_DATA.
trips_2023.to_parquet(CLEANED_DATA / "final.parquet")

In [11]:
# Preview the first five rows of the frame that was just saved.
trips_2023.head(n=5)

Unnamed: 0,start_time,stop_time,start_latitude,start_longitude,stop_latitude,stop_longitude
0,2023-01-21 20:05:42,2023-01-21 20:16:33,41.924074,-87.646278,41.93,-87.64
1,2023-01-10 15:37:36,2023-01-10 15:46:05,41.799568,-87.594747,41.809835,-87.599383
2,2023-01-02 07:51:57,2023-01-02 08:05:11,42.008571,-87.690483,42.039742,-87.699413
3,2023-01-22 10:52:58,2023-01-22 11:01:44,41.799568,-87.594747,41.809835,-87.599383
4,2023-01-12 13:58:01,2023-01-12 14:13:20,41.799568,-87.594747,41.809835,-87.599383
