# Data Cleaning

In [46]:
import pandas as pd
from src.core.paths import RAW_DATA_DIR, ALTERED_DATA_PICKLES, CLEANED_DATA
from src.core.data import save_as_pkl, find_first_nan, change_column_data_type

## 2014 - 2017

In [47]:
trips_2014 = pd.read_pickle(ALTERED_DATA_PICKLES/"2014 - 2017/0.pkl")
trips_2015 = pd.read_pickle(ALTERED_DATA_PICKLES/"2014 - 2017/1.pkl")
trips_2016 = pd.read_pickle(ALTERED_DATA_PICKLES/"2014 - 2017/2.pkl")
trips_2017 = pd.read_pickle(ALTERED_DATA_PICKLES/"2014 - 2017/3.pkl")

pre_2018 = [trips_2014, trips_2015, trips_2016, trips_2017]

##### Dropping Irrelevant Columns

###### All four of these dataframes have the same column names. I am removing the station IDs (for the station at the beginning of each trip) because I will be more interested in deriving the latitudes and longitudes of each station

In [48]:
# Removing said columns
for dataset in pre_2018:

    dataset.drop(
        columns = ["trip_id", "bikeid", "from_station_id", "to_station_id", "birthyear", "gender"],
        inplace = True
    )

##### Renaming certain columns

In [49]:
for dataset in pre_2018:

    # Rename some of the columns
    dataset.rename(
        columns = {
            "tripduration" : "trip_duration (seconds)",
            "usertype" : "user_type",
            "birthyear" : "birth_year",
            "starttime" : "start_time",
            "stoptime" : "stop_time"
        }, inplace = True
    )

##### Checking for Missing Values

In [50]:
for dataset in pre_2018:

    dataset.isna().sum() 
    print("######################")

######################
######################
######################
######################


##### Checking for Duplicates

In [51]:
# Use a list comprehension to view the number of duplicated observations in each dataset
[
    dataset[dataset.duplicated(keep = "last") == True].shape[0] for dataset in pre_2018
]

[3132, 3583, 799353, 5152]

###### All four datasets from 2014 - 2017 contain duplicated observations, so we remove them in the next step

In [52]:
for dataset in pre_2018:

    dataset.drop_duplicates(inplace = True)

## 2018 - 2019

### 2018

In [53]:
trips_2018 = pd.read_pickle(ALTERED_DATA_PICKLES/"2018 - 2019/0.pkl")
trips_2018.columns

Index(['01 - Rental Details Rental ID', '01 - Rental Details Local Start Time',
       '01 - Rental Details Local End Time', '01 - Rental Details Bike ID',
       '01 - Rental Details Duration In Seconds Uncapped',
       '03 - Rental Start Station ID', '03 - Rental Start Station Name',
       '02 - Rental End Station ID', '02 - Rental End Station Name',
       'User Type', 'Member Gender',
       '05 - Member Details Member Birthday Year', 'trip_id', 'start_time',
       'end_time', 'bikeid', 'tripduration', 'from_station_id',
       'from_station_name', 'to_station_id', 'to_station_name', 'usertype',
       'gender', 'birthyear'],
      dtype='object')

In [54]:
trips_2018.drop(
    columns = ["Member Gender", "05 - Member Details Member Birthday Year", "trip_id", "from_station_id", "to_station_id",
               "birthyear", "gender", "01 - Rental Details Rental ID", "03 - Rental Start Station ID", "02 - Rental End Station ID"], 
    inplace = True
    )

#### Viewing the Data

In [55]:
trips_2018

Unnamed: 0,01 - Rental Details Local Start Time,01 - Rental Details Local End Time,01 - Rental Details Bike ID,01 - Rental Details Duration In Seconds Uncapped,03 - Rental Start Station Name,02 - Rental End Station Name,User Type,start_time,end_time,bikeid,tripduration,from_station_name,to_station_name,usertype
0,2018-01-01 00:12:00,2018-01-01 00:17:23,3304.0,323.0,Damen Ave & Pierce Ave,Claremont Ave & Hirsch St,Subscriber,NaT,NaT,,,,,
1,2018-01-01 00:41:35,2018-01-01 00:47:52,5367.0,377.0,Winthrop Ave & Lawrence Ave,Clark St & Winnemac Ave (Temp),Subscriber,NaT,NaT,,,,,
2,2018-01-01 00:44:46,2018-01-01 01:33:10,4599.0,2904.0,LaSalle St & Washington St,Troy St & North Ave,Subscriber,NaT,NaT,,,,,
3,2018-01-01 00:53:10,2018-01-01 01:05:37,2302.0,747.0,Rush St & Hubbard St,Larrabee St & Oak St,Subscriber,NaT,NaT,,,,,
4,2018-01-01 00:53:37,2018-01-01 00:56:40,3696.0,183.0,Blue Island Ave & 18th St,Paulina St & 18th St,Subscriber,NaT,NaT,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
642681,NaT,NaT,,,,,,2018-12-31 23:45:17,2018-12-31 23:50:05,2931.0,288.0,Dearborn St & Monroe St,Franklin St & Lake St,Subscriber
642682,NaT,NaT,,,,,,2018-12-31 23:48:48,2018-12-31 23:57:22,4386.0,514.0,Dearborn St & Van Buren St (*),State St & Randolph St,Subscriber
642683,NaT,NaT,,,,,,2018-12-31 23:50:09,2018-12-31 23:57:16,4927.0,427.0,Federal St & Polk St,Michigan Ave & Lake St,Subscriber
642684,NaT,NaT,,,,,,2018-12-31 23:55:04,2018-12-31 23:58:24,1350.0,200.0,Clark St & Lincoln Ave,Sedgwick St & North Ave,Subscriber


###### Looking at the structure of 2018's dataframe, it would appear that this dataframe it is a diagonal matrix. It has 3,603,082 rows. Let us consider how many missing values there are in the "lower right" section of what appears to be a diagonal matrix above.

In [56]:
trips_2018.iloc[:,7:].isna().sum()

start_time           387145
end_time             387145
bikeid               387145
tripduration         387145
from_station_name    387145
to_station_name      387145
usertype             387145
dtype: int64

###### As stated, I suspect that this is a diagonal matrix. To confirm this, I have to investigate the number of missing values before and after the line that divides the data into two vertical sections. We see for instance that there are 387,145 missing values in each of the rows of this "lower right" section of the matrix.

###### Let us check how many missing values there are on the "upper left" section of the matrix

In [57]:
trips_2018.iloc[:,:7].isna().sum()

01 - Rental Details Local Start Time                3215937
01 - Rental Details Local End Time                  3215937
01 - Rental Details Bike ID                         3215937
01 - Rental Details Duration In Seconds Uncapped    3215937
03 - Rental Start Station Name                      3215937
02 - Rental End Station Name                        3215937
User Type                                           3215937
dtype: int64

###### Notice that we would have a major piece of supporting evidence (in favour of my suspicion) if the sum of the number of missing values from the two sections equalled the total number of missing values in the whole matrix. This is exactly the case, since 3,215,937 + 387,145 = 3,603,082.

###### We need to find out on which row (of the "lower left" dataframe) the missing values start, so that we can see where this empty block of data begins. It would be a mistake to assume that the missing values on the "lower left" start at the half-way point (row-wise). 

In [58]:
# Find the first row where a missing value occurs
find_first_nan(data = trips_2018, missing = True, just_reveal = True)

387145


###### The first missing value occurs on row #387,145. And for further confirmation, let us check whether there are any non-missing values after this row.

In [59]:
find_first_nan(
    data = trips_2018.iloc[387145:,:],
    missing = False,
    just_reveal = True
)

###### Just to be thorough, let's see when the missing values on the "top left" end

In [60]:
# Find the first row where a missing value occurs
right_half_2018 = [find_first_nan(data = trips_2018.iloc[:,i:], missing = False, just_reveal = False) for i in range(7,14)]
right_half_2018

[387145, 387145, 387145, 387145, 387145, 387145, 387145]

###### They end at the same place where they begin on the "top right"

##### Forming the Final 2018 Dataset

In [61]:
top_left_2018 = trips_2018.iloc[:387145, :7]
bottom_right_2018 = trips_2018.iloc[387145:, 7:]

# Final Renaming of columns
top_left_2018.rename(columns = {
                    "01 - Rental Details Local Start Time": "start_time",
                    "01 - Rental Details Local End Time": "end_time",
                    "01 - Rental Details Bike ID": "bike_id",
                    "01 - Rental Details Duration In Seconds Uncapped": "trip_duration (seconds)",
                    "03 - Rental Start Station Name": "from_station_name",
                    "02 - Rental End Station Name": "to_station_name",
                    "User Type": "user_type"}, inplace = True)


bottom_right_2018.rename(columns = {
                        "bikeid": "bike_id",
                        "tripduration": "trip_duration (seconds)",
                        "usertype": "user_type"}, inplace = True)


In [62]:
trips_2018 = pd.concat([top_left_2018, bottom_right_2018], axis = 0, ignore_index = True)

### 2019

#### Viewing the Data

In [63]:
trips_2019 = pd.read_pickle(ALTERED_DATA_PICKLES/"2018 - 2019/1.pkl")
trips_2019.columns

Index(['trip_id', 'start_time', 'end_time', 'bikeid', 'tripduration',
       'from_station_id', 'from_station_name', 'to_station_id',
       'to_station_name', 'usertype', 'gender', 'birthyear',
       '01 - Rental Details Rental ID', '01 - Rental Details Local Start Time',
       '01 - Rental Details Local End Time', '01 - Rental Details Bike ID',
       '01 - Rental Details Duration In Seconds Uncapped',
       '03 - Rental Start Station ID', '03 - Rental Start Station Name',
       '02 - Rental End Station ID', '02 - Rental End Station Name',
       'User Type', 'Member Gender',
       '05 - Member Details Member Birthday Year'],
      dtype='object')

In [64]:
trips_2019.reset_index(inplace = True)
trips_2019.columns

Index(['index', 'trip_id', 'start_time', 'end_time', 'bikeid', 'tripduration',
       'from_station_id', 'from_station_name', 'to_station_id',
       'to_station_name', 'usertype', 'gender', 'birthyear',
       '01 - Rental Details Rental ID', '01 - Rental Details Local Start Time',
       '01 - Rental Details Local End Time', '01 - Rental Details Bike ID',
       '01 - Rental Details Duration In Seconds Uncapped',
       '03 - Rental Start Station ID', '03 - Rental Start Station Name',
       '02 - Rental End Station ID', '02 - Rental End Station Name',
       'User Type', 'Member Gender',
       '05 - Member Details Member Birthday Year'],
      dtype='object')

###### Let us remove the columns that we are not going to use

In [65]:
trips_2019.drop(
    columns = [
        "trip_id", "gender", "birthyear", "01 - Rental Details Rental ID", "01 - Rental Details Bike ID",
        "03 - Rental Start Station ID", "02 - Rental End Station ID", "Member Gender", "bikeid",
        "05 - Member Details Member Birthday Year", "to_station_id", "from_station_id", "index"
    ], inplace = True
)

##### Dealing With Missing Values

###### The 2019 dataframe has 3,818,004 rows. Let us check for the number of missing values 

In [66]:
trips_2019.isna().sum()

start_time                                          1108163
end_time                                            1108163
tripduration                                        1108163
from_station_name                                   1108163
to_station_name                                     1108163
usertype                                            1108163
01 - Rental Details Local Start Time                2709841
01 - Rental Details Local End Time                  2709841
01 - Rental Details Duration In Seconds Uncapped    2709841
03 - Rental Start Station Name                      2709841
02 - Rental End Station Name                        2709841
User Type                                           2709841
dtype: int64

###### As with the 2018 data, there are (in reality) only 6 columns here, but those columns have been duplicated under different names. The number of rows has been divided in the following manner: in the first 6 columns, 1,108,163 of the values are missing values. The second set of 6 columns (which I consider to be a duplicate of the first set) there are 2,709,841 missing values.

###### Let us have a look at the exact structure of these blocks of missing values is.

In [67]:
# Produce a list consisting of the row index on which a NaN first occurs for each column
first_nan_locations_2019 = [
    find_first_nan(data = trips_2019.iloc[:,i:], missing = True, just_reveal = False) for i in range(0,12)
]

In [68]:
# View said list 
first_nan_locations_2019

[365069, 365069, 365069, 365069, 365069, 365069, 0, 0, 0, 0, 0, 0]

###### The missing values in the first set of 6 columns begin on row #356,069. Those in the second set of 6 columns start from the beginning. So let us invert the question by asking when the non-missing values in the data start?

In [69]:
# Produce a list consisting of the row index on which a non-NaN first occurs for each column
first_non_nan_locations_2019 = [
    find_first_nan(data = trips_2019.iloc[:,i:], missing = False, just_reveal = False) for i in range(0,12)
] 

In [70]:
# Again, view said list
first_non_nan_locations_2019

[0, 0, 0, 0, 0, 0, 365069, 365069, 365069, 365069, 365069, 365069]

###### The non-missing values in the second set of 6 columns also begin on row #356,069. This suggests that the missing values in one set of 6 columns are possibly present in the other set. If true, this would mean that, as a result of the evident duplication of columns, there are in fact no missing values in the data.

###### Let us check whether the number of missing values remains constant for every row. This would lend some credence to our budding theory.

In [71]:
for i in first_nan_locations_2019[:6]:

    row_nan_count = 0
    
    for j in range(0,12):

        if pd.isnull(trips_2019.iloc[i,j]) == True:

            row_nan_count += 1

    print(f"There are {row_nan_count} missing values on row #{i}")

There are 6 missing values on row #365069
There are 6 missing values on row #365069
There are 6 missing values on row #365069
There are 6 missing values on row #365069
There are 6 missing values on row #365069
There are 6 missing values on row #365069


###### Every row that contains missing values contains exactly 6 of them. Let us go one step further to confirm the theory. I will begin to isolate the values in the left half of the dataset, and check whether they are all missing. We have already seen that there are 1,108,163 missing values in the left half of the data, and we have seen that the missing values start from row #365069 for its 6 columns. What we want to confirm now is whether or not these 1,108,163 missing values all occur exactly one after the other in a single unbroken sequence.

In [72]:
# Isolate the missing values in the left half of the dataset
trips_2019.iloc[365069: 365069 + 1108163, :6].isna().sum()

start_time           1108163
end_time             1108163
tripduration         1108163
from_station_name    1108163
to_station_name      1108163
usertype             1108163
dtype: int64

###### Indeed they do. 

###### Let us investigate the missing values in the second half of the data. To begin with, we know that the first 365,069 values are missing. But we also know that there are 2,709,841 missing values in each column of the second half of the data. We know that 365,070 is not a missing value. But where are the remaining 2,344,772 values (per column)?

In [73]:
trips_2019.iloc[:365069, 6:12].isna().sum()

01 - Rental Details Local Start Time                365069
01 - Rental Details Local End Time                  365069
01 - Rental Details Duration In Seconds Uncapped    365069
03 - Rental Start Station Name                      365069
02 - Rental End Station Name                        365069
User Type                                           365069
dtype: int64

In [74]:
# Produce a list consisting of the row index on which a NaN first occurs for each column
find_first_nan(data = trips_2019.iloc[365069:, 6:12], missing = True, just_reveal = True)

1108163


In [75]:
find_first_nan(data = trips_2019.iloc[1108163:, 6:12], missing = True, just_reveal = True)

365069


###### We have found the remaining 2,344,772 missing values

In [76]:
trips_2019.iloc[365069+1108163:, 6:12].isna().sum()

01 - Rental Details Local Start Time                2344772
01 - Rental Details Local End Time                  2344772
01 - Rental Details Duration In Seconds Uncapped    2344772
03 - Rental Start Station Name                      2344772
02 - Rental End Station Name                        2344772
User Type                                           2344772
dtype: int64

###### Below, we see that these values are present in the same rows and in the preceding six columns.

In [77]:
trips_2019.iloc[365069+1108163:, :6].isna().sum()

start_time           0
end_time             0
tripduration         0
from_station_name    0
to_station_name      0
usertype             0
dtype: int64

##### Forming the Final 2019 Dataset

In [78]:
trips_2019.rename(columns = {"tripduration": "trip_duration (seconds)", "usertype": "user_type"}, inplace = True)

In [79]:
top_left_2019 = trips_2019.iloc[:365069,:6]
bottom_left_2019 = trips_2019.iloc[365069+1108163:,:6]
right_side_2019 = trips_2019.iloc[365069:365069+1108163,6:]

In [91]:
right_side_2019.columns

Index(['start_time', 'stop_time', 'trip_duration (seconds)',
       'from_station_name', 'to_station_name', 'user_type'],
      dtype='object')

###### We need to rename the columns 

In [80]:
right_side_2019.rename(
    columns = {"01 - Rental Details Local Start Time": "start_time", "01 - Rental Details Local End Time": "stop_time", 
               "01 - Rental Details Duration In Seconds Uncapped": "trip_duration (seconds)", 
               "03 - Rental Start Station Name": "from_station_name", "02 - Rental End Station Name": "to_station_name",
               "User Type": "user_type" 
               }, 

    inplace = True
)

###### We attach these components together

In [81]:
trips_2019 = pd.concat(
    [
        top_left_2019, bottom_left_2019, right_side_2019
    ], axis = 0
)

In [82]:
trips_2019

Unnamed: 0,start_time,end_time,trip_duration (seconds),from_station_name,to_station_name,user_type,stop_time
0,2019-01-01 00:04:37,2019-01-01 00:11:07,390.0,Wabash Ave & Grand Ave,Milwaukee Ave & Grand Ave,Subscriber,NaT
1,2019-01-01 00:08:13,2019-01-01 00:15:34,441.0,State St & Randolph St,Dearborn St & Van Buren St (*),Subscriber,NaT
2,2019-01-01 00:13:23,2019-01-01 00:27:12,829.0,Racine Ave & 18th St,Western Ave & Fillmore St (*),Subscriber,NaT
3,2019-01-01 00:13:45,2019-01-01 00:43:28,1783.0,California Ave & Milwaukee Ave,Clark St & Elm St,Subscriber,NaT
4,2019-01-01 00:14:52,2019-01-01 00:20:56,364.0,Mies van der Rohe Way & Chicago Ave,Streeter Dr & Grand Ave,Subscriber,NaT
...,...,...,...,...,...,...,...
1473227,2019-06-30 23:58:00,NaT,1342.0,Halsted St & Archer Ave,Halsted St & Archer Ave,Customer,2019-07-01 00:20:22
1473228,2019-06-30 23:58:04,NaT,973.0,Wabash Ave & Grand Ave,Calumet Ave & 18th St,Subscriber,2019-07-01 00:14:17
1473229,2019-06-30 23:58:20,NaT,1651.0,Greenview Ave & Diversey Pkwy,Clark St & Chicago Ave,Subscriber,2019-07-01 00:25:51
1473230,2019-06-30 23:58:52,NaT,752.0,Racine Ave & Belmont Ave,Broadway & Barry Ave,Subscriber,2019-07-01 00:11:24


###### That "stop_time" column has a bunch of missing values. Let us find out where the first non-NaNs start

In [83]:
for i in range(trips_2019.shape[0]):

    if pd.isnull(trips_2019.iloc[i,6]) == False:

        print(i)
        break

2709841


###### That "end_time" column has a bunch of missing values, and their positions may mirror the non-NaNs. Let us find out where these NaNs start.

In [84]:
for i in range(trips_2019.shape[0]):

    if pd.isnull(trips_2019.iloc[i,1]) == True:

        print(i)
        break

2709841


###### It's seems to be an exact match. We will merge these values and place them in the "end_time" column, and delete "stop_time"

In [85]:
trips_2019["end_time"] = pd.concat([trips_2019.iloc[:2709841,1], trips_2019.iloc[2709841:, 6]], axis = 0).values
trips_2019.drop("stop_time", axis = 1, inplace = True)

###### I almost forgot to remove duplicate values

In [86]:
# View the duplicate values
trips_2019[trips_2019.duplicated(keep = "first") == True]

Unnamed: 0,start_time,end_time,trip_duration (seconds),from_station_name,to_station_name,user_type
441,2019-01-01 11:55:20,2019-01-01 12:24:44,1764.0,Michigan Ave & Pearson St,Pine Grove Ave & Waveland Ave,Subscriber
516,2019-01-01 12:44:38,2019-01-01 12:50:10,332.0,Clinton St & Roosevelt Rd,Wabash Ave & Roosevelt Rd,Subscriber
610,2019-01-01 13:40:29,2019-01-01 13:50:07,578.0,Ashland Ave & Wrightwood Ave,Sheffield Ave & Willow St,Subscriber
1463,2019-01-02 06:19:14,2019-01-02 06:22:27,193.0,Clark St & Ida B Wells Dr,Franklin St & Jackson Blvd,Subscriber
3407,2019-01-02 13:57:27,2019-01-02 14:04:33,426.0,Desplaines St & Jackson Blvd,Wells St & Polk St,Subscriber
...,...,...,...,...,...,...
1471192,2019-06-30 18:58:34,2019-06-30 19:13:53,919.0,Wentworth Ave & 35th St,Wabash Ave & 16th St,Subscriber
1471213,2019-06-30 19:00:39,2019-06-30 19:31:15,1836.0,Washtenaw Ave & Lawrence Ave,Winchester (Ravenswood) Ave & Balmoral Ave,Subscriber
1471641,2019-06-30 19:35:48,2019-06-30 19:49:25,817.0,Loomis St & Lexington St,Green St & Madison St,Subscriber
1472635,2019-06-30 21:26:03,2019-06-30 21:38:06,723.0,Halsted St & Clybourn Ave (*),Desplaines St & Kinzie St,Subscriber


In [87]:
trips_2019.drop_duplicates(inplace = True)

###### We need to make "trip_duration" a column of floats.

In [43]:
# Pandas' ".astype" method wasn't working, as it viewed the contents of the column as strings.
trips_2019["trip_duration (seconds)"] = trips_2019["trip_duration (seconds)"].apply(lambda x: float(x.split()[0].replace(',', '')))

### Geocoding the data from 2014 - 2019

In [101]:
pre_2020 = [trips_2014, trips_2015, trips_2016, trips_2017, trips_2018, trips_2019]
trips_pre_2020 = pd.concat(pre_2020, axis = 0)

origins = list(trips_pre_2020["from_station_name"].unique())

##### Obtain the Coordinates for Each Point of Origin 

In [104]:
from geopy.geocoders import ArcGIS

geolocator = ArcGIS()origins_with_points
origins_with_points = []

# Get the ArcGis geocoder to obtain the coordinates for each place
for zone in origins[:len(origins)]:

    origins_with_points.append(geolocator.geocode(zone, timeout = None))


# View an element of the list
origins_with_points[0]

Location(Lincoln Ave, Belmont, California, 94002, (37.518965164186, -122.308262778663, 0.0))

In [108]:
origins_with_points

[Location(Lincoln Ave, Belmont, California, 94002, (37.518965164186, -122.308262778663, 0.0)),
 Location(Maxwell Street Express, (41.681652, -87.642383, 0.0)),
 Location(Sheffield Dr, Sylva, North Carolina, 28779, (35.349064425149, -83.211726045728, 0.0)),
 Location(Jackson St, Quapaw, Oklahoma, 74363, (36.958959992956, -94.78611499294, 0.0)),
 Location(Lexington Cir, Loomis, California, 95650, (38.795820785478, -121.129912635553, 0.0)),
 None,
 Location(Canal Rd, Madison, New York, 13402, (42.907764679069, -75.527052108737, 0.0)),
 Location(Grand St, Wabash, Indiana, 46992, (40.80038252922, -85.854300613052, 0.0)),
 None,
 Location(Ravenswood Ct, Montrose, Melbourne, Victoria, 3765, (-37.806213675303, 145.349073443124, 0.0)),
 Location(State, (25.44764, 81.78619, 0.0)),
 Location(Canal St, Missouri Valley, Iowa, 51555, (41.549860743524, -95.897406579417, 0.0)),
 Location(Merchandise Mart, (41.88843, -87.63678, 0.0)),
 Location(Park Ave, Middletown, California, 95461, (38.754552728047,

## 2020 - 2023

In [86]:
trips_2020 = pd.read_pickle(ALTERED_DATA_PICKLES/"2020 - 2023/0.pkl")
trips_2021 = pd.read_pickle(ALTERED_DATA_PICKLES/"2020 - 2023/1.pkl")
trips_2022 = pd.read_pickle(ALTERED_DATA_PICKLES/"2020 - 2023/2.pkl")
trips_2023 = pd.read_pickle(ALTERED_DATA_PICKLES/"2020 - 2023/3.pkl")

In [None]:
post_2020 = [trips_2020, trips_2021, trips_2022, trips_2023]

for dataset in post_2020:

    dataset.drop(columns = ["ride_id", "rideable_type"], inplace = True)

    dataset.rename(
        columns = {
            "started_at": "start_time", 
            "ended_at" : "stop_time",
            "member_casual" : "user_type"
            },
        inplace = True
    )

In [None]:
for dataset in post_2020:

    print(dataset.isna().sum())
    
    print("\n")
    print("############################")
    print("\n")

start_time    0
stop_time     0
start_lat     0
start_lng     0
end_lat       0
end_lng       0
user_type     0
dtype: int64


#############################


start_time    0
stop_time     0
start_lat     0
start_lng     0
end_lat       0
end_lng       0
user_type     0
dtype: int64


#############################


start_time    0
stop_time     0
start_lat     0
start_lng     0
end_lat       0
end_lng       0
user_type     0
dtype: int64


#############################


start_time    0
stop_time     0
start_lat     0
start_lng     0
end_lat       0
end_lng       0
user_type     0
dtype: int64


#############################




###### For each year, there are a couple of thousand trips for which the destination's coordinates and names are unknown.

In [None]:
for dataset in post_2020:    
    
    dataset.drop(
        dataset[pd.isnull(dataset["end_lat"]) == True].index,
        inplace = True
    )

In [None]:
for dataset in post_2020:

    print(dataset.isna().sum())

    print("############################")
    print("\n")

start_time                 0
stop_time                  0
start_station_name     93953
end_station_name      105860
start_lat                  0
start_lng                  0
end_lat                    0
end_lng                    0
user_type                  0
dtype: int64
#############################


start_time                 0
stop_time                  0
start_station_name    687681
end_station_name      731601
start_lat                  0
start_lng                  0
end_lat                    0
end_lng                    0
user_type                  0
dtype: int64
#############################


start_time                 0
stop_time                  0
start_station_name    828073
end_station_name      882455
start_lat                  0
start_lng                  0
end_lat                    0
end_lng                    0
user_type                  0
dtype: int64
#############################


start_time                 0
stop_time                  0
start_station_name    83

###### Since the remaining rows all feature latitudes and longitudes, I no longer need the station names

In [None]:
for dataset in post_2020:

    dataset.drop(
        columns = ["start_station_name", "end_station_name"], 
        inplace = True
    )

##### Duplicate Values

In [None]:
for dataset in post_2020:    
    dataset.drop_duplicates(inplace = True)