# Memory Management

In [1]:
import pandas as pd
from src.core.paths import ORIGINAL_DATA_TYPES, ALTERED_DATA_TYPES
from src.core.miscellaneous import view_memory_usage, change_column_data_type, save_as_parquet

## 2014 - 2017 

### Dealing With Datetimes

#### 2014

In [2]:
trips_2014 = pd.read_parquet(ORIGINAL_DATA_TYPES/"trips_2014.parquet")
trips_2014.head(1)

Unnamed: 0,trip_id,starttime,stoptime,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear
0,2355134,6/30/2014 23:57,7/1/2014 0:07,2006,604,131,Lincoln Ave & Belmont Ave,303,Broadway & Cornelia Ave,Subscriber,Male,1988.0


##### Changing Start & Stop Times to Datetime

In [3]:
trips_2014["starttime"] = pd.to_datetime(trips_2014["starttime"], format = "%m/%d/%Y %H:%M")
trips_2014["stoptime"] = pd.to_datetime(trips_2014["stoptime"], format = "%m/%d/%Y %H:%M")

#### 2015

In [4]:
trips_2015 = pd.read_parquet(ORIGINAL_DATA_TYPES/"trips_2015.parquet")
trips_2015.head(1)

Unnamed: 0,trip_id,starttime,stoptime,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear
0,4738454,3/31/2015 23:58,4/1/2015 0:03,1095,299,117,Wilton Ave & Belmont Ave,300,Broadway & Barry Ave,Subscriber,Male,1994.0


##### Changing Start & Stop Times to Datetime

In [5]:
trips_2015["starttime"] = pd.to_datetime(trips_2015["starttime"], format = "%m/%d/%Y %H:%M")
trips_2015["stoptime"] = pd.to_datetime(trips_2015["stoptime"], format = "%m/%d/%Y %H:%M")

#### 2016

In [6]:
trips_2016 = pd.read_parquet(ORIGINAL_DATA_TYPES/"trips_2016.parquet")
trips_2016.head(1)

Unnamed: 0,trip_id,starttime,stoptime,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear
0,9080551,3/31/2016 23:53,4/1/2016 0:07,155,841,344,Ravenswood Ave & Lawrence Ave,458,Broadway & Thorndale Ave,Subscriber,Male,1986.0


##### Changing Start & Stop Times to Datetime

In [7]:
trips_2016["starttime"] = pd.to_datetime(trips_2016["starttime"], format = "mixed")
trips_2016["stoptime"] = pd.to_datetime(trips_2016["stoptime"], format = "mixed")

#### 2017

In [8]:
trips_2017 = pd.read_parquet(ORIGINAL_DATA_TYPES/"trips_2017.parquet")
trips_2017.head(1)

Unnamed: 0,trip_id,start_time,end_time,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear
0,13518905,3/31/2017 23:59:07,4/1/2017 00:13:24,5292,857,66,Clinton St & Lake St,171,May St & Cullerton St,Subscriber,Male,1989.0


##### Changing Start & Stop Times to Datetime

In [9]:
trips_2017["start_time"] = pd.to_datetime(trips_2017["start_time"], format = "mixed")
trips_2017["end_time"] = pd.to_datetime(trips_2017["end_time"], format = "mixed")

### Changing the Data Types of The Other Columns

##### Trip ID

In [10]:
pre_2018 = [trips_2014, trips_2015, trips_2016, trips_2017]

for dataset in pre_2018:
    change_column_data_type(data = dataset, columns = ["trip_id"], to_format = "int32")

##### Trip Duration

In [11]:
pre_2018 = [trips_2014, trips_2015, trips_2016, trips_2017]

for dataset in pre_2018:
    change_column_data_type(data = dataset, columns = ["tripduration"], to_format = "int16")

##### Birth Year

In [12]:
for dataset in pre_2018:
    change_column_data_type(data = dataset, columns = "birthyear", to_format = "float32")

##### From Station ID

In [13]:
for dataset in pre_2018:
    change_column_data_type(data = trips_2014, columns = "from_station_id", to_format = "int16")

##### Bike ID

In [14]:
for dataset in pre_2018:
    change_column_data_type(data = dataset, columns = "bikeid", to_format = "int16")

##### To Station ID

In [15]:
for dataset in pre_2018:
    change_column_data_type(data = dataset, columns = "to_station_id", to_format = "int16")

##### From Station Name

In [16]:
for dataset in pre_2018:
    change_column_data_type(data = dataset, columns = "from_station_name", to_format = "category")

##### To Station Name

In [17]:
for dataset in pre_2018:
    change_column_data_type(data = dataset, columns = "to_station_name", to_format = "category")

##### Usertype

In [18]:
for dataset in pre_2018:
    change_column_data_type(data = dataset, columns = "usertype", to_format = "category")

##### Gender

In [19]:
for dataset in pre_2018:
    change_column_data_type(data = dataset, columns = "gender", to_format = "category")

In [20]:
trips_2014.dtypes

trip_id                       int32
starttime            datetime64[ns]
stoptime             datetime64[ns]
bikeid                        int16
tripduration                  int16
from_station_id               int16
from_station_name          category
to_station_id                 int16
to_station_name            category
usertype                   category
gender                     category
birthyear                   float32
dtype: object

### Saving the data from 2014 to 2017 as parquet files

In [21]:
save_as_parquet(
    list_of_dataframes = pre_2018,
    folder_name = ALTERED_DATA_TYPES/"2014 - 2017"
    )  

4it [00:05,  1.30s/it]


## 2018 - 2019

### 2018 & 2019: Special Cases

In [2]:
trips_2018 = pd.read_parquet(ORIGINAL_DATA_TYPES/"trips_2018.parquet")
trips_2019 = pd.read_parquet(ORIGINAL_DATA_TYPES/"trips_2019.parquet")

#### Changing Start & Stop Times to Datetime

In [3]:
for dataset in [trips_2018, trips_2019]:

    dataset["01 - Rental Details Local Start Time"] = pd.to_datetime(
    dataset["01 - Rental Details Local Start Time"], format = "%Y-%m-%d %H:%M:%S"
    )


    dataset["01 - Rental Details Local End Time"] = pd.to_datetime(
        dataset["01 - Rental Details Local End Time"], format = "%Y-%m-%d %H:%M:%S"
        )


    dataset["start_time"] = pd.to_datetime(dataset["start_time"], format = "%Y-%m-%d %H:%M:%S")
    dataset["end_time"] = pd.to_datetime(dataset["end_time"], format = "%Y-%m-%d %H:%M:%S")

#### Changing the Data Types of The Other Columns

###### Changing the relevant columns into integer has posed a slight challenge because a lot of these columns have missing values. So this work will be done in the Data Cleaning notebook.

##### Categoricals

In [4]:
for dataset in [trips_2018, trips_2019]:
    
    change_column_data_type(
        data = dataset, 
        columns = [
            "Member Gender", "User Type", "gender", "usertype", "from_station_name", "to_station_name", 
            "03 - Rental Start Station Name", "02 - Rental End Station Name"
            ], 
        to_format = "category"
        )

In [5]:
trips_2018.dtypes

01 - Rental Details Rental ID                              float64
01 - Rental Details Local Start Time                datetime64[ns]
01 - Rental Details Local End Time                  datetime64[ns]
01 - Rental Details Bike ID                                float64
01 - Rental Details Duration In Seconds Uncapped            object
03 - Rental Start Station ID                               float64
03 - Rental Start Station Name                            category
02 - Rental End Station ID                                 float64
02 - Rental End Station Name                              category
User Type                                                 category
Member Gender                                             category
05 - Member Details Member Birthday Year                   float64
trip_id                                                    float64
start_time                                          datetime64[ns]
end_time                                            datetime64

#### Save data as parquet files

In [5]:
save_as_parquet(
    list_of_dataframes = [trips_2018, trips_2019],
    folder_name = ALTERED_DATA_TYPES/"2018 - 2019"
    )

2it [00:06,  3.12s/it]


## 2020 - 2023

### Dealing With Datetimes

#### 2020

In [2]:
trips_2020 = pd.read_parquet(ORIGINAL_DATA_TYPES/"trips_2020.parquet")
trips_2020.head(2)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,EACB19130B0CDA4A,docked_bike,2020-01-21 20:06:59,2020-01-21 20:14:30,Western Ave & Leland Ave,239.0,Clark St & Leland Ave,326.0,41.9665,-87.6884,41.9671,-87.6674,member
1,8FED874C809DC021,docked_bike,2020-01-30 14:22:39,2020-01-30 14:26:22,Clark St & Montrose Ave,234.0,Southport Ave & Irving Park Rd,318.0,41.9616,-87.666,41.9542,-87.6644,member


#### 2021

In [3]:
trips_2021 = pd.read_parquet(ORIGINAL_DATA_TYPES/"trips_2021.parquet")
trips_2021.head(2)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,E19E6F1B8D4C42ED,electric_bike,2021-01-23 16:14:19,2021-01-23 16:24:44,California Ave & Cortez St,17660,,,41.900341,-87.696743,41.89,-87.72,member
1,DC88F20C2C55F27F,electric_bike,2021-01-27 18:43:08,2021-01-27 18:47:12,California Ave & Cortez St,17660,,,41.900333,-87.696707,41.9,-87.69,member


#### 2022

In [4]:
trips_2022 = pd.read_parquet(ORIGINAL_DATA_TYPES/"trips_2022.parquet")
trips_2022.head(2)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,C2F7DD78E82EC875,electric_bike,2022-01-13 11:59:47,2022-01-13 12:02:44,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,42.0128,-87.665906,42.01256,-87.674367,casual
1,A6CF8980A652D272,electric_bike,2022-01-10 08:41:56,2022-01-10 08:46:17,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,42.012763,-87.665967,42.01256,-87.674367,casual


#### 2023

In [5]:
trips_2023 = pd.read_parquet(ORIGINAL_DATA_TYPES/"trips_2023.parquet")
trips_2023.head(2)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,F96D5A74A3E41399,electric_bike,2023-01-21 20:05:42,2023-01-21 20:16:33,Lincoln Ave & Fullerton Ave,TA1309000058,Hampden Ct & Diversey Ave,202480.0,41.924074,-87.646278,41.93,-87.64,member
1,13CB7EB698CEDB88,classic_bike,2023-01-10 15:37:36,2023-01-10 15:46:05,Kimbark Ave & 53rd St,TA1309000037,Greenwood Ave & 47th St,TA1308000002,41.799568,-87.594747,41.809835,-87.599383,member


In [6]:
from_2020 = [trips_2020, trips_2021, trips_2022, trips_2023]

for dataset in from_2020:
    
    dataset["started_at"] = pd.to_datetime(dataset["started_at"], format = "mixed")
    dataset["ended_at"] = pd.to_datetime(dataset["ended_at"], format = "mixed")

### Changing the Data Types of The Other Columns

In [7]:
for dataset in from_2020:
    dataset.drop(["start_station_id", "end_station_id"], axis = 1, inplace = True)

##### Rideable Type & Member Type

In [8]:
for dataset in from_2020:
    change_column_data_type(data = dataset, columns = ["rideable_type", "member_casual"], to_format = "category")

##### Starting & Ending Station Names

In [9]:
for dataset in from_2020:
    change_column_data_type(data = dataset, columns = ["start_station_name", "end_station_name"], to_format = "category")

###### The Latitudes & Longitudes could not be changed into int16 values due to the presence of missing values. During the cleaning stage, this issue will be taken care of.

### Saving the data from 2020 to 2023 as pkls

In [10]:
save_as_parquet(
    list_of_dataframes = from_2020,
    folder_name = ALTERED_DATA_TYPES/"2020 - 2023"
    )

4it [00:11,  2.88s/it]
