# STAGE 3c: PROCESS - Merge Cleaned Data to One Big CSV


## Getting Ready to Ingest the CSV Files and Merge Them into One Big DataFrame

In [3]:
import pandas as pd
from pandas.api.types import CategoricalDtype

## Build one categorical dtype per master reference list.
## Each list is read into a single-column frame (first CSV column used as the
## index) and the values of that column become the categories.

# station ids
station_ids = pd.read_csv(
    './Data/ref_lists/master_station_id_list.csv',
    index_col=0,
    names=['station_id'],
)
cat_station_id_types = CategoricalDtype(categories=station_ids['station_id'])

# station names
station_names = pd.read_csv(
    './Data/ref_lists/master_station_name_list.csv',
    index_col=0,
    names=['station_name'],
)
cat_station_name_types = CategoricalDtype(categories=station_names['station_name'])

# rideable (bike) types
rideable_types = pd.read_csv(
    './Data/ref_lists/master_rideable_type_list.csv',
    index_col=0,
    names=['rideable_type'],
)
cat_rideable_types = CategoricalDtype(categories=rideable_types['rideable_type'])

# member / casual rider types
member_types = pd.read_csv(
    './Data/ref_lists/master_member_casual_list.csv',
    index_col=0,
    names=['member_type'],
)
cat_member_types = CategoricalDtype(categories=member_types['member_type'])

## Table schema for the trip files
## usecols: the columns to import
## dtypes:  column -> datatype mapping for the columns given an explicit dtype
## started_at and ended_at are not in dtypes; they are parsed as dates on read

dtypes = {
    'ride_id': 'str',
    'rideable_type': cat_rideable_types,
    'start_station_id': cat_station_id_types,
    'start_station_name': cat_station_name_types,
    'end_station_id': cat_station_id_types,
    'end_station_name': cat_station_name_types,
    'member_casual': cat_member_types,
}

usecols = [
    'ride_id',
    'rideable_type',
    'start_station_id',
    'start_station_name',
    'end_station_id',
    'end_station_name',
    'member_casual',
    'started_at',
    'ended_at',
    'start_lat',
    'start_lng',
    'end_lat',
    'end_lng',
]

def read_csv_to_df(filename, dtype, usecols):
    """Load one cleaned trip file from ./Data/cleaned_csv/ into a DataFrame.

    filename: name of the file inside ./Data/cleaned_csv/
    dtype:    column -> dtype mapping passed through to pd.read_csv
    usecols:  columns to load, passed through to pd.read_csv

    Returns the loaded DataFrame, with started_at and ended_at parsed as dates.
    Also prints the file name and the frame's deep memory usage in megabytes.
    """
    csv_path = './Data/cleaned_csv/' + filename
    trips = pd.read_csv(
        csv_path,
        usecols=usecols,
        dtype=dtype,
        parse_dates=['started_at', 'ended_at'],
    )
    # deep=True also counts the Python string payloads, not just the pointers
    size_mb = trips.memory_usage(deep=True).sum() / 1e+6
    print (filename, size_mb, " Mbytes")
    return trips

## Load the list of trip data file names generated in the Prepare stage
## (original note: trip data files from April 2020 to Sep 2022).
## file_list: the file names as an array, in the order they appear in the CSV

file_list_df = pd.read_csv(
    'file_list_2020.csv',
    names=['filename'],
    header=None,  # the list file has no header row; every line is a file name
)
file_list = file_list_df['filename'].values

## This is where we merge all the relevant csv files into one big dataframe

In [4]:
# Big merged data frame
#
# Read every monthly trip file, then concatenate once at the end.
# Calling pd.concat inside the loop copies all rows accumulated so far on
# every pass, so the total work grows quadratically with the number of files.

monthly_frames = []
for i, filename in enumerate(file_list):
    print (i)  # progress counter: position of the file in file_list
    monthly_frames.append(read_csv_to_df(filename, dtypes, usecols))

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when there are no files. Row labels are kept as read from each file (no
# ignore_index), matching what the incremental concat produced.
dfbig = pd.concat(monthly_frames, axis=0) if monthly_frames else pd.DataFrame()

# Deep memory footprint of the merged frame, in megabytes
print (dfbig.memory_usage(deep=True).sum()/1e+6, 'M')


0
202004-divvy-tripdata.csv 11.632501  Mbytes
1
202005-divvy-tripdata.csv 26.567287  Mbytes
2
202006-divvy-tripdata.csv 45.082303  Mbytes
3
202007-divvy-tripdata.csv 71.798705  Mbytes
4
202008-divvy-tripdata.csv 76.877313  Mbytes
5
202009-divvy-tripdata.csv 63.743122  Mbytes
6
202010-divvy-tripdata.csv 43.52825  Mbytes
7
202011-divvy-tripdata.csv 28.716342  Mbytes
8
202012-divvy-tripdata.csv 14.700259  Mbytes
9
202101-divvy-tripdata.csv 10.885932  Mbytes
10
202102-divvy-tripdata.csv 5.838502  Mbytes
11
202103-divvy-tripdata.csv 26.098962  Mbytes
12
202104-divvy-tripdata.csv 37.96599  Mbytes
13
202105-divvy-tripdata.csv 58.069643  Mbytes
14
202106-divvy-tripdata.csv 78.234735  Mbytes
15
202107-divvy-tripdata.csv 88.855429  Mbytes
16
202108-divvy-tripdata.csv 86.885713  Mbytes
17
202109-divvy-tripdata.csv 79.749357  Mbytes
18
202110-divvy-tripdata.csv 61.230149  Mbytes
19
202111-divvy-tripdata.csv 32.718523  Mbytes
20
202112-divvy-tripdata.csv 22.421137  Mbytes
21
202201-divvy-tripdata.c

In [5]:
# Display the column dtypes of the merged frame, to check that the category
# and datetime columns still carry those dtypes after concatenation.
dfbig.dtypes

ride_id                       object
rideable_type               category
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name          category
start_station_id            category
end_station_name            category
end_station_id              category
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual               category
dtype: object

In [6]:
# Write the merged frame to dfbig.csv in the current working directory.
# NOTE(review): to_csv writes the index by default as an unnamed first column.
# The frames were concatenated without ignore_index, so that column holds each
# file's own row labels and is presumably not unique - confirm whether
# downstream readers rely on it before switching to index=False.
# NOTE(review): CSV does not store dtypes, so the category/datetime columns
# have to be re-declared when this file is read back.
dfbig.to_csv('dfbig.csv')