This notebook contains code to serialize the downloaded monthly files for trips from the website for Bluebike demand forecasting.

Monthly zip files were downloaded from the [Bluebike website](https://www.bluebikes.com/system-data).

Csv files were extracted from the zip files and stored in a folder.These were then combined into a file using the below code.<br>
Relevant columns were kept, deleting the rest.<br>
The starttime and endtime datatype was converted to datetime objects<br>
The combined file was then pickled for sharing and using again in future.<br>



In [30]:
from google.colab import drive
drive.mount('/content/drive',force_remount = True)

Mounted at /content/drive


In [68]:
import pandas as pd
import os
import sys

In [69]:
# path to the folder containing monthly files
path = '/content/drive/MyDrive/blue_bikes_sales_prediction/data/monthly_data_for_trips'
all_files = [file for file in os.listdir(path) if file.endswith(".csv")]
all_files = sorted( all_files)
combined_csv = pd.concat( [ pd.read_csv(f'{path}/{f}') for f in all_files ] )


In [70]:
combined_csv.head(3)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,postal code
0,609,2021-05-01 00:00:01.0450,2021-05-01 00:10:10.7300,66,Commonwealth Ave at Griggs St,42.349225,-71.132753,400,Lansdowne T Stop,42.347345,-71.100168,4885,Subscriber,2134
1,632,2021-05-01 00:00:13.0880,2021-05-01 00:10:45.9060,409,Elm St at White St,42.389524,-71.116941,104,Harvard University Radcliffe Quadrangle at She...,42.380287,-71.125107,3844,Subscriber,2144
2,187,2021-05-01 00:00:20.0430,2021-05-01 00:03:27.7480,75,Lafayette Square at Mass Ave / Main St / Colum...,42.363465,-71.100573,178,MIT Pacific St at Purrington St,42.359573,-71.101295,6907,Subscriber,2139


In [71]:
combined_csv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 582862 entries, 0 to 311968
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   tripduration             582862 non-null  int64  
 1   starttime                582862 non-null  object 
 2   stoptime                 582862 non-null  object 
 3   start station id         582862 non-null  int64  
 4   start station name       582862 non-null  object 
 5   start station latitude   582862 non-null  float64
 6   start station longitude  582862 non-null  float64
 7   end station id           582862 non-null  int64  
 8   end station name         582862 non-null  object 
 9   end station latitude     582862 non-null  float64
 10  end station longitude    582862 non-null  float64
 11  bikeid                   582862 non-null  int64  
 12  usertype                 582862 non-null  object 
 13  postal code              536434 non-null  object 
dtypes: f

In [72]:
## deleting the postal address
combined_csv.drop(['postal code'], inplace=True, axis= 1)

In [73]:
# convertinting starttime and stoptime to datetime objects
combined_csv['starttime'] = pd.to_datetime(combined_csv['starttime'])
combined_csv['stoptime'] = pd.to_datetime(combined_csv['stoptime'])
combined_csv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 582862 entries, 0 to 311968
Data columns (total 13 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   tripduration             582862 non-null  int64         
 1   starttime                582862 non-null  datetime64[ns]
 2   stoptime                 582862 non-null  datetime64[ns]
 3   start station id         582862 non-null  int64         
 4   start station name       582862 non-null  object        
 5   start station latitude   582862 non-null  float64       
 6   start station longitude  582862 non-null  float64       
 7   end station id           582862 non-null  int64         
 8   end station name         582862 non-null  object        
 9   end station latitude     582862 non-null  float64       
 10  end station longitude    582862 non-null  float64       
 11  bikeid                   582862 non-null  int64         
 12  usertype        

In [74]:
file_name = 'all_data.pkl'
pkl_path = f'{path}/{file_name}'
combined_csv.to_pickle(pkl_path)

In [75]:
test = pd.read_pickle(pkl_path)
test.tail(3)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype
311966,1231,2021-06-30 23:59:43.219,2021-07-01 00:20:14.820,67,MIT at Mass Ave / Amherst St,42.3581,-71.093198,356,Charlestown Navy Yard,42.374125,-71.054812,2824,Subscriber
311967,738,2021-06-30 23:59:56.106,2021-07-01 00:12:14.641,5,Northeastern University - North Parking Lot,42.341814,-71.090179,52,Newbury St at Hereford St,42.348717,-71.085954,2512,Subscriber
311968,515,2021-06-30 23:59:58.556,2021-07-01 00:08:34.131,271,Ashmont T Stop,42.285694,-71.064139,260,Fields Corner T Stop,42.300664,-71.060295,3890,Subscriber


In [76]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 582862 entries, 0 to 311968
Data columns (total 13 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   tripduration             582862 non-null  int64         
 1   starttime                582862 non-null  datetime64[ns]
 2   stoptime                 582862 non-null  datetime64[ns]
 3   start station id         582862 non-null  int64         
 4   start station name       582862 non-null  object        
 5   start station latitude   582862 non-null  float64       
 6   start station longitude  582862 non-null  float64       
 7   end station id           582862 non-null  int64         
 8   end station name         582862 non-null  object        
 9   end station latitude     582862 non-null  float64       
 10  end station longitude    582862 non-null  float64       
 11  bikeid                   582862 non-null  int64         
 12  usertype        