In [135]:
import pandas as pd
import numpy as np
import uuid
import re 

# LOADING CSV TO DF

In [136]:
csvs = []

for n in range(1,13):
    filename = f'./bike-rental-starter-kit/data/JC-2016{str(n).zfill(2)}-citibike-tripdata.csv'
    df = pd.read_csv(filename)
    csvs.append(df)

weather = pd.read_csv('./bike-rental-starter-kit/data/newark_airport_2016.csv')

tripdata = pd.concat(csvs,ignore_index=True)

In [137]:

# Define a function to convert a string to snake case
def snake_case(s):
    return '_'.join(
        sub('([A-Z][a-z]+)', r' \1',
        sub('([A-Z]+)', r' \1',
        s.replace('-', ' '))).split()).lower()


# Normalizing

In [138]:

## TRIPDATA ------------

# Parsing int64 in Int64 to normalize .info()
for col in tripdata.columns:
    if tripdata[col].dtypes == 'int64':
        tripdata[col] = tripdata[col].astype('Int64')

# Normalize str to title format 
for col in tripdata.columns:
    if tripdata[col].dtypes == 'object':
        tripdata[col] = tripdata[col].str.title()

# Parsing Float64 in Int64 to normalize .info()
tripdata['Birth Year'] = tripdata['Birth Year'].astype('Int64')

# Erasing birth date for people birth before 1916
tripdata[tripdata['Birth Year']<=1916] = None

# Mapping Int64 to str values
tripdata['Gender'] = tripdata['Gender'].map({2:'female', 1:'male'})

# Keeping only trip duration under 7days long
tripdata = tripdata[tripdata['Trip Duration'] <= 60*60*24*7]


# Creating an UUID for tripdata
tripdata['Ride ID'] = [str(uuid.uuid4()) for _ in range(len(tripdata))]

# Mapping User Type to Casual/Member

tripdata['User Type'] = tripdata['User Type'].map({'Subscriber':'Casual', 'Customer':'Member'})


tripdata.rename(columns={
    'Trip Duration':'Trip Duration', #KEEP
    'Start Time':'Started at',
    'Stop Time':'Ended at',
    'Start Station ID':'Start station ID',
    'Start Station Name':'Start station name',
    'Start Station Latitude':'Start latitude',
    'Start Station Longitude':'Start longitude',
    'End Station ID':'End station ID',
    'End Station Name':'End station name',
    'End Station Latitude':'End latitude',
    'End Station Longitude':'End Longitude',
    'Bike ID':'Bike ID', #KEEP 
    'User Type':'User Type', #KEEP Name -- Member or casual ride
    'Birth Year':'Birth Year', #KEEP
    'Gender':'Gender', #KEEP
    'Ride ID':'Ride ID' #Created
    # Missing -- Rideable type
})

weather.rename(columns={
    'station':'Station ID'
    ###
})

# Snake Casing the columns name
tripdata.columns = [snake_case(column) for column in tripdata.columns]


## WEATHER -------------

# Splitting Name/Region from intial Name column
weather['SPLIT'] = weather['NAME'].str.split(',')

if 'REGION' in weather.columns:
    pass
else:
    weather['NAME'] = weather['SPLIT'].str[0]
    weather['REGION'] = weather['SPLIT'].str[1]

# Delete SPLIT if exists
if 'SPLIT' in weather.columns:
    weather.drop('SPLIT', axis=1, inplace=True)
else:
    pass

# Normalize str to title format 
weather['NAME'] = weather['NAME'].str.title()

# Drop columns where all values are NaN
for col in weather.columns:
    if weather[col].isnull().sum() == len(weather[col]):
        weather.drop(col,axis=1, inplace=True)

# Adding Lat/Long for weather df
weather['Latitude'] = 40.689531
weather['Longitude'] =-74.174462

# Snake Casing the columns name
weather.columns = [snake_case(column) for column in weather.columns]


In [139]:
weather.head(10)

Unnamed: 0,station,name,date,awnd,prcp,snow,snwd,tavg,tmax,tmin,wdf2,wdf5,wsf2,wsf5,region,latitude,longitude
0,USW00014734,Newark Liberty International Airport,2016-01-01,12.75,0.0,0.0,0.0,41,43,34,270,280.0,25.9,35.1,NJ US,40.689531,-74.174462
1,USW00014734,Newark Liberty International Airport,2016-01-02,9.4,0.0,0.0,0.0,36,42,30,260,260.0,21.0,25.1,NJ US,40.689531,-74.174462
2,USW00014734,Newark Liberty International Airport,2016-01-03,10.29,0.0,0.0,0.0,37,47,28,270,250.0,23.9,30.0,NJ US,40.689531,-74.174462
3,USW00014734,Newark Liberty International Airport,2016-01-04,17.22,0.0,0.0,0.0,32,35,14,330,330.0,25.9,33.1,NJ US,40.689531,-74.174462
4,USW00014734,Newark Liberty International Airport,2016-01-05,9.84,0.0,0.0,0.0,19,31,10,360,350.0,25.1,31.1,NJ US,40.689531,-74.174462
5,USW00014734,Newark Liberty International Airport,2016-01-06,5.37,0.0,0.0,0.0,28,42,15,230,250.0,12.1,16.1,NJ US,40.689531,-74.174462
6,USW00014734,Newark Liberty International Airport,2016-01-07,3.36,0.0,0.0,0.0,35,46,24,20,360.0,8.9,10.1,NJ US,40.689531,-74.174462
7,USW00014734,Newark Liberty International Airport,2016-01-08,8.05,0.0,0.0,0.0,38,45,31,20,30.0,14.1,16.1,NJ US,40.689531,-74.174462
8,USW00014734,Newark Liberty International Airport,2016-01-09,6.71,0.01,0.0,0.0,44,48,38,60,70.0,13.0,17.0,NJ US,40.689531,-74.174462
9,USW00014734,Newark Liberty International Airport,2016-01-10,15.43,1.77,0.0,0.0,53,65,39,260,270.0,36.0,42.9,NJ US,40.689531,-74.174462


In [140]:
tripdata.head(10)

Unnamed: 0,trip_duration,start_time,stop_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bike_id,user_type,birth_year,gender,ride_id
0,362,2016-01-01 00:02:52,2016-01-01 00:08:54,3186,Grove St Path,40.719586,-74.043117,3209,Brunswick St,40.724176,-74.050656,24647,Casual,1964.0,female,eb239653-bafe-4fc6-94ed-3244850e0c13
1,200,2016-01-01 00:18:22,2016-01-01 00:21:42,3186,Grove St Path,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24605,Casual,1962.0,male,9509176c-5afd-4f7a-85c2-cb42f1139e1f
2,202,2016-01-01 00:18:25,2016-01-01 00:21:47,3186,Grove St Path,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24689,Casual,1962.0,female,9c49ae7e-c8cb-4e49-9159-57aa807c4513
3,248,2016-01-01 00:23:13,2016-01-01 00:27:21,3209,Brunswick St,40.724176,-74.050656,3203,Hamilton Park,40.727596,-74.044247,24693,Casual,1984.0,male,0fc27402-d7c4-49cb-9c37-369cf0ce8d17
4,903,2016-01-01 01:03:20,2016-01-01 01:18:24,3195,Sip Ave,40.730743,-74.063784,3210,Pershing Field,40.742677,-74.051789,24573,Member,,,37495461-426d-421c-b554-2359590f8cd9
5,883,2016-01-01 01:03:28,2016-01-01 01:18:11,3195,Sip Ave,40.730743,-74.063784,3210,Pershing Field,40.742677,-74.051789,24442,Member,,,3228dcda-c228-43a0-b793-ba5f9c378b2e
6,445,2016-01-01 01:07:45,2016-01-01 01:15:11,3186,Grove St Path,40.719586,-74.043117,3203,Hamilton Park,40.727596,-74.044247,24510,Casual,1988.0,female,12013b38-93c6-486d-9f84-1ec05fd8a22f
7,192,2016-01-01 01:18:51,2016-01-01 01:22:03,3211,Newark Ave,40.721525,-74.046305,3203,Hamilton Park,40.727596,-74.044247,24625,Casual,1980.0,male,fda51a84-fa2b-4098-afcb-ae89f1addd18
8,409,2016-01-01 01:23:44,2016-01-01 01:30:34,3187,Warren St,40.721124,-74.038051,3214,Essex Light Rail,40.712774,-74.036486,24429,Casual,1990.0,male,ed873a21-dd3a-4fee-818f-56aa5b7fb886
9,285,2016-01-01 01:25:12,2016-01-01 01:29:57,3187,Warren St,40.721124,-74.038051,3214,Essex Light Rail,40.712774,-74.036486,24407,Casual,1988.0,female,bb7191c6-4483-41ad-876a-62a7b11e655e


In [141]:
tripdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247569 entries, 0 to 247583
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   trip_duration            247569 non-null  Int64  
 1   start_time               247569 non-null  object 
 2   stop_time                247569 non-null  object 
 3   start_station_id         247569 non-null  Int64  
 4   start_station_name       247569 non-null  object 
 5   start_station_latitude   247569 non-null  float64
 6   start_station_longitude  247569 non-null  float64
 7   end_station_id           247569 non-null  Int64  
 8   end_station_name         247569 non-null  object 
 9   end_station_latitude     247569 non-null  float64
 10  end_station_longitude    247569 non-null  float64
 11  bike_id                  247569 non-null  Int64  
 12  user_type                247189 non-null  object 
 13  birth_year               228574 non-null  Int64  
 14  gend

In [142]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   station    366 non-null    object 
 1   name       366 non-null    object 
 2   date       366 non-null    object 
 3   awnd       366 non-null    float64
 4   prcp       366 non-null    float64
 5   snow       366 non-null    float64
 6   snwd       366 non-null    float64
 7   tavg       366 non-null    int64  
 8   tmax       366 non-null    int64  
 9   tmin       366 non-null    int64  
 10  wdf2       366 non-null    int64  
 11  wdf5       364 non-null    float64
 12  wsf2       366 non-null    float64
 13  wsf5       364 non-null    float64
 14  region     366 non-null    object 
 15  latitude   366 non-null    float64
 16  longitude  366 non-null    float64
dtypes: float64(9), int64(4), object(4)
memory usage: 48.7+ KB


In [143]:
tripdata.describe()

Unnamed: 0,trip_duration,start_station_id,start_station_latitude,start_station_longitude,end_station_id,end_station_latitude,end_station_longitude,bike_id,birth_year
count,247569.0,247569.0,247569.0,247569.0,247569.0,247569.0,247569.0,247569.0,228574.0
mean,733.235546,3207.064782,40.723121,-74.046438,3203.571336,40.722594,-74.045855,24935.269448,1979.335707
std,3686.68033,26.954622,0.008198,0.011211,61.579176,0.007958,0.011283,748.47928,9.595408
min,61.0,3183.0,40.69264,-74.096937,147.0,40.692216,-74.096937,14552.0,1934.0
25%,248.0,3186.0,40.717732,-74.050656,3186.0,40.71654,-74.050444,24491.0,1974.0
50%,390.0,3201.0,40.721525,-74.044247,3199.0,40.721124,-74.043117,24609.0,1981.0
75%,666.0,3211.0,40.727596,-74.038051,3211.0,40.727224,-74.036486,24719.0,1986.0
max,488819.0,3426.0,40.752559,-74.032108,3426.0,40.801343,-73.95739,27274.0,2000.0


In [144]:
weather.describe()

Unnamed: 0,awnd,prcp,snow,snwd,tavg,tmax,tmin,wdf2,wdf5,wsf2,wsf5,latitude,longitude
count,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,364.0,366.0,364.0,366.0,366.0
mean,9.429973,0.104945,0.098087,0.342623,57.196721,65.991803,48.459016,217.84153,228.269231,20.484426,26.801648,40.68953,-74.17446
std,3.748174,0.307496,1.276498,2.07851,17.466981,18.606301,17.13579,102.548282,97.415777,6.84839,8.88261,7.115154e-15,1.423031e-14
min,2.46,0.0,0.0,0.0,8.0,18.0,0.0,10.0,10.0,6.9,10.1,40.68953,-74.17446
25%,6.765,0.0,0.0,0.0,43.0,51.25,35.0,150.0,150.0,15.0,19.9,40.68953,-74.17446
50%,8.72,0.0,0.0,0.0,56.0,66.0,47.0,240.0,260.0,19.9,25.1,40.68953,-74.17446
75%,11.41,0.03,0.0,0.0,74.0,83.0,64.0,300.0,300.0,23.9,31.1,40.68953,-74.17446
max,22.82,2.79,24.0,20.1,89.0,99.0,80.0,360.0,360.0,48.1,66.0,40.68953,-74.17446
