### Import libraries

In [133]:
import pandas as pd
import os

### Read and concat citibike data in one dataframe

In [134]:
citi_bike_dir = './data'
files = []

for file in os.listdir(citi_bike_dir):
    with open(os.path.join(citi_bike_dir, file)) as f:
        if 'citibike' in file:
            dataframe = pd.read_csv(f'{citi_bike_dir}/{file}')
            files.append(dataframe)

citi_bike_df = pd.concat(files, ignore_index=True)

### Transform and clean data

In [135]:
citi_bike_df = citi_bike_df.rename(columns={
                                            'Trip Duration':'trip_duration',
                                            'Start Time':'start_time',
                                            'Stop Time':'stop_time',
                                            'Start Station ID':'start_station_id',
                                            'Start Station Name':'start_station_name',
                                            'Start Station Latitude':'start_station_latitude',
                                            'Start Station Longitude':'start_station_longitude',
                                            'End Station ID':'end_station_id',
                                            'End Station Name':'end_station_name',
                                            'End Station Latitude':'start_station_latitude',
                                            'End Station Longitude':'start_station_longitude',
                                            'Bike ID':'bike_id',
                                            'User Type':'user_type',
                                            'Birth Year':'birth_year',
                                            'Gender':'gender'
})

In [136]:
citi_bike_df.head()

Unnamed: 0,trip_duration,start_time,stop_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,start_station_latitude.1,start_station_longitude.1,bike_id,user_type,birth_year,gender
0,362,2016-01-01 00:02:52,2016-01-01 00:08:54,3186,Grove St PATH,40.719586,-74.043117,3209,Brunswick St,40.724176,-74.050656,24647,Subscriber,1964.0,2
1,200,2016-01-01 00:18:22,2016-01-01 00:21:42,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24605,Subscriber,1962.0,1
2,202,2016-01-01 00:18:25,2016-01-01 00:21:47,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24689,Subscriber,1962.0,2
3,248,2016-01-01 00:23:13,2016-01-01 00:27:21,3209,Brunswick St,40.724176,-74.050656,3203,Hamilton Park,40.727596,-74.044247,24693,Subscriber,1984.0,1
4,903,2016-01-01 01:03:20,2016-01-01 01:18:24,3195,Sip Ave,40.730743,-74.063784,3210,Pershing Field,40.742677,-74.051789,24573,Customer,,0


In [137]:
citi_bike_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247584 entries, 0 to 247583
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   trip_duration            247584 non-null  int64  
 1   start_time               247584 non-null  object 
 2   stop_time                247584 non-null  object 
 3   start_station_id         247584 non-null  int64  
 4   start_station_name       247584 non-null  object 
 5   start_station_latitude   247584 non-null  float64
 6   start_station_longitude  247584 non-null  float64
 7   end_station_id           247584 non-null  int64  
 8   end_station_name         247584 non-null  object 
 9   start_station_latitude   247584 non-null  float64
 10  start_station_longitude  247584 non-null  float64
 11  bike_id                  247584 non-null  int64  
 12  user_type                247204 non-null  object 
 13  birth_year               228585 non-null  float64
 14  gend

In [138]:
citi_bike_df.isna().sum()

trip_duration                  0
start_time                     0
stop_time                      0
start_station_id               0
start_station_name             0
start_station_latitude         0
start_station_longitude        0
end_station_id                 0
end_station_name               0
start_station_latitude         0
start_station_longitude        0
bike_id                        0
user_type                    380
birth_year                 18999
gender                         0
dtype: int64

In [139]:
citi_bike_df.describe()

Unnamed: 0,trip_duration,start_station_id,start_station_latitude,start_station_longitude,end_station_id,start_station_latitude.1,start_station_longitude.1,bike_id,birth_year,gender
count,247584.0,247584.0,247584.0,247584.0,247584.0,247584.0,247584.0,247584.0,228585.0,247584.0
mean,885.6305,3207.065206,40.723121,-74.046438,3203.572553,40.722594,-74.045855,24935.260481,1979.335276,1.123534
std,35937.98,26.955103,0.008199,0.011211,61.579494,0.007958,0.011283,748.469712,9.596809,0.518687
min,61.0,3183.0,40.69264,-74.096937,147.0,40.692216,-74.096937,14552.0,1900.0,0.0
25%,248.0,3186.0,40.717732,-74.050656,3186.0,40.71654,-74.050444,24491.0,1974.0,1.0
50%,390.0,3201.0,40.721525,-74.044247,3199.0,40.721124,-74.043117,24609.0,1981.0,1.0
75%,666.0,3211.0,40.727596,-74.038051,3211.0,40.727224,-74.036486,24719.0,1986.0,1.0
max,16329810.0,3426.0,40.752559,-74.032108,3426.0,40.801343,-73.95739,27274.0,2000.0,2.0


In [140]:
citi_bike_df['user_type'].unique()

array(['Subscriber', 'Customer', nan], dtype=object)

In [141]:
citi_bike_df['user_type'] = citi_bike_df['user_type'].fillna('Unspecified')

In [142]:
citi_bike_df['user_type'].unique()

array(['Subscriber', 'Customer', 'Unspecified'], dtype=object)

In [143]:
citi_bike_df['birth_year'] = citi_bike_df['birth_year'].fillna(0).astype(int)

In [144]:
citi_bike_df.isna().sum()

trip_duration              0
start_time                 0
stop_time                  0
start_station_id           0
start_station_name         0
start_station_latitude     0
start_station_longitude    0
end_station_id             0
end_station_name           0
start_station_latitude     0
start_station_longitude    0
bike_id                    0
user_type                  0
birth_year                 0
gender                     0
dtype: int64

In [145]:
citi_bike_df.head()

Unnamed: 0,trip_duration,start_time,stop_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,start_station_latitude.1,start_station_longitude.1,bike_id,user_type,birth_year,gender
0,362,2016-01-01 00:02:52,2016-01-01 00:08:54,3186,Grove St PATH,40.719586,-74.043117,3209,Brunswick St,40.724176,-74.050656,24647,Subscriber,1964,2
1,200,2016-01-01 00:18:22,2016-01-01 00:21:42,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24605,Subscriber,1962,1
2,202,2016-01-01 00:18:25,2016-01-01 00:21:47,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24689,Subscriber,1962,2
3,248,2016-01-01 00:23:13,2016-01-01 00:27:21,3209,Brunswick St,40.724176,-74.050656,3203,Hamilton Park,40.727596,-74.044247,24693,Subscriber,1984,1
4,903,2016-01-01 01:03:20,2016-01-01 01:18:24,3195,Sip Ave,40.730743,-74.063784,3210,Pershing Field,40.742677,-74.051789,24573,Customer,0,0


In [146]:
citi_bike_df['gender'].unique()

array([2, 1, 0])

In [147]:
citi_bike_df['gender'] = citi_bike_df['gender'].replace(0,'Unknown').astype(str)
citi_bike_df['gender'] = citi_bike_df['gender'].replace('1','Male').astype(str)
citi_bike_df['gender'] = citi_bike_df['gender'].replace('2','Female').astype(str)


In [148]:
citi_bike_df['gender'].unique()

array(['Female', 'Male', 'Unknown'], dtype=object)

In [149]:
citi_bike_df.head()

Unnamed: 0,trip_duration,start_time,stop_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,start_station_latitude.1,start_station_longitude.1,bike_id,user_type,birth_year,gender
0,362,2016-01-01 00:02:52,2016-01-01 00:08:54,3186,Grove St PATH,40.719586,-74.043117,3209,Brunswick St,40.724176,-74.050656,24647,Subscriber,1964,Female
1,200,2016-01-01 00:18:22,2016-01-01 00:21:42,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24605,Subscriber,1962,Male
2,202,2016-01-01 00:18:25,2016-01-01 00:21:47,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24689,Subscriber,1962,Female
3,248,2016-01-01 00:23:13,2016-01-01 00:27:21,3209,Brunswick St,40.724176,-74.050656,3203,Hamilton Park,40.727596,-74.044247,24693,Subscriber,1984,Male
4,903,2016-01-01 01:03:20,2016-01-01 01:18:24,3195,Sip Ave,40.730743,-74.063784,3210,Pershing Field,40.742677,-74.051789,24573,Customer,0,Unknown


In [151]:
for col in citi_bike_df:
    try:
        if citi_bike_df[col] == 'object':
            citi_bike_df[col] = citi_bike_df[col].str.strip()
    except:
        pass

In [152]:
citi_bike_df.head()

Unnamed: 0,trip_duration,start_time,stop_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,start_station_latitude.1,start_station_longitude.1,bike_id,user_type,birth_year,gender
0,362,2016-01-01 00:02:52,2016-01-01 00:08:54,3186,Grove St PATH,40.719586,-74.043117,3209,Brunswick St,40.724176,-74.050656,24647,Subscriber,1964,Female
1,200,2016-01-01 00:18:22,2016-01-01 00:21:42,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24605,Subscriber,1962,Male
2,202,2016-01-01 00:18:25,2016-01-01 00:21:47,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24689,Subscriber,1962,Female
3,248,2016-01-01 00:23:13,2016-01-01 00:27:21,3209,Brunswick St,40.724176,-74.050656,3203,Hamilton Park,40.727596,-74.044247,24693,Subscriber,1984,Male
4,903,2016-01-01 01:03:20,2016-01-01 01:18:24,3195,Sip Ave,40.730743,-74.063784,3210,Pershing Field,40.742677,-74.051789,24573,Customer,0,Unknown


### Read weather data