### Import libraries

In [69]:
import pandas as pd
import os

### Helper functions

In [70]:
def create_df(directory, keyword):
    """Load and stack every file in ``directory`` whose name contains ``keyword``.

    Parameters
    ----------
    directory : str
        Folder to scan (not recursive).
    keyword : str
        Substring that a file name must contain to be loaded.

    Returns
    -------
    pandas.DataFrame
        All matching files read with ``pd.read_csv`` and concatenated with a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no file in ``directory`` matches ``keyword``.
    """
    # Only regular files are considered, so a sub-directory in the data folder
    # no longer aborts the load. Sorting makes the row order reproducible,
    # because os.listdir returns entries in arbitrary order.
    matching_paths = sorted(
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if keyword in entry and os.path.isfile(os.path.join(directory, entry))
    )
    if not matching_paths:
        raise ValueError(
            f'No files containing {keyword!r} were found in {directory!r}'
        )

    # pd.read_csv opens and closes each file itself; no extra handle is needed.
    frames = [pd.read_csv(path) for path in matching_paths]
    return pd.concat(frames, ignore_index=True)


def remove_whitespaces_df(df):
    """Strip leading/trailing whitespace from the text columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified directly; nothing is returned.

    Notes
    -----
    Only object/string columns are touched. Within those columns, non-string
    cells (numbers, NaN, ...) are left exactly as they are.
    """
    for col in df.columns:
        column = df[col]
        # Test the column's dtype. Comparing the values themselves to 'object'
        # produces a boolean Series whose truth value is ambiguous.
        is_text = (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )
        if not is_text:
            continue
        # Element-wise strip so mixed-type object columns keep their
        # non-string values instead of having them turned into NaN.
        df[col] = column.map(
            lambda cell: cell.strip() if isinstance(cell, str) else cell
        )

### Read, transform and clean citibike data

In [71]:
# Folder holding the raw exports, and the file-name fragment that marks trip files.
citi_bike_dir, citi_bike_keyword = './data', 'citibike'
citi_bike_df = create_df(directory=citi_bike_dir, keyword=citi_bike_keyword)

In [72]:
# Every target name is the source header lower-cased with spaces replaced by
# underscores ('Start Station ID' -> 'start_station_id'), so derive the mapping
# from the list of headers instead of spelling out both sides.
citi_bike_df = citi_bike_df.rename(columns={
    header: header.lower().replace(' ', '_')
    for header in (
        'Trip Duration',
        'Start Time',
        'Stop Time',
        'Start Station ID',
        'Start Station Name',
        'Start Station Latitude',
        'Start Station Longitude',
        'End Station ID',
        'End Station Name',
        'End Station Latitude',
        'End Station Longitude',
        'Bike ID',
        'User Type',
        'Birth Year',
        'Gender',
    )
})

In [73]:
citi_bike_df.head(5000)

Unnamed: 0,trip_duration,start_time,stop_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bike_id,user_type,birth_year,gender
0,362,2016-01-01 00:02:52,2016-01-01 00:08:54,3186,Grove St PATH,40.719586,-74.043117,3209,Brunswick St,40.724176,-74.050656,24647,Subscriber,1964.0,2
1,200,2016-01-01 00:18:22,2016-01-01 00:21:42,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24605,Subscriber,1962.0,1
2,202,2016-01-01 00:18:25,2016-01-01 00:21:47,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24689,Subscriber,1962.0,2
3,248,2016-01-01 00:23:13,2016-01-01 00:27:21,3209,Brunswick St,40.724176,-74.050656,3203,Hamilton Park,40.727596,-74.044247,24693,Subscriber,1984.0,1
4,903,2016-01-01 01:03:20,2016-01-01 01:18:24,3195,Sip Ave,40.730743,-74.063784,3210,Pershing Field,40.742677,-74.051789,24573,Customer,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,565,2016-01-16 22:20:18,2016-01-16 22:29:44,3202,Newport PATH,40.727224,-74.033759,3211,Newark Ave,40.721525,-74.046305,24544,Subscriber,1983.0,2
4996,85,2016-01-16 22:21:09,2016-01-16 22:22:35,3202,Newport PATH,40.727224,-74.033759,3199,Newport Pkwy,40.728745,-74.032108,24572,Subscriber,1976.0,1
4997,344,2016-01-16 22:22:20,2016-01-16 22:28:04,3187,Warren St,40.721124,-74.038051,3199,Newport Pkwy,40.728745,-74.032108,24501,Subscriber,1979.0,2
4998,328,2016-01-16 22:22:32,2016-01-16 22:28:00,3187,Warren St,40.721124,-74.038051,3199,Newport Pkwy,40.728745,-74.032108,24407,Subscriber,1976.0,1


In [74]:
citi_bike_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247584 entries, 0 to 247583
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   trip_duration            247584 non-null  int64  
 1   start_time               247584 non-null  object 
 2   stop_time                247584 non-null  object 
 3   start_station_id         247584 non-null  int64  
 4   start_station_name       247584 non-null  object 
 5   start_station_latitude   247584 non-null  float64
 6   start_station_longitude  247584 non-null  float64
 7   end_station_id           247584 non-null  int64  
 8   end_station_name         247584 non-null  object 
 9   end_station_latitude     247584 non-null  float64
 10  end_station_longitude    247584 non-null  float64
 11  bike_id                  247584 non-null  int64  
 12  user_type                247204 non-null  object 
 13  birth_year               228585 non-null  float64
 14  gend

In [75]:
# Parse both trip timestamps from text into datetime64 columns.
citi_bike_df = citi_bike_df.assign(
    start_time=pd.to_datetime(citi_bike_df['start_time']),
    stop_time=pd.to_datetime(citi_bike_df['stop_time']),
)

In [76]:
citi_bike_df.head()

Unnamed: 0,trip_duration,start_time,stop_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bike_id,user_type,birth_year,gender
0,362,2016-01-01 00:02:52,2016-01-01 00:08:54,3186,Grove St PATH,40.719586,-74.043117,3209,Brunswick St,40.724176,-74.050656,24647,Subscriber,1964.0,2
1,200,2016-01-01 00:18:22,2016-01-01 00:21:42,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24605,Subscriber,1962.0,1
2,202,2016-01-01 00:18:25,2016-01-01 00:21:47,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24689,Subscriber,1962.0,2
3,248,2016-01-01 00:23:13,2016-01-01 00:27:21,3209,Brunswick St,40.724176,-74.050656,3203,Hamilton Park,40.727596,-74.044247,24693,Subscriber,1984.0,1
4,903,2016-01-01 01:03:20,2016-01-01 01:18:24,3195,Sip Ave,40.730743,-74.063784,3210,Pershing Field,40.742677,-74.051789,24573,Customer,,0


In [77]:
citi_bike_df.isna().sum()

trip_duration                  0
start_time                     0
stop_time                      0
start_station_id               0
start_station_name             0
start_station_latitude         0
start_station_longitude        0
end_station_id                 0
end_station_name               0
end_station_latitude           0
end_station_longitude          0
bike_id                        0
user_type                    380
birth_year                 18999
gender                         0
dtype: int64

In [78]:
citi_bike_df.describe()

Unnamed: 0,trip_duration,start_time,stop_time,start_station_id,start_station_latitude,start_station_longitude,end_station_id,end_station_latitude,end_station_longitude,bike_id,birth_year,gender
count,247584.0,247584,247584,247584.0,247584.0,247584.0,247584.0,247584.0,247584.0,247584.0,228585.0,247584.0
mean,885.6305,2016-07-29 05:55:07.541335040,2016-07-29 06:09:53.671073536,3207.065206,40.723121,-74.046438,3203.572553,40.722594,-74.045855,24935.260481,1979.335276,1.123534
min,61.0,2016-01-01 00:02:52,2016-01-01 00:08:54,3183.0,40.69264,-74.096937,147.0,40.692216,-74.096937,14552.0,1900.0,0.0
25%,248.0,2016-05-27 07:46:06,2016-05-27 07:54:40.249999872,3186.0,40.717732,-74.050656,3186.0,40.71654,-74.050444,24491.0,1974.0,1.0
50%,390.0,2016-08-10 09:23:50,2016-08-10 09:34:32.500000,3201.0,40.721525,-74.044247,3199.0,40.721124,-74.043117,24609.0,1981.0,1.0
75%,666.0,2016-10-05 17:25:05.500000,2016-10-05 17:33:00.750000128,3211.0,40.727596,-74.038051,3211.0,40.727224,-74.036486,24719.0,1986.0,1.0
max,16329810.0,2016-12-31 23:44:50,2017-01-18 14:26:46,3426.0,40.752559,-74.032108,3426.0,40.801343,-73.95739,27274.0,2000.0,2.0
std,35937.98,,,26.955103,0.008199,0.011211,61.579494,0.007958,0.011283,748.469712,9.596809,0.518687


In [79]:
citi_bike_df['user_type'].unique()

array(['Subscriber', 'Customer', nan], dtype=object)

In [80]:
# Give trips with no recorded user type an explicit label instead of NaN.
citi_bike_df = citi_bike_df.fillna({'user_type': 'Unspecified'})

In [81]:
citi_bike_df['user_type'].unique()

array(['Subscriber', 'Customer', 'Unspecified'], dtype=object)

In [82]:
# Missing birth years become 0 so the column can be stored as a plain integer.
# NOTE(review): 0 is a sentinel, not a real year - exclude it from any
# statistics computed on birth_year.
citi_bike_df = citi_bike_df.assign(
    birth_year=citi_bike_df['birth_year'].fillna(0).astype(int)
)

In [83]:
citi_bike_df.isna().sum()

trip_duration              0
start_time                 0
stop_time                  0
start_station_id           0
start_station_name         0
start_station_latitude     0
start_station_longitude    0
end_station_id             0
end_station_name           0
end_station_latitude       0
end_station_longitude      0
bike_id                    0
user_type                  0
birth_year                 0
gender                     0
dtype: int64

In [84]:
citi_bike_df.head()

Unnamed: 0,trip_duration,start_time,stop_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bike_id,user_type,birth_year,gender
0,362,2016-01-01 00:02:52,2016-01-01 00:08:54,3186,Grove St PATH,40.719586,-74.043117,3209,Brunswick St,40.724176,-74.050656,24647,Subscriber,1964,2
1,200,2016-01-01 00:18:22,2016-01-01 00:21:42,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24605,Subscriber,1962,1
2,202,2016-01-01 00:18:25,2016-01-01 00:21:47,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24689,Subscriber,1962,2
3,248,2016-01-01 00:23:13,2016-01-01 00:27:21,3209,Brunswick St,40.724176,-74.050656,3203,Hamilton Park,40.727596,-74.044247,24693,Subscriber,1984,1
4,903,2016-01-01 01:03:20,2016-01-01 01:18:24,3195,Sip Ave,40.730743,-74.063784,3210,Pershing Field,40.742677,-74.051789,24573,Customer,0,0


In [85]:
citi_bike_df['gender'].unique()

array([2, 1, 0])

In [86]:
# Turn the integer gender codes (0, 1, 2) into readable labels in one pass:
# stringify the codes first, then swap each code for its label.
gender_labels = {'0': 'Unknown', '1': 'Male', '2': 'Female'}
citi_bike_df['gender'] = citi_bike_df['gender'].astype(str).replace(gender_labels)


In [87]:
citi_bike_df['gender'].unique()

array(['Female', 'Male', 'Unknown'], dtype=object)

In [88]:
citi_bike_df.head()

Unnamed: 0,trip_duration,start_time,stop_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bike_id,user_type,birth_year,gender
0,362,2016-01-01 00:02:52,2016-01-01 00:08:54,3186,Grove St PATH,40.719586,-74.043117,3209,Brunswick St,40.724176,-74.050656,24647,Subscriber,1964,Female
1,200,2016-01-01 00:18:22,2016-01-01 00:21:42,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24605,Subscriber,1962,Male
2,202,2016-01-01 00:18:25,2016-01-01 00:21:47,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24689,Subscriber,1962,Female
3,248,2016-01-01 00:23:13,2016-01-01 00:27:21,3209,Brunswick St,40.724176,-74.050656,3203,Hamilton Park,40.727596,-74.044247,24693,Subscriber,1984,Male
4,903,2016-01-01 01:03:20,2016-01-01 01:18:24,3195,Sip Ave,40.730743,-74.063784,3210,Pershing Field,40.742677,-74.051789,24573,Customer,0,Unknown


In [89]:
remove_whitespaces_df(citi_bike_df)

In [90]:
citi_bike_df.head(50000)

Unnamed: 0,trip_duration,start_time,stop_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bike_id,user_type,birth_year,gender
0,362,2016-01-01 00:02:52,2016-01-01 00:08:54,3186,Grove St PATH,40.719586,-74.043117,3209,Brunswick St,40.724176,-74.050656,24647,Subscriber,1964,Female
1,200,2016-01-01 00:18:22,2016-01-01 00:21:42,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24605,Subscriber,1962,Male
2,202,2016-01-01 00:18:25,2016-01-01 00:21:47,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24689,Subscriber,1962,Female
3,248,2016-01-01 00:23:13,2016-01-01 00:27:21,3209,Brunswick St,40.724176,-74.050656,3203,Hamilton Park,40.727596,-74.044247,24693,Subscriber,1984,Male
4,903,2016-01-01 01:03:20,2016-01-01 01:18:24,3195,Sip Ave,40.730743,-74.063784,3210,Pershing Field,40.742677,-74.051789,24573,Customer,0,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,160,2016-05-10 05:50:09,2016-05-10 05:52:49,3213,Van Vorst Park,40.718489,-74.047727,3186,Grove St PATH,40.719586,-74.043117,24453,Subscriber,1989,Male
49996,538,2016-05-10 05:53:30,2016-05-10 06:02:29,3192,Liberty Light Rail,40.711242,-74.055701,3183,Exchange Place,40.716247,-74.033459,24568,Subscriber,1960,Male
49997,613,2016-05-10 05:53:53,2016-05-10 06:04:06,3191,Union St,40.718211,-74.083639,3195,Sip Ave,40.730743,-74.063784,24469,Customer,0,Unknown
49998,154,2016-05-10 06:00:02,2016-05-10 06:02:36,3214,Essex Light Rail,40.712774,-74.036486,3183,Exchange Place,40.716247,-74.033459,24551,Subscriber,1974,Male


### Separate citibike data

In [91]:
# Distinct start stations, with the 'start_' prefix removed from the column
# names so the frame uses generic station_* headers.
start_stations_df = (
    citi_bike_df[[
        'start_station_id',
        'start_station_name',
        'start_station_latitude',
        'start_station_longitude',
    ]]
    .drop_duplicates()
    .rename(columns=lambda label: label[len('start_'):])
)
start_stations_df.head()

Unnamed: 0,station_id,station_name,station_latitude,station_longitude
0,3186,Grove St PATH,40.719586,-74.043117
3,3209,Brunswick St,40.724176,-74.050656
4,3195,Sip Ave,40.730743,-74.063784
7,3211,Newark Ave,40.721525,-74.046305
8,3187,Warren St,40.721124,-74.038051


In [92]:
# Distinct end stations, with the 'end_' prefix removed from the column names
# so the frame uses generic station_* headers.
end_stations_df = (
    citi_bike_df[[
        'end_station_id',
        'end_station_name',
        'end_station_latitude',
        'end_station_longitude',
    ]]
    .drop_duplicates()
    .rename(columns=lambda label: label[len('end_'):])
)
end_stations_df.head()

Unnamed: 0,station_id,station_name,station_latitude,station_longitude
0,3209,Brunswick St,40.724176,-74.050656
1,3213,Van Vorst Park,40.718489,-74.047727
3,3203,Hamilton Park,40.727596,-74.044247
4,3210,Pershing Field,40.742677,-74.051789
8,3214,Essex Light Rail,40.712774,-74.036486


In [93]:
# Stack the start and end station lists and keep the first occurrence of each
# distinct (id, name, latitude, longitude) row. Original row labels are kept.
# NOTE(review): de-duplication is on the full row, so station_id is only unique
# if a station never appears with differing name/coordinates - verify before
# using it as a key.
bike_stations_df = pd.concat(
    (start_stations_df, end_stations_df),
    axis=0,
).drop_duplicates(keep='first')
bike_stations_df.head()

Unnamed: 0,station_id,station_name,station_latitude,station_longitude
0,3186,Grove St PATH,40.719586,-74.043117
3,3209,Brunswick St,40.724176,-74.050656
4,3195,Sip Ave,40.730743,-74.063784
7,3211,Newark Ave,40.721525,-74.046305
8,3187,Warren St,40.721124,-74.038051


In [94]:
# Station names and coordinates now live in bike_stations_df; keep only the
# station ids on the trip table.
citi_bike_df = citi_bike_df.drop(columns=[
    'start_station_name',
    'start_station_latitude',
    'start_station_longitude',
    'end_station_name',
    'end_station_latitude',
    'end_station_longitude',
])
citi_bike_df.head()

Unnamed: 0,trip_duration,start_time,stop_time,start_station_id,end_station_id,bike_id,user_type,birth_year,gender
0,362,2016-01-01 00:02:52,2016-01-01 00:08:54,3186,3209,24647,Subscriber,1964,Female
1,200,2016-01-01 00:18:22,2016-01-01 00:21:42,3186,3213,24605,Subscriber,1962,Male
2,202,2016-01-01 00:18:25,2016-01-01 00:21:47,3186,3213,24689,Subscriber,1962,Female
3,248,2016-01-01 00:23:13,2016-01-01 00:27:21,3209,3203,24693,Subscriber,1984,Male
4,903,2016-01-01 01:03:20,2016-01-01 01:18:24,3195,3210,24573,Customer,0,Unknown


### Read, transform and clean weather data

In [95]:
# Weather exports share the data folder with the trip files; select them by name
# and run the whitespace-cleanup helper on the freshly loaded frame.
weather_dir, weather_keyword = './data', 'newark'
weather_df = create_df(directory=weather_dir, keyword=weather_keyword)
remove_whitespaces_df(weather_df)

In [96]:
# Lower-case every column header in a single rename. This avoids rebuilding the
# frame once per column and does not leave a stray loop variable behind.
weather_df = weather_df.rename(columns=str.lower)

In [97]:
# Replace the terse weather codes with descriptive column names.
# NOTE(review): the frame has one row per date, so the tavg/tmax/tmin columns
# look like daily values - confirm that 'hourly' in the new names is intended.
weather_df = weather_df.rename(columns=dict(
    awnd='avg_daily_wind_speed',
    pgtm='peak_gust_time',
    prcp='precipitation',
    snwd='snow_depth',
    tavg='avg_hourly_temp',
    tmax='max_hourly_temp',
    tmin='min_hourly_temp',
    tsun='daily_sun_hours',
    wdf2='fastest_2_min_wind_dir',
    wdf5='fastest_5_sec_wind_dir',
    wsf2='fastest_2_min_wind_speed',
    wsf5='fastest_5_sec_wind_speed',
))
weather_df.head()


Unnamed: 0,station,name,date,avg_daily_wind_speed,peak_gust_time,precipitation,snow,snow_depth,avg_hourly_temp,max_hourly_temp,min_hourly_temp,daily_sun_hours,fastest_2_min_wind_dir,fastest_5_sec_wind_dir,fastest_2_min_wind_speed,fastest_5_sec_wind_speed
0,USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",2016-01-01,12.75,,0.0,0.0,0.0,41,43,34,,270,280.0,25.9,35.1
1,USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",2016-01-02,9.4,,0.0,0.0,0.0,36,42,30,,260,260.0,21.0,25.1
2,USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",2016-01-03,10.29,,0.0,0.0,0.0,37,47,28,,270,250.0,23.9,30.0
3,USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",2016-01-04,17.22,,0.0,0.0,0.0,32,35,14,,330,330.0,25.9,33.1
4,USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",2016-01-05,9.84,,0.0,0.0,0.0,19,31,10,,360,350.0,25.1,31.1


In [98]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   station                   366 non-null    object 
 1   name                      366 non-null    object 
 2   date                      366 non-null    object 
 3   avg_daily_wind_speed      366 non-null    float64
 4   peak_gust_time            0 non-null      float64
 5   precipitation             366 non-null    float64
 6   snow                      366 non-null    float64
 7   snow_depth                366 non-null    float64
 8   avg_hourly_temp           366 non-null    int64  
 9   max_hourly_temp           366 non-null    int64  
 10  min_hourly_temp           366 non-null    int64  
 11  daily_sun_hours           0 non-null      float64
 12  fastest_2_min_wind_dir    366 non-null    int64  
 13  fastest_5_sec_wind_dir    364 non-null    float64
 14  fastest_2_

In [99]:
weather_df.isna().sum()

station                       0
name                          0
date                          0
avg_daily_wind_speed          0
peak_gust_time              366
precipitation                 0
snow                          0
snow_depth                    0
avg_hourly_temp               0
max_hourly_temp               0
min_hourly_temp               0
daily_sun_hours             366
fastest_2_min_wind_dir        0
fastest_5_sec_wind_dir        2
fastest_2_min_wind_speed      0
fastest_5_sec_wind_speed      2
dtype: int64

In [100]:
# Both columns are entirely null (366 of 366 missing), so they carry no information.
weather_df = weather_df.drop(columns=['peak_gust_time', 'daily_sun_hours'])

In [101]:
weather_df.isna().sum()

station                     0
name                        0
date                        0
avg_daily_wind_speed        0
precipitation               0
snow                        0
snow_depth                  0
avg_hourly_temp             0
max_hourly_temp             0
min_hourly_temp             0
fastest_2_min_wind_dir      0
fastest_5_sec_wind_dir      2
fastest_2_min_wind_speed    0
fastest_5_sec_wind_speed    2
dtype: int64

In [102]:
# Two days have no 5-second wind reading; fill direction and speed with 0.
weather_df = weather_df.fillna({
    'fastest_5_sec_wind_dir': 0,
    'fastest_5_sec_wind_speed': 0,
})

In [103]:
weather_df.isna().sum()

station                     0
name                        0
date                        0
avg_daily_wind_speed        0
precipitation               0
snow                        0
snow_depth                  0
avg_hourly_temp             0
max_hourly_temp             0
min_hourly_temp             0
fastest_2_min_wind_dir      0
fastest_5_sec_wind_dir      0
fastest_2_min_wind_speed    0
fastest_5_sec_wind_speed    0
dtype: int64

In [104]:
# Parse the observation date from text into a datetime64 column.
weather_df = weather_df.assign(date=pd.to_datetime(weather_df['date']))

In [105]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   station                   366 non-null    object        
 1   name                      366 non-null    object        
 2   date                      366 non-null    datetime64[ns]
 3   avg_daily_wind_speed      366 non-null    float64       
 4   precipitation             366 non-null    float64       
 5   snow                      366 non-null    float64       
 6   snow_depth                366 non-null    float64       
 7   avg_hourly_temp           366 non-null    int64         
 8   max_hourly_temp           366 non-null    int64         
 9   min_hourly_temp           366 non-null    int64         
 10  fastest_2_min_wind_dir    366 non-null    int64         
 11  fastest_5_sec_wind_dir    366 non-null    float64       
 12  fastest_2_min_wind_spe

In [106]:
# One row per distinct weather station (identifier plus display name).
weather_stations_df = weather_df[['station', 'name']].drop_duplicates()
weather_stations_df.head()


Unnamed: 0,station,name
0,USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US"


### Create DB schema

In [None]:
def _postgres_type(dtype):
    """Translate a pandas/NumPy dtype into a PostgreSQL column type name."""
    if pd.api.types.is_bool_dtype(dtype):
        return "BOOLEAN"
    if pd.api.types.is_datetime64_any_dtype(dtype):
        return "TIMESTAMP"
    if pd.api.types.is_integer_dtype(dtype):
        return "BIGINT"
    if pd.api.types.is_float_dtype(dtype):
        return "DOUBLE PRECISION"
    # object/string columns, and any dtype not recognised above, are stored as text.
    return "TEXT"


def create_table_ddl(df, table_name):
    """Build, print and return a ``CREATE TABLE IF NOT EXISTS`` statement for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names and dtypes define the table columns.
    table_name : str
        Target table name, optionally schema-qualified (e.g. ``'dwh.trips'``).
        It is inserted into the statement verbatim.

    Returns
    -------
    str
        The generated DDL statement. The same text is also printed.

    Notes
    -----
    Column types are chosen by dtype family (boolean, datetime, integer,
    float), so variants such as ``int32`` or ``datetime64[us]`` are handled.
    Any other dtype falls back to ``TEXT``. Column names are emitted unquoted.
    """
    columns = [
        f'{col} {_postgres_type(dtype)}'
        for col, dtype in df.dtypes.items()
    ]

    create_table_sql = f'CREATE TABLE IF NOT EXISTS {table_name} ({",".join(columns)})'
    print(f'{create_table_sql} \n')
    # Return the statement so callers that assign the result get the SQL text.
    return create_table_sql

In [109]:
# Generate the DDL for each warehouse table. create_table_ddl prints every
# statement; each name below keeps whatever that call returns.
create_citibike_info = create_table_ddl(citi_bike_df, 'dwh.citibike_info')
create_bike_stations = create_table_ddl(bike_stations_df, 'dwh.bike_stations')
# Table name corrected from the misspelled 'dwh.weater_info'.
create_weather_info = create_table_ddl(weather_df, 'dwh.weather_info')
create_weather_stations = create_table_ddl(weather_stations_df, 'dwh.weather_stations')

trip_duration int64 BIGINT
start_time datetime64[ns] TIMESTAMP
stop_time datetime64[ns] TIMESTAMP
start_station_id int64 BIGINT
end_station_id int64 BIGINT
bike_id int64 BIGINT
user_type object TEXT
birth_year int64 BIGINT
gender object TEXT
station_id int64 BIGINT
station_name object TEXT
station_latitude float64 DOUBLE PRECISION
station_longitude float64 DOUBLE PRECISION
station object TEXT
name object TEXT
date datetime64[ns] TIMESTAMP
avg_daily_wind_speed float64 DOUBLE PRECISION
precipitation float64 DOUBLE PRECISION
snow float64 DOUBLE PRECISION
snow_depth float64 DOUBLE PRECISION
avg_hourly_temp int64 BIGINT
max_hourly_temp int64 BIGINT
min_hourly_temp int64 BIGINT
fastest_2_min_wind_dir int64 BIGINT
fastest_5_sec_wind_dir float64 DOUBLE PRECISION
fastest_2_min_wind_speed float64 DOUBLE PRECISION
fastest_5_sec_wind_speed float64 DOUBLE PRECISION
station object TEXT
name object TEXT
