In [1]:
import pandas as pd 
import numpy as np

In [2]:
# Load the weather observations and the combined bike-trip records from disk.
weather_data, bike_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/weather_data3.csv', './data/combined_csv.csv')
)

In [3]:
# Preview the first five weather rows (note the unit suffixes in the raw values).
weather_data.head(n=5)

Unnamed: 0,date,time,weather_description,temp_in_f,dewpt_in_f,humidity_in_%,pressure,precipitation_in_inches,visibility_in_miles,wind_direction,wind_speed_in_mph
0,2018-01-01,00:00:00,Clear,12°F,5°F,73%,30.39,0 in.,10mi.,SSE,3.0
1,2018-01-01,01:00:00,Clear,10°F,5°F,79%,30.39,0 in.,10mi.,S,3.0
2,2018-01-01,02:00:00,Clear,10°F,3°F,72%,30.39,0 in.,10mi.,N,0.0
3,2018-01-01,03:00:00,Clear,14°F,5°F,67%,30.39,0 in.,10mi.,W,7.0
4,2018-01-01,04:00:00,Clear,14°F,3°F,62%,30.42,0 in.,10mi.,WNW,7.0


In [4]:
# Preview the first five trip records.
bike_data.head(n=5)

Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
0,552,2018-01-01 00:05:06,2018-01-01 00:14:18,31104,Adams Mill & Columbia Rd NW,31400,Georgia & New Hampshire Ave NW,W00886,Member
1,1282,2018-01-01 00:14:30,2018-01-01 00:35:53,31321,15th St & Constitution Ave NW,31321,15th St & Constitution Ave NW,W01435,Casual
2,1265,2018-01-01 00:14:53,2018-01-01 00:35:58,31321,15th St & Constitution Ave NW,31321,15th St & Constitution Ave NW,W21242,Casual
3,578,2018-01-01 00:15:31,2018-01-01 00:25:09,31406,14th & Upshur St NW,31103,16th & Harvard St NW,W21322,Casual
4,372,2018-01-01 00:18:02,2018-01-01 00:24:15,31618,4th & East Capitol St NE,31619,Lincoln Park / 13th & East Capitol St NE,W00119,Member


## Cleaning the data

In [5]:
## Removing unwanted characters such as %, °F, "in.", "mi." (i.e. units)
# The pattern keeps an optional leading minus sign and an optional decimal
# part. A bare (\d+) would turn "-5°F" into "5" and "0.05 in." into "0".
# It is written as a raw string so that \d is not treated as a (deprecated)
# string escape sequence.
# The extracted values are still strings; rows with no number become NaN.
unit_columns = [
    'temp_in_f',
    'dewpt_in_f',
    'humidity_in_%',
    'visibility_in_miles',
    'precipitation_in_inches',
]
for unit_column in unit_columns:
    weather_data[unit_column] = weather_data[unit_column].str.extract(
        r'(-?\d*\.?\d+)', expand=False
    )

In [6]:
## Parse the trip start / end columns into datetime values
for stamp_column in ('Start date', 'End date'):
    bike_data[stamp_column] = pd.to_datetime(bike_data[stamp_column])

In [7]:
## Split each trip timestamp into a calendar-date column and a clock-time column
bike_data = bike_data.assign(
    start_date=bike_data['Start date'].dt.date,
    end_date=bike_data['End date'].dt.date,
    start_time=bike_data['Start date'].dt.time,
    end_time=bike_data['End date'].dt.time,
)

In [8]:
# Trip length in (fractional) minutes: end minus start, divided by a one-minute timedelta
bike_data['duration_in_mins'] = (
    bike_data['End date']
    .sub(bike_data['Start date'])
    .div(np.timedelta64(1, 'm'))
)

In [9]:
# Give the station, bike and membership columns snake_case names.
snake_case_names = {
    'Start station number': 'start_station_id',
    'Start station': 'start_station',
    'End station number': 'end_station_id',
    'End station': 'end_station',
    'Bike number': 'bike_id',
    'Member type': 'member_type',
}
bike_data = bike_data.rename(columns=snake_case_names)

## Sanity checks 

In [10]:
# Count missing values per column of the trip table
bike_data.isna().sum(axis=0)

Duration            0
Start date          0
End date            0
start_station_id    0
start_station       0
end_station_id      0
end_station         0
bike_id             0
member_type         0
start_date          0
end_date            0
start_time          0
end_time            0
duration_in_mins    0
dtype: int64

## Merge weather data and bike data

In [11]:
# Round each trip start to the nearest hour, then derive string 'date' and
# 'time' columns in the same 'YYYY-MM-DD' / 'HH:MM:SS' layout that the weather
# table's date and time columns use, so the two tables can be joined on them.
# 'h' replaces the 'H' frequency alias, which newer pandas releases deprecate.
bike_data['nearest_date'] = bike_data['Start date'].dt.round(freq='h')
# Vectorised .dt.strftime formats the whole column at once instead of calling
# a Python lambda once per row.
bike_data['date'] = bike_data['nearest_date'].dt.strftime('%Y-%m-%d')
bike_data['time'] = bike_data['nearest_date'].dt.strftime('%H:%M:%S')

In [12]:
# Trim surrounding whitespace from the weather join keys so they compare equal
# to the formatted bike keys. The vectorised .str.strip() leaves missing values
# as NaN, whereas a per-row x.strip() lambda would raise on them.
weather_data['time'] = weather_data['time'].str.strip()
weather_data['date'] = weather_data['date'].str.strip()

In [13]:
# Attach the weather observation for the matching date and hour to every trip.
# NOTE(review): this is an inner join, so trips whose rounded hour has no
# weather row are dropped (the shapes displayed after the merge show fewer
# rows in final_data than in bike_data) — confirm that this loss is intended.
final_data = bike_data.merge(weather_data, how='inner', on=['date', 'time'])

In [14]:
# Remove the join helpers and the raw columns that now have derived replacements
final_data = final_data.drop(
    columns=['nearest_date', 'Duration', 'Start date', 'End date', 'date', 'time']
)

In [15]:
# (rows, columns) of the trip table, for comparison with the merged frame's shape
bike_data.shape

(3542684, 17)

In [16]:
# (rows, columns) of the merged frame; a lower row count than bike_data means
# the merge dropped trips
final_data.shape

(3542233, 20)

In [17]:
# Count missing values per column of the merged frame
final_data.isna().sum(axis=0)

start_station_id              0
start_station                 0
end_station_id                0
end_station                   0
bike_id                       0
member_type                   0
start_date                    0
end_date                      0
start_time                    0
end_time                      0
duration_in_mins              0
weather_description        1896
temp_in_f                  1048
dewpt_in_f                 1048
humidity_in_%              1048
pressure                      0
precipitation_in_inches       0
visibility_in_miles        9750
wind_direction                0
wind_speed_in_mph          1452
dtype: int64

In [28]:
# Trip counts per start date for the rows that have no temperature reading
final_data.loc[final_data['temp_in_f'].isna(), 'start_date'].value_counts()

2018-08-14    598
2018-04-16    266
2018-11-15    117
2018-11-16     67
Name: start_date, dtype: int64

In [20]:
# Preview the last five merged rows.
final_data.tail(n=5)

Unnamed: 0,start_station_id,start_station,end_station_id,end_station,bike_id,member_type,start_date,end_date,start_time,end_time,duration_in_mins,weather_description,temp_in_f,dewpt_in_f,humidity_in_%,pressure,precipitation_in_inches,visibility_in_miles,wind_direction,wind_speed_in_mph
3542228,31045,Commerce St & Fayette St,31041,Prince St & Union St,W00399,Casual,2018-12-31,2018-12-31,23:28:16,23:43:22,15.1,Partly Cloudy,59,54,82,29.83,0,10,SSW,8.0
3542229,31041,Prince St & Union St,31903,Royal & Wilkes St,W00490,Member,2018-12-31,2018-12-31,23:28:16,23:32:50,4.566667,Partly Cloudy,59,54,82,29.83,0,10,SSW,8.0
3542230,31045,Commerce St & Fayette St,31041,Prince St & Union St,W22125,Member,2018-12-31,2018-12-31,23:28:23,23:43:32,15.15,Partly Cloudy,59,54,82,29.83,0,10,SSW,8.0
3542231,31602,Park Rd & Holmead Pl NW,31291,Vermont Ave & I St NW,W23745,Member,2018-12-31,2018-12-31,23:28:50,23:50:09,21.316667,Partly Cloudy,59,54,82,29.83,0,10,SSW,8.0
3542232,31125,15th & W St NW,31200,Massachusetts Ave & Dupont Circle NW,W00436,Member,2018-12-31,2018-12-31,23:29:55,23:36:24,6.483333,Partly Cloudy,59,54,82,29.83,0,10,SSW,8.0


In [19]:
# Persist the merged trips-plus-weather table as CSV, without the row index
final_data.to_csv(path_or_buf='./data/data.csv', index=False)