This code creates features from weather and taxi data (saved as pickle files) for one month (note: change the month name in the code to process a different month).
Both the weather and taxi data are sorted by timestamp, binned into equal time intervals (3 hours), and concatenated into the same
dataframe. The dataframe for each month is stored as a pickle object.

In [148]:
import pandas as pd
import numpy as np
import pickle
import glob
from datetime import datetime as dt

In [149]:
# Directory and file name of the pickled feature table written by the last cell.
output_path, output_file = "../Data/", "december.p"

#### Taxi Data

In [150]:
tpath = "../../Taxi/CleanData/"
# Sorted so the files are always processed in the same order; the weather cell
# pairs its files with the entries of t_shape0 by position.
december_files = sorted(glob.glob(tpath + '*december*'))


def bin_taxi_trips(trips, freq='3h'):
    """Count taxi trips per time bin and tag each bin with its hour and weekday.

    Parameters
    ----------
    trips : pandas.DataFrame
        One row per trip, with a datetime-like ``trip_datetime`` column.
        The frame is not modified.
    freq : str
        Resampling frequency; the default gives 3-hour bins.

    Returns
    -------
    pandas.DataFrame
        Columns ``trip_datetime`` (start of the bin), ``trip_count``,
        ``hour_tag`` (hour of the bin start) and ``wkday_tag``
        (weekday of the bin start, Monday == 0). Bins without trips
        have a count of 0.
    """
    # Build a frame that holds only the counter, indexed by the timestamp.
    # Keeping 'trip_datetime' as index *and* column would make the
    # aggregation try to sum a datetime column.
    counts = pd.DataFrame(
        {'trip_count': 1},
        index=pd.DatetimeIndex(trips['trip_datetime'], name='trip_datetime'),
    )
    binned = counts.resample(freq).sum().reset_index()

    # Vectorised accessors instead of a Python loop (which also used to
    # shadow the imported ``dt`` alias).
    binned['hour_tag'] = binned['trip_datetime'].dt.hour
    binned['wkday_tag'] = binned['trip_datetime'].dt.weekday
    return binned


t_shape0 = []     # number of bins per taxi file, used later to trim the weather bins
taxi_frames = []  # binned frame of every file, concatenated once after the loop

for tfile in december_files:
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(tfile, 'rb') as picklefile:
        t_df = pickle.load(picklefile)

    t_df_binned = bin_taxi_trips(t_df)
    t_shape0.append(t_df_binned.shape[0])
    taxi_frames.append(t_df_binned)

# One concat instead of growing the frame inside the loop. Row labels restart
# at 0 for every file, exactly as before.
t_feature_df = pd.concat(taxi_frames, axis=0) if taxi_frames else pd.DataFrame()

t_feature_df.shape

(723, 4)

In [151]:
# Preview the first 10 rows of the binned taxi features.
t_feature_df.head(10)

Unnamed: 0,trip_datetime,trip_count,hour_tag,wkday_tag
0,2014-12-01 00:00:00,3229,0,0
1,2014-12-01 03:00:00,1474,3,0
2,2014-12-01 06:00:00,5803,6,0
3,2014-12-01 09:00:00,8736,9,0
4,2014-12-01 12:00:00,7927,12,0
5,2014-12-01 15:00:00,9286,15,0
6,2014-12-01 18:00:00,12018,18,0
7,2014-12-01 21:00:00,7376,21,0
8,2014-12-02 00:00:00,2558,0,1
9,2014-12-02 03:00:00,845,3,1


#### Weather Data

In [152]:
# define data path
wpath = "../../Weather/ScrapedData/"
# glob returns matches in arbitrary order; sort them so the files are always
# visited in the same order (they are paired by position with the taxi bin
# counts in t_shape0 further down).
december_files = sorted(glob.glob(wpath + '*december*'))

In [153]:
# Collect every distinct weather condition that occurs in any of the files.
condition_set = set()
for wfile in december_files:
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(wfile, 'rb') as picklefile:
        w_df = pickle.load(picklefile)
    condition_set.update(w_df.condition.tolist())

# Sort so the condition -> column-number mapping is the same on every run
# (set iteration order is not reproducible). Sorting by string form keeps a
# stray non-string value such as NaN from breaking the comparison.
all_condition = sorted(condition_set, key=str)

# create condition column names: c0, c1, ... (one per condition)
dummy_column_names = ["c" + str(i) for i in range(len(all_condition))]

In [154]:
t_id = 0  # position in t_shape0 of the taxi file paired with the current weather file

weather_frames = []  # merged frame of every file, concatenated once after the loop

for wfile in december_files:
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(wfile, 'rb') as picklefile:
        w_df = pickle.load(picklefile)

    # Numeric readings, averaged within each 3-hour bin. 'date_time' is moved
    # into the index (not kept as a column as well) so the aggregation only
    # sees the numeric columns.
    df_num = w_df[['date_time', 'temp', 'dew', 'humidity', 'pressure',
                   'visibility', 'wind_speed', 'gust', 'precipitation']].set_index('date_time')
    df_num_binned = df_num.resample('3h').mean().reset_index()

    # One indicator column per weather condition. reindex puts the columns in
    # the order of all_condition (adding all-zero columns for conditions absent
    # from this file) BEFORE the positional rename, so a given cN refers to the
    # same condition in every file.
    df_dummy = pd.get_dummies(w_df.condition).reindex(columns=all_condition, fill_value=0)
    df_dummy.columns = dummy_column_names

    # bin df_num and df_dummy and merge: indicators are summed, i.e. each cN
    # counts the observations with that condition in the bin.
    df_dummy.index = pd.DatetimeIndex(w_df['date_time'])
    df_dummy_binned = df_dummy.resample('3h').sum().reset_index(drop=True)

    # Keep at most as many bins as the matching taxi file produced.
    n_bins = t_shape0[t_id]
    t_id = t_id + 1
    df_num_binned = df_num_binned.iloc[:n_bins, :]
    df_dummy_binned = df_dummy_binned.iloc[:n_bins, :]

    merged_df = pd.concat([df_num_binned, df_dummy_binned], axis=1)
    weather_frames.append(merged_df)

# One concat instead of growing the frame inside the loop. Row labels restart
# at 0 for every file, exactly as before.
w_feature_df = pd.concat(weather_frames, axis=0) if weather_frames else pd.DataFrame()

In [155]:
# Put the weather and taxi features side by side; concat(axis=1) matches rows
# by their index labels.
all_features = pd.concat([w_feature_df, t_feature_df], axis=1)
# The bin timestamp is already present as the weather 'date_time' column, so
# the taxi copy is dropped. The columns are named by keyword because the
# positional `axis` argument of drop() is no longer accepted by pandas 2.x.
all_features = all_features.drop(columns=['trip_datetime'])

In [156]:
# Show the column names of the combined feature table.
all_features.columns

Index(['date_time', 'temp', 'dew', 'humidity', 'pressure', 'visibility',
       'wind_speed', 'gust', 'precipitation', 'c0', 'c1', 'c2', 'c3', 'c4',
       'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c11', 'c12', 'c13', 'c14', 'c15',
       'c16', 'c17', 'c18', 'c19', 'trip_count', 'hour_tag', 'wkday_tag'],
      dtype='object')

In [157]:
# Write the combined feature table to <output_path><output_file> as a pickle.
destination = output_path + output_file
with open(destination, 'wb') as out_handle:
    pickle.dump(all_features, out_handle)