In [0]:
# Import libraries

import helpr
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [0]:
def feature_engg(df):
    """Collapse trip-level rows into per-hour trip counts and one-hot encode weather.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip records. Must contain the columns removed below as well as
        ``year``, ``week_num``, ``day_of_week``, ``hour`` and ``weather``.

    Returns
    -------
    pandas.DataFrame
        One row per (year, week_num, day_of_week, hour) combination with a
        ``trip_count`` column; ``weather`` is replaced by indicator columns
        named ``weather_0``, ``weather_1``, ...
    """
    group_keys = ['year', 'week_num', 'day_of_week', 'hour']
    unused_columns = ['start_date', 'start_station_code', 'end_date',
                      'end_station_code', 'duration_sec', 'is_member', 'month']

    # Discard the columns that are not used as features.
    trips = df.drop(columns=unused_columns)

    # Prediction target: number of rows (with a non-null weather value) that
    # share the same year / week / weekday / hour.
    trips['trip_count'] = trips.groupby(group_keys)['weather'].transform('count')

    # Keep a single representative row per group.
    hourly = trips.groupby(group_keys, as_index=False).agg('first')

    # One-hot encode the weather description categories.
    weather_values = hourly['weather'].values.reshape(-1, 1)
    one_hot = OneHotEncoder().fit_transform(weather_values).toarray()
    indicator_names = ["weather_" + str(position) for position in range(one_hot.shape[1])]
    weather_flags = pd.DataFrame(one_hot, columns=indicator_names)

    result = pd.concat([hourly.drop(columns=['weather']), weather_flags], axis=1)

    print('Feature engineering complete.')

    return result

In [0]:
def train_test_splitting(df, test_year):
    """Split the engineered frame into train/test sets by calendar year.

    Rows whose ``year`` equals ``test_year`` form the test set; all other
    rows form the training set. The ``year`` column is removed from both
    sets and ``trip_count`` is separated out as the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``year`` and ``trip_count`` columns.
    test_year
        Value of ``year`` that selects the held-out rows.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Feature frames without the ``year`` and ``trip_count`` columns.
    y_train, y_test : numpy.ndarray
        ``trip_count`` values for the corresponding feature rows.
    """

    # Split dataframe to train and test

    df_train = df[df['year'] != test_year]
    df_test = df[df['year'] == test_year]

    # DataFrame.drop returns a new frame rather than modifying in place, so
    # the result must be kept; otherwise 'year' stays in the features.
    df_train = df_train.drop(columns=['year'])
    df_test = df_test.drop(columns=['year'])

    # Split dataframe to X and y

    y_train = df_train['trip_count'].values
    X_train = df_train.drop(columns=['trip_count'])

    y_test = df_test['trip_count'].values
    X_test = df_test.drop(columns=['trip_count'])

    print('Train-test splitting complete.')

    return X_train, X_test, y_train, y_test

In [0]:
# MinMaxScaler scales and translates each feature individually so that,
# on the training set, it lies in the given range (e.g. between zero and one).
# It does not perform well in the presence of outliers.
# The transformation, where (min, max) is the target feature range, is:
#   X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
#   X_scaled = X_std * (max - min) + min
    

def feature_norm(X_train, X_test, y_train, y_test):
    """Min-max scale features and targets using statistics from the training set.

    Separate scalers are used for the features and for the target. Each is
    fitted on the training data only and then applied to the test data, so
    scaled test values may fall outside the range seen during training.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : numpy.ndarray
        Training and test targets; reshaped to a single column before scaling.

    Returns
    -------
    X_train, X_test
        Scaled feature matrices as returned by the feature scaler.
    y_train, y_test
        Scaled targets as single-column 2-D arrays.
    scaler_y : MinMaxScaler
        The fitted target scaler, returned so callers can reuse it
        (e.g. to map predictions back to trip counts).
    """

    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()

    # fit_transform already returns the scaled training data; use it directly
    # instead of discarding it and transforming the same data a second time.
    X_train = scaler_X.fit_transform(X_train)
    X_test = scaler_X.transform(X_test)

    y_train = scaler_y.fit_transform(y_train.reshape(-1, 1))
    y_test = scaler_y.transform(y_test.reshape(-1, 1))

    print('Feature normalization complete.')

    return X_train, X_test, y_train, y_test, scaler_y