In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the train / validation / test CSV files from the working directory.
train, validation, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_set.csv', 'val_set.csv', 'test_set.csv')
)

In [4]:
def feature_extractor(df, fitted_scaler=None):
    """Turn a raw accidents frame into a scaled feature table.

    The seven continuous columns (``Temperature`` .. ``Distance``) are
    standardised; every other feature is passed through unscaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input. Must contain ``Start_Time``, the imputed weather columns
        (``temp.imp.zip``, ``humid.imp.zip``, ``pressure.imp.zip``,
        ``visibility.imp.zip``, ``wind.speed.imp.zip``), ``Duration``,
        ``Distance.mi``, ``Description``, the weather flags ``light``,
        ``heavy``, ``rain``, ``snow``, the road-feature flags, ``State``,
        ``Sunrise_Sunset`` and ``Source``. If an ``ID`` column is present it
        becomes the first output column and ``Severity`` is not read;
        otherwise ``Severity`` is appended as the last output column.
        ``df`` itself is not modified.
    fitted_scaler : object with a ``transform`` method, optional
        An already-fitted scaler. When ``None`` (default) a new
        ``StandardScaler`` is fitted on this frame's continuous columns.

    Returns
    -------
    (pandas.DataFrame, StandardScaler or None)
        The feature table (fresh 0..n-1 index, original column dtypes kept)
        and the newly fitted scaler, or ``None`` when ``fitted_scaler`` was
        supplied by the caller.

    Raises
    ------
    KeyError
        If a required input column is missing.
    """
    # (output column, source column) pairs; order defines the output layout.
    numeric_sources = (
        ('Temperature', 'temp.imp.zip'),
        ('Humidity', 'humid.imp.zip'),
        ('Pressure', 'pressure.imp.zip'),
        ('Visibility', 'visibility.imp.zip'),
        ('Wind Speed', 'wind.speed.imp.zip'),
        ('Duration', 'Duration'),
        ('Distance', 'Distance.mi'),
    )
    weather_flags = (
        ('Light', 'light'),
        ('Heavy', 'heavy'),
        ('Rain', 'rain'),
        ('Snow', 'snow'),
    )
    road_flags = (
        ('Amenity', 'Amenity'),
        ('Bump', 'Bump'),
        ('Crossing', 'Crossing'),
        ('Giveway', 'Give_Way'),
        ('Junction', 'Junction'),
        ('No Exit', 'No_Exit'),
        ('Railway', 'Railway'),
        ('Roundabout', 'Roundabout'),
        ('Station', 'Station'),
        ('Stop', 'Stop'),
        ('Traffic Calming', 'Traffic_Calming'),
        ('Traffic Signal', 'Traffic_Signal'),
    )
    # NOTE(review): earlier revisions also read `storm`, `haze` and
    # `Turning_Loop` but never added them to the output. They stay excluded
    # here so the output schema is unchanged -- confirm whether they were
    # meant to be features.

    # Parse timestamps into a local Series so the caller's frame is untouched.
    start_time = pd.to_datetime(df['Start_Time'])
    hour = start_time.dt.hour

    features = {out_name: df[src] for out_name, src in numeric_sources}
    # Case-sensitive substring test on the stringified description.
    features['Blocked'] = df['Description'].map(
        lambda desc: int('block' in str(desc))
    )
    for out_name, src in weather_flags:
        features[out_name] = df[src].map(int)
    features['Weekday'] = start_time.dt.weekday < 5
    # Explicit comparisons instead of Series.between(..., inclusive=True):
    # the boolean form of `inclusive` is rejected by pandas 2.x.
    features['Rush Hour'] = (
        ((hour >= 7) & (hour <= 9)) | ((hour >= 16) & (hour <= 18))
    )
    for out_name, src in road_flags:
        features[out_name] = df[src].map(int)
    inter_df = pd.DataFrame(features)

    # NOTE(review): dummy columns are derived from the categories present in
    # *this* frame, so train / validation / test can end up with different
    # columns if a category is missing from one of them -- verify downstream.
    # Months map to a season code 1..4 (Dec-Feb -> 1, ..., Sep-Nov -> 4).
    season = pd.get_dummies((start_time.dt.month % 12 + 3) // 3, drop_first=True)
    state = pd.get_dummies(df['State'], drop_first=True, prefix='State')
    sunrise_sunset = pd.get_dummies(df['Sunrise_Sunset'], drop_first=True)
    source = pd.get_dummies(df['Source'], drop_first=True)
    final_df = pd.concat([inter_df, season, state, sunrise_sunset, source], axis=1)

    if 'ID' in df.columns:
        # Inserted while final_df still shares df's index, so rows stay aligned.
        final_df.insert(loc=0, column='ID', value=df['ID'])
    else:
        final_df['Severity'] = df['Severity']

    numeric_cols = [out_name for out_name, _ in numeric_sources]
    numeric_block = final_df[numeric_cols]
    if fitted_scaler is None:
        new_scaler = StandardScaler()
        scaled_values = new_scaler.fit_transform(numeric_block)
    else:
        new_scaler = None
        scaled_values = fitted_scaler.transform(numeric_block)
    scaled_values = np.asarray(scaled_values)

    # Replace only the continuous columns; ID / flags / dummies / Severity keep
    # their values and dtypes, and both code paths handle 'ID' identically.
    scaled_final = final_df.reset_index(drop=True)
    for position, column in enumerate(numeric_cols):
        scaled_final[column] = scaled_values[:, position]
    return scaled_final, new_scaler

In [5]:
# No scaler is supplied, so one is fitted on the training frame and returned.
final_train, fitted_scaler = feature_extractor(train, fitted_scaler=None)

In [6]:
# Reuse the scaler fitted on the training frame; the second return value is None.
final_val, _ = feature_extractor(validation, fitted_scaler=fitted_scaler)

In [7]:
# Reuse the scaler fitted on the training frame; the second return value is None.
final_test, _ = feature_extractor(test, fitted_scaler=fitted_scaler)

In [8]:
# Preview the first five rows of the training feature table.
final_train.head(n=5)

Unnamed: 0,Temperature,Humidity,Pressure,Visibility,Wind Speed,Duration,Distance,Blocked,Light,Heavy,...,State_VA,State_VT,State_WA,State_WI,State_WV,State_WY,Night,MapQuest,MapQuest-Bing,Severity
0,-1.86034,-0.554951,0.850351,0.315661,-0.165462,-0.330986,-0.206765,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.781774,-2.22403,-4.66377,0.315661,-0.334964,-0.335636,0.145988,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-0.835849,1.50944,0.217206,-3.23707,-1.46497,-0.248127,-0.206765,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,-0.302034,-1.96049,0.274765,0.315661,1.56722,-0.330925,-0.206765,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,-1.21329,1.50944,0.320812,-3.05943,-1.46497,0.163827,-0.197531,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Preview the first five rows of the validation feature table.
final_val.head(n=5)

Unnamed: 0,Temperature,Humidity,Pressure,Visibility,Wind Speed,Duration,Distance,Blocked,Light,Heavy,...,State_VA,State_VT,State_WA,State_WI,State_WV,State_WY,Night,MapQuest,MapQuest-Bing,Severity
0,-1.42898,1.15805,-0.128146,0.315661,-0.899968,-0.303213,-0.206765,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.776382,0.850592,0.3093,0.315661,-0.165462,-0.330678,-0.206765,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.997457,-1.12595,0.297788,0.315661,0.795046,-0.334989,0.0210162,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.06755,-2.39973,0.251741,0.315661,0.267708,-0.331048,-0.206765,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,-1.86034,0.543129,-0.588616,0.315661,-0.146629,-0.220847,-0.206765,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [10]:
# Preview the first five rows of the test feature table.
final_test.head(n=5)

Unnamed: 0,ID,Temperature,Humidity,Pressure,Visibility,Wind Speed,Duration,Distance,Blocked,Light,...,State_UT,State_VA,State_VT,State_WA,State_WI,State_WV,State_WY,Night,MapQuest,MapQuest-Bing
0,A-1,-1.32653,1.11413,-0.0130287,0.315661,0.700878,0.194002,-0.200609,1,1,...,0,0,0,0,0,0,0,1,1,0
1,A-5,-1.37506,1.02628,-0.0475639,-1.10543,-0.805801,-0.330678,-0.200609,0,0,...,0,0,0,0,0,0,0,0,1,0
2,A-7,-1.4829,1.50944,-0.0360522,-0.750157,-0.805801,-0.330678,-0.206765,0,0,...,0,0,0,0,0,0,0,0,1,0
3,A-14,-1.37506,1.02628,-0.0475639,0.315661,-0.372631,-0.330678,-0.200609,0,0,...,0,0,0,0,0,0,0,0,1,0
4,A-22,-1.37506,1.02628,-0.0475639,0.315661,-0.165462,-0.330678,-0.206765,0,0,...,0,0,0,0,0,0,0,0,1,0


In [11]:
# Write each feature table to CSV without the row index.
final_train.to_csv(path_or_buf='train_final.csv', index=False)
final_val.to_csv(path_or_buf='val_final.csv', index=False)
final_test.to_csv(path_or_buf='test_final.csv', index=False)