In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw training and validation splits from the working directory.
train, validation = (
    pd.read_csv(csv_path) for csv_path in ('train_set.csv', 'val_set.csv')
)

In [3]:
def feature_extractor(df):
    """Build a feature table (plus the ``Severity`` target) from a raw frame.

    The input frame is only read, never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records. Must contain the columns read below: ``Start_Time``,
        the ``*.imp.zip`` weather readings, ``Duration``, ``Distance.mi``,
        ``Description``, the ``light``/``heavy``/``rain``/``snow`` flags,
        ``State``, the road-feature flags (``Amenity`` ... ``Turning_Loop``),
        ``Sunrise_Sunset``, ``Source`` and ``Severity``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: the numeric readings, the 0/1 flags, the boolean
        ``Weekday`` and ``Rush Hour`` columns, the dummy columns for season,
        ``State``, ``Sunrise_Sunset`` and ``Source`` (first level of each
        dropped), and finally ``Severity``.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.

    Notes
    -----
    Dummy columns are derived from the categories present in ``df`` itself,
    so two frames with different category sets yield different columns.
    """
    # Parse into a local instead of assigning back to df['Start_Time'], so the
    # caller's frame keeps its original column.
    start_time = pd.to_datetime(df['Start_Time'])
    hour = start_time.dt.hour

    def flag(column):
        """Return ``df[column]`` cast to integers (True/False -> 1/0)."""
        return df[column].astype('int64')

    # Case-sensitive substring test; a missing description counts as
    # "not blocked" rather than raising TypeError.
    is_blocked = (
        df['Description']
        .str.contains('block', regex=False, na=False)
        .astype('int64')
    )

    is_weekday = start_time.dt.weekday < 5  # Monday=0 ... Friday=4
    # `between` is inclusive on both ends by default; the boolean
    # `inclusive=True` spelling is rejected by pandas 2.x.
    is_rush_hour = hour.between(7, 9) | hour.between(16, 18)

    # Month -> season index: Dec-Feb=1, Mar-May=2, Jun-Aug=3, Sep-Nov=4.
    season = pd.get_dummies((start_time.dt.month % 12 + 3) // 3, drop_first=True)
    state = pd.get_dummies(df['State'], drop_first=True)
    sunrise_sunset = pd.get_dummies(df['Sunrise_Sunset'], drop_first=True)
    source = pd.get_dummies(df['Source'], drop_first=True)

    # NOTE(review): 'storm' and 'haze' flags were previously computed here but
    # never added to the output; they stay excluded so the column set is
    # unchanged. Confirm whether they were meant to be features.
    inter_df = pd.DataFrame({
        'Temperature': df['temp.imp.zip'],
        'Humidity': df['humid.imp.zip'],
        'Pressure': df['pressure.imp.zip'],
        'Visibility': df['visibility.imp.zip'],
        'Wind Speed': df['wind.speed.imp.zip'],
        'Duration': df['Duration'],
        'Distance': df['Distance.mi'],
        'Blocked': is_blocked,
        'Light': flag('light'),
        'Heavy': flag('heavy'),
        'Rain': flag('rain'),
        'Snow': flag('snow'),
        'Weekday': is_weekday,
        'Rush Hour': is_rush_hour,
        'Amenity': flag('Amenity'),
        'Bump': flag('Bump'),
        'Crossing': flag('Crossing'),
        'Giveway': flag('Give_Way'),
        'Junction': flag('Junction'),
        'No Exit': flag('No_Exit'),
        'Railway': flag('Railway'),
        'Roundabout': flag('Roundabout'),
        'Station': flag('Station'),
        'Stop': flag('Stop'),
        'Traffic Calming': flag('Traffic_Calming'),
        'Traffic Signal': flag('Traffic_Signal'),
        'Turning Loop': flag('Turning_Loop'),
    })

    final_df = pd.concat([inter_df, season, state, sunrise_sunset, source], axis=1)
    # Target goes last so every feature column precedes it.
    final_df['Severity'] = df['Severity']
    return final_df

In [4]:
# Run the same feature extraction over both splits, training first.
final_train, final_val = map(feature_extractor, (train, validation))

In [6]:
# Persist the engineered tables without the index column.
final_train.to_csv(path_or_buf='train_final.csv', index=False)
final_val.to_csv(path_or_buf='val_final.csv', index=False)