# Preprocessing

In [1]:
import numpy as np
import pandas as pd
#from imblearn.over_sampling import SMOTE
from math import cos, asin, sqrt, pi

In [2]:
#helper functions
def distance(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance between two latitude/longitude points.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in degrees.

    Returns
    -------
    float
        Distance scaled by 12742000 (2 x 6371000, i.e. a sphere of radius
        6371 km expressed in metres).
    """
    deg_to_rad = pi/180

    # Haversine term split into its latitude and longitude contributions.
    lat_part = 0.5 - cos((lat2-lat1)*deg_to_rad)/2
    lon_part = cos(lat1*deg_to_rad) * cos(lat2*deg_to_rad) * (1-cos((lon2-lon1)*deg_to_rad))/2
    haversine = lat_part + lon_part

    sphere_diameter = 12742000
    return sphere_diameter * asin(sqrt(haversine))

def cumulative_distance(lat_longs):
    """Distance from each point to the point before it in the sequence.

    Despite the name, the values are per-hop distances, not a running total.
    The first point is compared with itself, so its entry is just the offset.

    Parameters
    ----------
    lat_longs : sequence of (lat, long) pairs
        Indexable sequence (e.g. a list or an (n, 2) array) of coordinates
        in degrees.

    Returns
    -------
    list of float
        One value per input point: ``distance(current, previous) + 1e-7``.
        An empty input yields an empty list.
    """
    # Guard: indexing element 0 of an empty sequence would raise IndexError.
    if len(lat_longs) == 0:
        return []

    hops=[]
    prev_lat,prev_long=lat_longs[0]

    for lat,long in lat_longs:
        # The 1e-7 offset keeps every entry strictly positive -- presumably so
        # the result can be used safely as a divisor.
        hops.append(distance(lat,long,prev_lat,prev_long)+1e-7)
        prev_lat,prev_long=lat,long

    return hops

def time_zone_cal(s):
    """Map a time string to a coarse part-of-day label.

    Parameters
    ----------
    s : str
        Time string whose text before the first ':' is the hour
        (e.g. ``'08:15:00'``).

    Returns
    -------
    str
        ``'Early_Morning'`` for hours 7-9, ``'Morning'`` for 10-12,
        ``'Afternoon'`` for 13-17, ``'Evening'`` for 18-23.

    Raises
    ------
    ValueError
        If the hour part is not an integer, or if the hour is outside 7-23
        and therefore belongs to none of the four buckets.
    """
    hour=int(s.split(':')[0])

    if 6<hour<=9:
        return 'Early_Morning'
    if 9<hour<=12:
        return 'Morning'
    if 12<hour<=17:
        return 'Afternoon'
    if 17<hour<=23:
        return 'Evening'

    # No bucket covers this hour. Fail with an explicit message instead of
    # falling through to an unassigned local variable.
    raise ValueError(
        "hour %d from %r is outside the supported range 7-23" % (hour, s)
    )

In [3]:
#reading Data
df=pd.read_csv('Processed_Bus_Trail_data_54F.csv')

#Normalizing edge values with edge distance
#(each row's distance is measured from the previous row's lat/long)
df['next_hop_distance']=cumulative_distance(df[['lat','long']].values)

df['edge_wifi_count']=df.edge_wifi_count/df.next_hop_distance
df['d_edge_wifi_count']=df.d_edge_wifi_count/df.next_hop_distance

df['edge_honk_duration']=df.edge_honk_duration/df.next_hop_distance
df['d_edge_honk_duration']=df.d_edge_honk_duration/df.next_hop_distance

#calculating timeZone
#Declaring the full label set as categories makes get_dummies emit all four
#Is_* columns even when some part of the day never occurs in the data;
#otherwise the column selection below would raise KeyError.
df['time_zone']=pd.Categorical(
    df.start_time.apply(time_zone_cal),
    categories=['Early_Morning','Morning','Afternoon','Evening'],
)
#dtype=int keeps the indicators as 0/1 on every pandas version
#(pandas >= 2.0 defaults to bool, which would be written to CSV as True/False)
df=pd.get_dummies(df, columns=["time_zone"], prefix=["Is"], dtype=int)

#Rebuilding meaningful features from sparse features
#Sum of the four POI columns, rounded up to the next integer
df['highly_populated_poi_exist']=np.ceil(df.school+df.medical+df.other_poi+df.park)
df['road_exist_percent']=df.high_way+df.two_way+df.one_way

#Selected Columns
columns=\
['Is_Early_Morning','Is_Morning','Is_Afternoon','Is_Evening','stay_duration',
 'wifi_count', 'edge_wifi_count', 'honk_duration',
 'edge_honk_duration', 'd_wifi_count', 'd_edge_wifi_count','d_honk_duration', 
 'd_edge_honk_duration', 'human_made', 'natural_land','road_exist_percent',
 'highly_populated_poi_exist', 'RSI', 'Is_Bus_stop','Is_Turn', 'Is_Signal',
 'Is_Congestion', 'Is_Adhoc']

#Independent copy holding only the selected columns
new_df=df[columns].copy()

In [4]:
#Processing labels
#Each label column holds a positive / 'Not ...' string pair that is encoded as 1 / 0.
#NOTE: values that match neither key become NaN, so re-running this cell on
#already-encoded labels would blank them out.
label_encodings={
    'Is_Bus_stop':{'Bus_stop':1,'Not Bus_stop':0},
    'Is_Turn':{'Turn':1,'Not Turn':0},
    'Is_Signal':{'Signal':1,'Not Signal':0},
    'Is_Congestion':{'Congestion':1,'Not Congestion':0},
    'Is_Adhoc':{'Adhoc':1,'Not Adhoc':0},
}
for label_column,encoding in label_encodings.items():
    new_df[label_column]=new_df[label_column].map(encoding)

In [6]:
#Write the processed table to disk without the row index
new_df.to_csv(path_or_buf='DataSet_54F.csv', index=False)

In [6]:
#NICE