# Preprocessing

In [1]:
import numpy as np
import pandas as pd
from math import cos, asin, sqrt, pi

In [2]:
# Helper functions: point-to-point distance, per-hop distances, time-of-day bucketing
def distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points (haversine formula).

    All four arguments are angles in degrees; each is scaled by pi/180 before
    being passed to the trigonometric functions.

    The result is ``12742000 * asin(sqrt(a))``. 12742000 is twice 6,371,000,
    presumably Earth's mean radius in metres, which makes the return value a
    distance in metres.
    """
    deg_to_rad = pi / 180
    # Latitude contribution: 0.5 - cos(dlat)/2 is sin^2(dlat / 2).
    lat_term = 0.5 - cos((lat2 - lat1) * deg_to_rad) / 2
    # Longitude contribution, weighted by the cosine of both latitudes.
    lon_term = cos(lat1 * deg_to_rad) * cos(lat2 * deg_to_rad) * (1 - cos((lon2 - lon1) * deg_to_rad)) / 2
    haversine = lat_term + lon_term
    return 12742000 * asin(sqrt(haversine))

def cumulative_distance(lat_longs):
    """Return, for each point, its distance from the point before it.

    Despite the name, the values are per-hop distances, not a running total.
    The first point is compared with itself, so its hop is just the epsilon.

    Parameters
    ----------
    lat_longs : sequence of (lat, long) pairs
        Must support ``len()``, indexing and iteration (e.g. a list of tuples
        or an (n, 2) array). Each pair is passed to ``distance``.

    Returns
    -------
    list of float
        One entry per input point: ``distance(point, previous_point) + 1e-7``.
        An empty input yields an empty list.
    """
    # An empty track used to raise IndexError on lat_longs[0] below.
    if len(lat_longs) == 0:
        return []

    hops = []
    prev_lat, prev_long = lat_longs[0]

    for lat, long in lat_longs:
        # The 1e-7 offset keeps every hop strictly positive, so the result can
        # be used as a divisor even when two consecutive points coincide.
        hops.append(distance(lat, long, prev_lat, prev_long) + 1e-7)
        prev_lat, prev_long = lat, long

    return hops

def time_zone_cal(s):
    """Map a colon-separated time string to a coarse time-of-day label.

    Only the text before the first ':' is used; it is parsed as an integer
    hour, so ``int()`` raises ValueError if that part is not numeric.

    Returns
    -------
    str
        'Early_Morning' for hours 7-9, 'Morning' for 10-12, 'Afternoon' for
        13-17, 'Evening' for 18-23, and 'Night' for every other hour.
    """
    hour = int(s.split(':')[0])

    if 6 < hour <= 9:
        return 'Early_Morning'
    if 9 < hour <= 12:
        return 'Morning'
    if 12 < hour <= 17:
        return 'Afternoon'
    if 17 < hour <= 23:
        return 'Evening'
    # Hours 0-6 (and any value above 23) matched no branch before, so the
    # function failed with UnboundLocalError instead of returning a label.
    return 'Night'

In [3]:
def new_Processing_before_journal(location):
    """Read the raw trail CSV and return the frame used for modelling.

    Steps, in order:
      1. Read ``location`` with ``pd.read_csv``.
      2. Add ``next_hop_distance`` (from ``cumulative_distance`` over the
         ``lat``/``long`` columns) and divide the four edge columns by it.
      3. Bucket ``start_time`` with ``time_zone_cal`` and one-hot encode the
         bucket into ``Is_<bucket>`` columns.
      4. Derive ``highly_populated_poi_exist`` and ``road_exist_percent``.
      5. Encode the five string label columns as 1/0.

    Returns a copy holding only the identifying columns, the selected
    features and the labels. Columns computed along the way but not listed
    there (e.g. the one-hot time buckets) are not part of the result.
    """
    frame = pd.read_csv(location)

    # Normalise edge totals by the length of the hop they were measured over.
    frame['next_hop_distance'] = cumulative_distance(frame[['lat', 'long']].values)
    edge_columns = (
        'edge_wifi_count', 'd_edge_wifi_count',
        'edge_honk_duration', 'd_edge_honk_duration',
    )
    for edge_col in edge_columns:
        frame[edge_col] = frame[edge_col] / frame['next_hop_distance']

    # Time-of-day bucket, expanded into indicator columns prefixed with "Is".
    frame['time_zone'] = frame['start_time'].apply(time_zone_cal)
    frame = pd.get_dummies(frame, columns=["time_zone"], prefix=["Is"])

    # Rebuild meaningful features from the sparse land-use columns.
    poi_total = frame['school'] + frame['medical'] + frame['other_poi'] + frame['park']
    frame['highly_populated_poi_exist'] = poi_total.apply(np.ceil)
    frame['road_exist_percent'] = frame['high_way'] + frame['two_way'] + frame['one_way']

    # Label encoding: any value missing from a mapping comes out of .map as NaN.
    label_encodings = {
        'Is_Bus_stop': {'Bus_stop': 1, 'Not Bus_stop': 0},
        'Is_Turn': {'Turn': 1, 'Not Turn': 0},
        'Is_Signal': {'Signal': 1, 'Not Signal': 0},
        'Is_Congestion': {'Congestion': 1, 'Not Congestion': 0},
        'Is_Adhoc': {'Adhoc': 1, 'Not Adhoc': 0},
    }
    for label_col, encoding in label_encodings.items():
        frame[label_col] = frame[label_col].map(encoding)

    # Column selection for the returned frame.
    id_columns = ['start_date', 'lat', 'long', 'start_time', 'end_time']
    features = [
        'stay_duration',                                   # f1
        'mfcc0', 'mfcc1', 'mfcc2', 'mfcc3', 'mfcc4',       # f2-f6
        'wifi_count', 'edge_wifi_count',                   # f7, f8
        'RSI',                                             # f9
        'human_made', 'natural_land',                      # f10, f11
        'road_exist_percent', 'highly_populated_poi_exist',  # f12, f13
    ]
    labels = list(label_encodings)

    return frame[id_columns + features + labels].copy()

In [4]:
# Run the preprocessing pipeline on the raw trail CSV (relative path, so it
# resolves against the notebook's working directory).
df=new_Processing_before_journal("./Datasets/Processed_Bus_Trail_data_54F_with_mfcc_new.csv")  # df holds the selected id, feature and label columns

In [5]:
# Preview the first rows of the processed frame as a quick sanity check.
df.head()

Unnamed: 0,start_date,lat,long,start_time,end_time,stay_duration,mfcc0,mfcc1,mfcc2,mfcc3,...,RSI,human_made,natural_land,road_exist_percent,highly_populated_poi_exist,Is_Bus_stop,Is_Turn,Is_Signal,Is_Congestion,Is_Adhoc
0,07/01/2019,23.494324,87.317134,08:57:28,09:02:29,298,53.997597,-0.019574,-1.002552,-3.182711,...,0.0,0.168316,0.696344,0.13534,0.0,0,0,0,0,1
1,07/01/2019,23.493368,87.316452,09:02:39,09:05:38,175,89.707253,2.749793,0.087587,-2.804724,...,0.0,0.138661,0.676613,0.184726,0.0,1,1,0,0,0
2,07/01/2019,23.494326,87.315501,09:05:58,09:06:02,5,44.239017,40.802536,19.241919,2.581454,...,1.556096,0.164903,0.661698,0.1734,0.0,0,0,0,0,1
3,07/01/2019,23.495541,87.314797,09:06:27,09:06:31,5,61.249569,52.913704,52.183033,1.408868,...,1.541073,0.109366,0.770199,0.120435,0.0,0,0,0,0,1
4,07/01/2019,23.496357,87.31371,09:06:52,09:07:11,16,67.810631,18.415585,9.868664,-3.614725,...,1.420824,0.078238,0.772432,0.14933,0.0,1,0,0,0,0


In [6]:
# Persist the processed frame; index=False keeps the row index out of the CSV.
df.to_csv('./Datasets/DataSet_54F_mfcc.csv',index=False)

# NICE