In [1]:
import glob
import os

In [2]:
from math import cos, asin, sqrt, pi

#helper functions
def distance(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points using the haversine formula.

    The four arguments are latitude/longitude pairs in degrees. The scale
    factor 12742000 equals Earth's mean diameter in metres (2 x 6371 km), so
    the result is presumably in metres.
    """
    deg_to_rad = pi/180
    # The two haversine terms are combined in the same order as a single
    # expression would evaluate them, so the floating-point result is unchanged.
    lat_term = cos((lat2-lat1)*deg_to_rad)/2
    lon_term = cos(lat1*deg_to_rad) * cos(lat2*deg_to_rad) * (1-cos((lon2-lon1)*deg_to_rad))/2
    hav = 0.5 - lat_term + lon_term
    return 12742000 * asin(sqrt(hav))

def cumulative_distance(lat_longs):
    """Return the distance from each point to the point that precedes it.

    Despite the name, the values are per-hop distances, not a running total.

    Args:
        lat_longs: Sized, indexable sequence of (lat, long) pairs, for example
            an (n, 2) array.

    Returns:
        A list with one entry per input point. Entry 0 compares the first
        point with itself, so it is just the 1e-7 offset. An empty input
        yields an empty list.
    """
    # Empty input previously failed with IndexError on lat_longs[0].
    if len(lat_longs) == 0:
        return []

    hop_distances = []
    prev_lat, prev_long = lat_longs[0]

    for lat, long in lat_longs:
        # The 1e-7 offset keeps every entry strictly positive, so the values
        # can safely be used as a divisor.
        hop_distances.append(distance(lat, long, prev_lat, prev_long) + 1e-7)
        prev_lat, prev_long = lat, long

    return hop_distances

def time_zone_cal(s):
    """Map a clock-time string to a coarse time-of-day label.

    Args:
        s: Time string whose first ':'-separated field is the hour,
            e.g. '14:05:00'.

    Returns:
        'Early_Morning' for hours 7-9, 'Morning' for 10-12,
        'Afternoon' for 13-17, 'Evening' for 18-23.

    Raises:
        ValueError: If the hour field is not an integer, or the hour has no
            label (anything outside 7-23). Such hours previously surfaced as
            an UnboundLocalError on the return statement.
    """
    hour = int(s.split(':')[0])

    if 6 < hour <= 9:
        return 'Early_Morning'
    if 9 < hour <= 12:
        return 'Morning'
    if 12 < hour <= 17:
        return 'Afternoon'
    if 17 < hour <= 23:
        return 'Evening'

    raise ValueError(
        f"No time zone defined for hour {hour} (from {s!r}); expected an hour between 7 and 23."
    )

In [3]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.compose import ColumnTransformer

def get_processed_df(df, selection_columns=\
    ['time_zone','stay_duration',
     'wifi_count', 'edge_wifi_count', 'honk_duration',
     'edge_honk_duration', 'd_wifi_count', 'd_edge_wifi_count','d_honk_duration', 
     'd_edge_honk_duration', 'human_made', 'natural_land','road_exist_percent',
     'highly_populated_poi_exist', 'RSI', 'Is_Bus_stop','Is_Turn', 'Is_Signal',
     'Is_Congestion', 'Is_Adhoc']):
    """Derive the modelling features from a raw trip DataFrame.

    Args:
        df: Raw frame. Must contain 'lat', 'long', 'start_time', the wifi and
            honk columns, the POI columns ('school', 'medical', 'other_poi',
            'park') and the road columns ('high_way', 'two_way', 'one_way'),
            plus every name in ``selection_columns`` that is not derived here.
        selection_columns: Columns to keep in the result. The default list is
            only read, never modified.

    Returns:
        A new DataFrame holding ``selection_columns``. The input ``df`` is not
        modified.
    """
    # Work on a copy. The edge columns are overwritten below, so mutating the
    # caller's frame would divide them by the distance again on a second call.
    df = df.copy()

    # Normalise edge values by the hop distance (always > 0, see cumulative_distance).
    df['next_hop_distance'] = cumulative_distance(df[['lat', 'long']].values)

    df['edge_wifi_count'] = df['edge_wifi_count'] / df['next_hop_distance']
    df['d_edge_wifi_count'] = df['d_edge_wifi_count'] / df['next_hop_distance']

    df['edge_honk_duration'] = df['edge_honk_duration'].apply(float) / df['next_hop_distance']
    df['d_edge_honk_duration'] = df['d_edge_honk_duration'].apply(float) / df['next_hop_distance']

    df['honk_duration'] = df['honk_duration'].apply(float)
    df['d_honk_duration'] = df['d_honk_duration'].apply(float)

    # Bucket the start time into a time-of-day label.
    df['time_zone'] = df['start_time'].apply(time_zone_cal)

    # Rebuild meaningful features from the sparse POI / road columns.
    # 'Yes' only when the summed POI value rounds up to exactly 1.
    # NOTE(review): this assumes the four POI columns are fractions whose sum
    # lies in (0, 1] - confirm against the data.
    poi_total = df['school'] + df['medical'] + df['other_poi'] + df['park']
    df['highly_populated_poi_exist'] = poi_total.apply(lambda e: 'Yes' if np.ceil(e) == 1 else 'No')
    df['road_exist_percent'] = df['high_way'] + df['two_way'] + df['one_way']

    # Return only the requested columns, as an independent frame.
    new_df = df[selection_columns].copy()

    return new_df

def transform_categorical_features(data, one_hot_encoder, label_encoders, categorical_features, for_train=False):
    """Integer-encode, then one-hot encode, the categorical columns of ``data``.

    Args:
        data: 2-D array indexed as ``data[:, column]``. The categorical
            columns are overwritten in place with their integer codes.
        one_hot_encoder: Object exposing ``fit_transform`` and ``transform``.
        label_encoders: One fitted encoder per categorical column, in the same
            order as ``categorical_features``.
        categorical_features: Column indices of the categorical features.
        for_train: When true the one-hot encoder is (re)fitted on ``data``;
            otherwise it is only applied.

    Returns:
        Whatever the one-hot encoder returns for the integer-encoded data.

    Raises:
        ValueError: If the number of encoders and columns differ.
    """
    if len(categorical_features) != len(label_encoders):
        raise ValueError("Number of Label Encoders must be equal to number of categorical features.")

    # Replace each categorical column with its integer codes.
    for column, encoder in zip(categorical_features, label_encoders):
        data[:, column] = encoder.transform(data[:, column])

    # Choose fitting vs. plain application of the one-hot step.
    encode = one_hot_encoder.fit_transform if for_train else one_hot_encoder.transform
    return encode(data)

def get_label_encoder(data, feature):
    """Return a LabelEncoder fitted on column ``feature`` of the 2-D array ``data``."""
    # fit() hands back the encoder itself, so the fitted instance is returned directly.
    return sklearn.preprocessing.LabelEncoder().fit(data[:, feature])

def get_one_hot_encoder(data, categorical_features):
    """Return a fitted ColumnTransformer that one-hot encodes ``categorical_features``.

    All other columns are passed through unchanged (``remainder='passthrough'``).
    """
    one_hot_step = ('encoder', sklearn.preprocessing.OneHotEncoder(), categorical_features)
    transformer = ColumnTransformer([one_hot_step], remainder='passthrough')
    # fit() returns the transformer itself.
    return transformer.fit(data)

def get_labels_for(poi_column, df):
    """Integer-encode the target column ``poi_column`` of ``df``.

    Returns:
        A ``(class_names, labels)`` tuple: the encoder's ``classes_`` array and
        the encoded label for every row.
    """
    target_encoder = sklearn.preprocessing.LabelEncoder()
    # Fit and encode in one step.
    encoded_labels = target_encoder.fit_transform(df[poi_column])
    return target_encoder.classes_, encoded_labels


def convert_to_class_names(x, class_names):
    """Look up ``class_names[x]``: map encoded label(s) ``x`` back to class names."""
    return class_names[x]




# Analysis code starts below

In [4]:
# Model input columns, in the order used to build the feature matrix X.
feature_names = [
    'time_zone', 'stay_duration', 'wifi_count', 'edge_wifi_count', 'honk_duration',
    'edge_honk_duration', 'd_wifi_count', 'd_edge_wifi_count', 'd_honk_duration',
    'd_edge_honk_duration', 'human_made', 'natural_land', 'road_exist_percent',
    'highly_populated_poi_exist', 'RSI',
]

# Positions of the categorical (string-valued) columns within feature_names.
# Looked up by name so the indices stay correct if the list above changes.
categorical_features = [
    feature_names.index(name) for name in ('time_zone', 'highly_populated_poi_exist')
]

In [5]:
# Data location: directory holding the CSVs, relative to the notebook, and the
# train / test file names inside it.
parent_dir = r"../data/54ft/"
train_df_name = r"54ft_train.csv"
test_df_name = r"54ft_test.csv"

In [6]:
# Name of the label column used as the prediction target (y) below.
poi_column = 'Is_Congestion'

In [7]:
# Load the raw training CSV, then derive the feature/label frame used below.
train_csv_df = pd.read_csv(os.path.join(parent_dir, train_df_name))
train_df = get_processed_df(train_csv_df)

In [8]:
# Preview the first processed rows.
train_df.head()

Unnamed: 0,time_zone,stay_duration,wifi_count,edge_wifi_count,honk_duration,edge_honk_duration,d_wifi_count,d_edge_wifi_count,d_honk_duration,d_edge_honk_duration,human_made,natural_land,road_exist_percent,highly_populated_poi_exist,RSI,Is_Bus_stop,Is_Turn,Is_Signal,Is_Congestion,Is_Adhoc
0,Afternoon,6,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.111232,0.636805,0.251963,No,0.0,Not Bus_stop,Not Turn,Not Signal,Not Congestion,Adhoc
1,Afternoon,10,3,0.015468,1.0,0.015468,3,0.015468,1.0,0.015468,0.124869,0.666587,0.208544,No,2.715045,Bus_stop,Not Turn,Not Signal,Not Congestion,Not Adhoc
2,Afternoon,130,2,0.0278,5.0,0.0,4,0.018533,5.0,0.0,0.10036,0.743668,0.155972,No,2.422751,Bus_stop,Turn,Not Signal,Not Congestion,Not Adhoc
3,Afternoon,13,0,0.0,2.0,0.0,0,0.0,2.0,0.0,0.100911,0.730013,0.169076,No,3.04042,Not Bus_stop,Not Turn,Not Signal,Not Congestion,Adhoc
4,Afternoon,9,0,0.002818,2.0,0.005635,0,0.002818,2.0,0.005635,0.117394,0.487688,0.183064,Yes,1.78086,Bus_stop,Turn,Not Signal,Not Congestion,Not Adhoc


In [9]:
# Per-class row counts of the target column.
train_df[poi_column].value_counts()

Not Congestion    2246
Congestion         234
Name: Is_Congestion, dtype: int64

In [10]:
# Separate the model inputs (X) from the target labels (y); the copies keep
# train_df itself untouched by later edits to X or y.
X = train_df[feature_names].copy()
y = train_df[poi_column].copy()

# Using Resample

In [11]:
from sklearn.utils import resample

In [12]:
# Re-join features and target column-wise so whole rows can be resampled together.
resampling_df = pd.concat([X, y], axis=1)
resampling_df

Unnamed: 0,time_zone,stay_duration,wifi_count,edge_wifi_count,honk_duration,edge_honk_duration,d_wifi_count,d_edge_wifi_count,d_honk_duration,d_edge_honk_duration,human_made,natural_land,road_exist_percent,highly_populated_poi_exist,RSI,Is_Congestion
0,Afternoon,6,0,0.000000,0.0,0.000000,0,0.000000,0.0,0.000000,0.111232,0.636805,0.251963,No,0.000000,Not Congestion
1,Afternoon,10,3,0.015468,1.0,0.015468,3,0.015468,1.0,0.015468,0.124869,0.666587,0.208544,No,2.715045,Not Congestion
2,Afternoon,130,2,0.027800,5.0,0.000000,4,0.018533,5.0,0.000000,0.100360,0.743668,0.155972,No,2.422751,Not Congestion
3,Afternoon,13,0,0.000000,2.0,0.000000,0,0.000000,2.0,0.000000,0.100911,0.730013,0.169076,No,3.040420,Not Congestion
4,Afternoon,9,0,0.002818,2.0,0.005635,0,0.002818,2.0,0.005635,0.117394,0.487688,0.183064,Yes,1.780860,Not Congestion
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2475,Afternoon,9,0,0.000000,1.0,0.094916,0,0.000000,3.0,0.088136,0.068135,0.726080,0.140262,Yes,1.352558,Congestion
2476,Afternoon,6,0,0.009991,5.0,0.129889,0,0.009991,8.0,0.099914,0.093859,0.724056,0.123278,Yes,1.946738,Congestion
2477,Afternoon,30,4,0.006092,12.0,0.085291,5,0.004061,15.0,0.079199,0.070985,0.781468,0.147548,No,1.879751,Not Congestion
2478,Afternoon,5,2,0.008167,8.0,0.077588,3,0.004084,12.0,0.061253,0.130413,0.720888,0.148698,No,1.813848,Congestion


In [13]:
# Split the rows by class. The column is selected through poi_column so the
# filter follows the configured target instead of a hard-coded attribute name.
negative = resampling_df[resampling_df[poi_column] == "Not Congestion"]
positive = resampling_df[resampling_df[poi_column] == "Congestion"]

## Upsampling

In [14]:
# Draw minority rows with replacement until they match the majority count.
# random_state pins the draw so a re-run reproduces the same sample.
pos_up_sampled = resample(positive, replace=True, n_samples=len(negative), random_state=42)

In [15]:
# Balanced frame: every majority row plus the resampled minority rows.
upsampled = pd.concat([negative, pos_up_sampled])
upsampled.head()

Unnamed: 0,time_zone,stay_duration,wifi_count,edge_wifi_count,honk_duration,edge_honk_duration,d_wifi_count,d_edge_wifi_count,d_honk_duration,d_edge_honk_duration,human_made,natural_land,road_exist_percent,highly_populated_poi_exist,RSI,Is_Congestion
0,Afternoon,6,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.111232,0.636805,0.251963,No,0.0,Not Congestion
1,Afternoon,10,3,0.015468,1.0,0.015468,3,0.015468,1.0,0.015468,0.124869,0.666587,0.208544,No,2.715045,Not Congestion
2,Afternoon,130,2,0.0278,5.0,0.0,4,0.018533,5.0,0.0,0.10036,0.743668,0.155972,No,2.422751,Not Congestion
3,Afternoon,13,0,0.0,2.0,0.0,0,0.0,2.0,0.0,0.100911,0.730013,0.169076,No,3.04042,Not Congestion
4,Afternoon,9,0,0.002818,2.0,0.005635,0,0.002818,2.0,0.005635,0.117394,0.487688,0.183064,Yes,1.78086,Not Congestion


In [16]:
# Check the class counts after upsampling.
upsampled[poi_column].value_counts()

Not Congestion    2246
Congestion        2246
Name: Is_Congestion, dtype: int64

## Downsampling

In [17]:
# Pick a subset of majority rows the size of the minority class.
# replace=False: sampling with replacement would duplicate some majority rows
# while discarding others, wasting the little data that downsampling keeps.
# random_state pins the draw so a re-run reproduces the same sample.
neg_down_sampled = resample(negative, replace=False, n_samples=len(positive), random_state=42)

In [18]:
# Balanced frame: every minority row plus the sampled majority rows.
downsampled = pd.concat([positive, neg_down_sampled])
downsampled.head()

Unnamed: 0,time_zone,stay_duration,wifi_count,edge_wifi_count,honk_duration,edge_honk_duration,d_wifi_count,d_edge_wifi_count,d_honk_duration,d_edge_honk_duration,human_made,natural_land,road_exist_percent,highly_populated_poi_exist,RSI,Is_Congestion
29,Afternoon,11,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.101114,0.619321,0.279565,No,1.556189,Congestion
396,Early_Morning,22,0,0.004701,1.0,0.021156,0,0.004701,2.0,0.01998,0.079549,0.769076,0.151375,No,1.641501,Congestion
454,Morning,21,1,0.005885,10.0,0.132404,1,0.005885,12.0,0.126519,0.122956,0.691,0.160412,Yes,1.738751,Congestion
739,Morning,14,1,0.003415,1.0,0.011383,2,0.002277,1.0,0.011383,0.073264,0.774434,0.152302,No,1.4183,Congestion
779,Afternoon,350,16,0.0,25.0,0.0,16,0.0,25.0,0.0,0.168338,0.670192,0.16147,No,2.942144,Congestion


In [19]:
# Check the class counts after downsampling.
downsampled[poi_column].value_counts()

Not Congestion    234
Congestion        234
Name: Is_Congestion, dtype: int64

# Using SMOTE

In [20]:
from imblearn.over_sampling import SMOTENC # using SMOTENC since there is categorical values

In [21]:
# SMOTENC needs the positions of the categorical columns within X.
# random_state makes the synthetic samples reproducible across runs.
smote = SMOTENC(categorical_features=categorical_features, random_state=42)
smote

SMOTENC(categorical_features=[0, 13])

In [22]:
# fit_resample is the imbalanced-learn resampling entry point; the older
# fit_sample alias is deprecated and absent from newer releases.
X_resampled, y_resampled = smote.fit_resample(X, y)

# Shapes before and after oversampling.
print(X.shape)
print(y.shape)
print(X_resampled.shape)
print(y_resampled.shape)

(2480, 15)
(2480,)
(4492, 15)
(4492,)


In [23]:
# Class counts before SMOTE.
y.value_counts()

Not Congestion    2246
Congestion         234
Name: Is_Congestion, dtype: int64

In [24]:
# Class counts after SMOTE.
y_resampled.value_counts()

Not Congestion    2246
Congestion        2246
Name: Is_Congestion, dtype: int64

In [25]:
# Share each class would hold if the classes were perfectly balanced
# (1 / number of distinct classes).
1/len(y.value_counts())

0.5

In [26]:
# Per-class counts as a plain array, in value_counts() order.
y.value_counts().values

array([2246,  234], dtype=int64)

In [35]:
def is_imbalanced(series, verbose=False, margin=0.05):
    """Tell whether the class shares in ``series`` stray from a uniform split.

    With k distinct values the balanced share is ``1/k``. The series counts as
    imbalanced when the most frequent value holds more than ``1/k + margin``
    of the rows, or the least frequent value holds less than ``1/k - margin``.

    Args:
        series: pandas Series of class labels.
        verbose: When true, print the majority share, the limits and the
            most / least frequent values.
        margin: Allowed deviation from the balanced share, as a fraction.

    Returns:
        True if imbalanced, otherwise False. A series with no countable values
        is reported as not imbalanced.
    """
    # Count once; value_counts() orders the classes from most to least frequent.
    counts = series.value_counts()
    if counts.empty:
        return False

    total = series.shape[0]
    highest_count = int(counts.iloc[0])
    lowest_count = int(counts.iloc[-1])

    percent = highest_count / total
    lowest_percent = lowest_count / total
    limit = 1 / len(counts)

    if verbose:
        print(f"Percent: {percent*100}\nLimit: {limit*100}\nUpper Max: {(limit+margin)*100}\nLower Min: {(limit-margin)*100}")
        print(f"Highest: {counts.index[0]} = {highest_count}")
        print(f"Lowest: {counts.index[-1]} = {lowest_count}")

    # The lower bound has to be tested against the smallest class. The largest
    # class always holds at least 1/k of the rows, so testing it against the
    # lower bound could never fire.
    return bool(percent > limit + margin or lowest_percent < limit - margin)
        

In [36]:
# Original target labels.
is_imbalanced(y, verbose=True)

Percent: 90.56451612903226
Limit: 50.0
Upper Max: 55.00000000000001
Lower Min: 45.0
Highest: Not Congestion = 2246
Lowest: Congestion = 234


True

In [37]:
# Target labels after SMOTE oversampling.
is_imbalanced(y_resampled, verbose=True)

Percent: 50.0
Limit: 50.0
Upper Max: 55.00000000000001
Lower Min: 45.0
Highest: Not Congestion = 2246
Lowest: Congestion = 2246


False

In [38]:
# Toy case: four classes, one of which appears twice.
sr = pd.Series([0, 0, 1, 2, 3])
is_imbalanced(sr, verbose=True)

Percent: 40.0
Limit: 25.0
Upper Max: 30.0
Lower Min: 20.0
Highest: 0 = 2
Lowest: 1 = 1


True

In [39]:
# Toy case: six classes, five with 3 samples and one (value 3) with 2.
sr = pd.Series([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5])
is_imbalanced(sr, verbose=True)

Percent: 17.647058823529413
Limit: 16.666666666666664
Upper Max: 21.666666666666668
Lower Min: 11.666666666666666
Highest: 5 = 3
Lowest: 3 = 2


False

In [40]:
# Toy case: nine classes with uneven counts (value 5 appears five times).
sr = pd.Series([1, 2, 3, 2, 4, 5, 6, 7, 8, 8, 9, 9, 1, 3, 5, 5, 5, 5, 8])
is_imbalanced(sr, verbose=True)

Percent: 26.31578947368421
Limit: 11.11111111111111
Upper Max: 16.11111111111111
Lower Min: 6.11111111111111
Highest: 5 = 5
Lowest: 4 = 1


True

In [41]:
# Categorical feature at position 0 of X (the first entry of feature_names).
is_imbalanced(X.iloc[:, 0], verbose=True)

Percent: 33.66935483870967
Limit: 25.0
Upper Max: 30.0
Lower Min: 20.0
Highest: Morning = 835
Lowest: Evening = 379


True

In [42]:
# Categorical feature at position 13 of X.
is_imbalanced(X.iloc[:, 13], verbose=True)

Percent: 52.98387096774193
Limit: 50.0
Upper Max: 55.00000000000001
Lower Min: 45.0
Highest: Yes = 1314
Lowest: No = 1166


False