# Helper Functions and Imports

In [3]:
import glob
import os
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTENC # using SMOTENC since there is categorical values

In [4]:
from math import cos, asin, sqrt, pi

#helper functions
def distance(lat1, lon1, lat2, lon2):
    """Return the great-circle (haversine) distance between two points, in metres.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Distance along the surface of a sphere with the Earth's mean radius.
    """
    p = pi/180  # degrees -> radians
    a = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p) * cos(lat2*p) * (1-cos((lon2-lon1)*p))/2
    # Floating-point rounding can push the haversine term marginally outside
    # [0, 1] (e.g. for antipodal points), which would make asin/sqrt raise.
    a = min(1.0, max(0.0, a))
    # 12742000 m is the Earth's mean diameter (2 * 6371000 m).
    return 12742000 * asin(sqrt(a))

def cumulative_distance(lat_longs):
    """Return the distance from each point to the point before it, in order.

    Despite the name, the values are per-hop distances, not a running total.
    The first point is compared with itself, so its entry is just the 1e-7
    offset.

    Parameters
    ----------
    lat_longs : sequence of (lat, long) pairs
        Ordered coordinates; each pair is forwarded to ``distance``.

    Returns
    -------
    list of float
        One entry per input point. Every entry has 1e-7 added so it can be
        used as a divisor without hitting zero. Empty input yields ``[]``.
    """
    # Guard: indexing element 0 of an empty sequence would raise IndexError.
    if len(lat_longs) == 0:
        return []

    hops = []
    prev_lat, prev_long = lat_longs[0]

    for lat, long in lat_longs:
        hops.append(distance(lat, long, prev_lat, prev_long) + 1e-7)
        prev_lat, prev_long = lat, long

    return hops

def time_zone_cal(s):
    """Map a clock-time string to a coarse time-of-day label.

    Parameters
    ----------
    s : str
        Time string whose first ':'-separated field is the hour (0-23),
        e.g. ``'13:45:10'``.

    Returns
    -------
    str
        ``'Early_Morning'`` for hours 7-9, ``'Morning'`` for 10-12,
        ``'Afternoon'`` for 13-17, ``'Evening'`` for 18-23, and ``'Night'``
        for hours 0-6.

    Raises
    ------
    ValueError
        If the hour field is not an integer or lies outside 0-23.
    """
    hour = int(s.split(':')[0])

    if not 0 <= hour <= 23:
        raise ValueError(f"hour must be in 0..23, got {hour} from {s!r}")

    # Hours 0-6 previously matched no branch and crashed with
    # UnboundLocalError; they now get their own label.
    if hour <= 6:
        return 'Night'
    if hour <= 9:
        return 'Early_Morning'
    if hour <= 12:
        return 'Morning'
    if hour <= 17:
        return 'Afternoon'
    return 'Evening'

In [5]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.compose import ColumnTransformer

def get_processed_df(df, selection_columns=\
    ['time_zone','stay_duration',
     'wifi_count', 'edge_wifi_count', 'honk_duration',
     'edge_honk_duration', 'd_wifi_count', 'd_edge_wifi_count','d_honk_duration', 
     'd_edge_honk_duration', 'human_made', 'natural_land','road_exist_percent',
     'highly_populated_poi_exist', 'RSI', 'Is_Bus_stop','Is_Turn', 'Is_Signal',
     'Is_Congestion', 'Is_Adhoc']):
    """Derive the model features from a raw frame and return the selected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Columns read here: ``lat``, ``long``, ``edge_wifi_count``,
        ``d_edge_wifi_count``, ``edge_honk_duration``, ``d_edge_honk_duration``,
        ``honk_duration``, ``d_honk_duration``, ``start_time``, ``school``,
        ``medical``, ``other_poi``, ``park``, ``high_way``, ``two_way``,
        ``one_way``, plus anything named in ``selection_columns``.
    selection_columns : list of str
        Columns of the processed frame to return. The default list is never
        modified by this function.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only ``selection_columns``.

    Notes
    -----
    The input frame is left untouched. The original implementation overwrote
    the raw ``edge_*`` columns in place, so calling it twice on the same frame
    divided them by the hop distance twice.
    """
    # Work on a private copy so the caller's frame keeps its raw values.
    df = df.copy()

    # Normalise edge values by the distance to the previous point.
    df['next_hop_distance']=cumulative_distance(df[['lat','long']].values)

    df['edge_wifi_count']=df.edge_wifi_count/df.next_hop_distance
    df['d_edge_wifi_count']=df.d_edge_wifi_count/df.next_hop_distance

    df['edge_honk_duration']=df.edge_honk_duration.apply(float)/df.next_hop_distance
    df['d_edge_honk_duration']=df.d_edge_honk_duration.apply(float)/df.next_hop_distance

    df['honk_duration']=df.honk_duration.apply(float)
    df['d_honk_duration']=df.d_honk_duration.apply(float)

    # Bucket the start time into a time-of-day label.
    df['time_zone']=df.start_time.apply(time_zone_cal)

    # Rebuild meaningful features from the sparse per-category columns.
    # NOTE(review): 'Yes' only when the POI sum lies in (0, 1]; a sum above 1
    # maps to 'No' -- confirm the inputs are fractions that cannot exceed 1.
    df['highly_populated_poi_exist']=(df.school+df.medical+df.other_poi+df.park).apply(lambda e:'Yes' if np.ceil(e)==1 else 'No')
    df['road_exist_percent']=df.high_way+df.two_way+df.one_way

    # Return an independent frame holding only the requested columns.
    return df[selection_columns].copy()

def transform_categorical_features(data, one_hot_encoder, label_encoders, categorical_features, for_train=False):
    """Label-encode, then one-hot-encode, the categorical columns of ``data``.

    Parameters
    ----------
    data : 2-D array supporting ``data[:, i]`` indexing and ``.copy()``
        Feature matrix. It is not modified; encoding happens on a copy.
    one_hot_encoder : object with ``fit_transform`` / ``transform``
        Applied to the label-encoded matrix.
    label_encoders : sequence of fitted encoders with ``transform``
        One per entry of ``categorical_features``, in the same order.
    categorical_features : sequence of int
        Column indices of the categorical features.
    for_train : bool, default False
        When True the one-hot encoder is (re)fitted on the encoded matrix via
        ``fit_transform``; otherwise only ``transform`` is called.

    Returns
    -------
    The output of the one-hot encoder.

    Raises
    ------
    ValueError
        If the number of label encoders differs from the number of
        categorical features.
    """
    if len(label_encoders) != len(categorical_features):
        raise ValueError("Number of Label Encoders must be equal to number of categorical features.")

    # Copy first: writing the integer codes into the caller's array made a
    # second call on the same array fail (the labels were already replaced).
    data = data.copy()

    # Replace each categorical column with its integer codes.
    for le, feature in zip(label_encoders, categorical_features):
        data[:, feature] = le.transform(data[:, feature])

    # One-hot encode the integer-coded categorical columns.
    if for_train:
        return one_hot_encoder.fit_transform(data)
    return one_hot_encoder.transform(data)

def get_label_encoder(data, feature):
    """Return a LabelEncoder fitted on column ``feature`` of the 2-D array ``data``."""
    column = data[:, feature]
    # LabelEncoder.fit returns the fitted encoder itself.
    return sklearn.preprocessing.LabelEncoder().fit(column)

def get_one_hot_encoder(data, categorical_features):
    """Return a ColumnTransformer fitted on ``data``.

    The transformer one-hot encodes the columns listed in
    ``categorical_features`` and passes every other column through unchanged.
    """
    one_hot = sklearn.preprocessing.OneHotEncoder()
    transformer = ColumnTransformer(
        [('encoder', one_hot, categorical_features)],
        remainder='passthrough',
    )
    # ColumnTransformer.fit returns the fitted transformer itself.
    return transformer.fit(data)

def get_labels_for(poi_column, df):
    """Integer-encode the target column ``poi_column`` of ``df``.

    A fresh LabelEncoder is fitted on every call.

    Returns
    -------
    (class_names, labels)
        ``class_names`` is the encoder's ``classes_`` array; ``labels`` holds
        the integer code of each row, indexing into ``class_names``.
    """
    encoder = sklearn.preprocessing.LabelEncoder()
    encoded = encoder.fit_transform(df[poi_column])
    return encoder.classes_, encoded


def convert_to_class_names(x, class_names):
    """Look up ``class_names[x]``: map integer label(s) ``x`` back to class name(s)."""
    return class_names[x]


In [6]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score, classification_report

def get_performance_stats(test_or_train, true_values, predicted_values):
    """Print accuracy, weighted precision/recall/F1 and the classification report.

    ``test_or_train`` is only used as the label in the heading line.
    """
    weighted = {"average": "weighted"}
    scorers = (
        ("Accuracy", accuracy_score, {}),
        ("Precision", precision_score, weighted),
        ("Recall", recall_score, weighted),
        ("F1 score", f1_score, weighted),
    )

    print(f"For {test_or_train}: ")
    # Each metric is computed and printed in turn, one tab-indented line each.
    for label, scorer, options in scorers:
        print(f"\t{label}: {scorer(true_values, predicted_values, **options)}")
    print(classification_report(true_values, predicted_values))

# Data Preprocessing

## Feature selection

In [7]:
# Model input columns, in the order they appear in the feature matrix.
feature_names = [
    'time_zone', 'stay_duration', 'wifi_count', 'edge_wifi_count', 'honk_duration',
    'edge_honk_duration', 'd_wifi_count', 'd_edge_wifi_count', 'd_honk_duration',
    'd_edge_honk_duration', 'human_made', 'natural_land', 'road_exist_percent',
    'highly_populated_poi_exist', 'RSI',
]

# Positions of the string-valued (categorical) columns within feature_names.
categorical_features = [
    feature_names.index(name) for name in ('time_zone', 'highly_populated_poi_exist')
]

In [8]:
# Target column to predict; the same name is used to pull labels from both the train and test frames.
poi_column = 'Is_Congestion' # the column to be predicted

## Loading data

In [9]:
# Relative location and file names of the pre-split train / test CSVs.
parent_dir = "../data/54ft/"
train_df_name = "54ft_train.csv"
test_df_name = "54ft_test.csv"

In [10]:
# NOTE(review): this cell re-assigns feature_names and categorical_features with
# exactly the same values as the "Feature selection" cell above; it is redundant
# and the two copies can drift apart if only one is edited.
feature_names=['time_zone','stay_duration','wifi_count', 'edge_wifi_count', 'honk_duration',\
               'edge_honk_duration', 'd_wifi_count','d_edge_wifi_count','d_honk_duration',\
               'd_edge_honk_duration', 'human_made','natural_land','road_exist_percent',\
               'highly_populated_poi_exist', 'RSI']

# Column positions (within feature_names) of 'time_zone' and 'highly_populated_poi_exist'.
categorical_features=[0,13]

### Loading Train data

In [11]:
# Read the raw training CSV, then derive the engineered feature / label columns.
train_csv_df = pd.read_csv(os.path.join(parent_dir, train_df_name))
train_df = get_processed_df(train_csv_df)

In [12]:
# Preview the first rows of the processed training frame.
train_df.head()

Unnamed: 0,time_zone,stay_duration,wifi_count,edge_wifi_count,honk_duration,edge_honk_duration,d_wifi_count,d_edge_wifi_count,d_honk_duration,d_edge_honk_duration,human_made,natural_land,road_exist_percent,highly_populated_poi_exist,RSI,Is_Bus_stop,Is_Turn,Is_Signal,Is_Congestion,Is_Adhoc
0,Afternoon,6,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.111232,0.636805,0.251963,No,0.0,Not Bus_stop,Not Turn,Not Signal,Not Congestion,Adhoc
1,Afternoon,10,3,0.015468,1.0,0.015468,3,0.015468,1.0,0.015468,0.124869,0.666587,0.208544,No,2.715045,Bus_stop,Not Turn,Not Signal,Not Congestion,Not Adhoc
2,Afternoon,130,2,0.0278,5.0,0.0,4,0.018533,5.0,0.0,0.10036,0.743668,0.155972,No,2.422751,Bus_stop,Turn,Not Signal,Not Congestion,Not Adhoc
3,Afternoon,13,0,0.0,2.0,0.0,0,0.0,2.0,0.0,0.100911,0.730013,0.169076,No,3.04042,Not Bus_stop,Not Turn,Not Signal,Not Congestion,Adhoc
4,Afternoon,9,0,0.002818,2.0,0.005635,0,0.002818,2.0,0.005635,0.117394,0.487688,0.183064,Yes,1.78086,Bus_stop,Turn,Not Signal,Not Congestion,Not Adhoc


### Loading Test data

In [13]:
# Read the raw test CSV and run it through the same feature engineering as the training data.
test_csv_df = pd.read_csv(os.path.join(parent_dir, test_df_name))
test_df = get_processed_df(test_csv_df)

In [14]:
# Preview the first rows of the processed test frame.
test_df.head()

Unnamed: 0,time_zone,stay_duration,wifi_count,edge_wifi_count,honk_duration,edge_honk_duration,d_wifi_count,d_edge_wifi_count,d_honk_duration,d_edge_honk_duration,human_made,natural_land,road_exist_percent,highly_populated_poi_exist,RSI,Is_Bus_stop,Is_Turn,Is_Signal,Is_Congestion,Is_Adhoc
0,Morning,105,7,0.0,69.0,0.0,7,0.0,69.0,0.0,0.122769,0.692291,0.182053,Yes,0.0,Bus_stop,Turn,Not Signal,Congestion,Not Adhoc
1,Morning,12,2,0.024291,9.0,0.083285,2,0.024291,11.0,0.076344,0.107162,0.766882,0.125955,No,1.289747,Not Bus_stop,Not Turn,Not Signal,Congestion,Not Adhoc
2,Morning,4,0,0.005824,0.0,0.049507,0,0.005824,1.0,0.048051,0.093122,0.709518,0.132148,Yes,1.302277,Not Bus_stop,Not Turn,Not Signal,Congestion,Not Adhoc
3,Morning,11,1,0.005213,1.0,0.049528,1,0.005213,1.0,0.049528,0.0639,0.770263,0.160132,Yes,2.083196,Bus_stop,Not Turn,Not Signal,Not Congestion,Not Adhoc
4,Morning,25,3,0.024812,7.0,0.060651,4,0.022055,9.0,0.055137,0.130473,0.646021,0.220993,Yes,1.729536,Bus_stop,Not Turn,Signal,Not Congestion,Not Adhoc


### Separating target column and input columns

In [15]:
# X holds the model input columns; y holds the target column named by poi_column.
# Copies are taken so later steps cannot write back into train_df.
X = train_df[feature_names].copy()
y = train_df[poi_column].copy()

## Using SMOTE

In [16]:
# SMOTENC handles the mix of categorical and numeric columns.
# Seeding it makes the synthetic rows, and every metric computed from them,
# reproducible across kernel restarts.
smote = SMOTENC(categorical_features=categorical_features, random_state=42)
smote

SMOTENC(categorical_features=[0, 13])

In [17]:
# `fit_sample` was deprecated and later removed from imbalanced-learn;
# `fit_resample` is the supported method and returns the same (X, y) pair.
X_resampled, y_resampled = smote.fit_resample(X, y)

# Shapes before and after oversampling.
print(X.shape)
print(y.shape)
print(X_resampled.shape)
print(y_resampled.shape)

(2480, 15)
(2480,)
(4492, 15)
(4492,)


## Categorical data Transformation

### For train data

In [18]:
# Convert the resampled frame to a plain array, then fit one LabelEncoder per
# categorical column and a one-hot ColumnTransformer on it.
train_array = X_resampled.values
les = [get_label_encoder(train_array, feature) for feature in categorical_features]
encoder = get_one_hot_encoder(train_array, categorical_features)

In [19]:
# for_train=True makes the one-hot encoder refit on the label-encoded matrix.
train_data = transform_categorical_features(train_array, encoder, les, categorical_features, for_train=True)
# get_labels_for indexes by column name, so the resampled target is wrapped in a DataFrame first.
train_class_names,train_labels = get_labels_for(poi_column, pd.DataFrame(y_resampled))

### For test data

In [20]:
# Same input columns, in the same order, as the training matrix.
test_array = test_df[feature_names].values

In [21]:
# Encode the test matrix with the encoders fitted on the training data (no refit).
test_data = transform_categorical_features(test_array, encoder, les, categorical_features)
test_class_names, test_labels = get_labels_for(poi_column, test_df)

# get_labels_for fits a fresh LabelEncoder on each call, so the integer codes
# only mean the same thing in train and test if both saw the same classes.
assert list(test_class_names) == list(train_class_names), \
    "train and test label encodings disagree; integer labels are not comparable"

# Training

In [22]:
# Fixed random_state so the fitted forest, and the metrics reported below,
# are reproducible under Restart & Run All.
model = RandomForestClassifier(n_estimators=20, max_depth=8, random_state=42)
model.fit(train_data, train_labels)
train_predictions = model.predict(train_data)  # predictions on the training set, for the training performance stats

# Testing

In [23]:
# Sanity check: the encoded test matrix must have the same number of columns as the matrix the model was trained on.
assert test_data.shape[1] == train_data.shape[1]

In [24]:
# Predict integer class labels for the held-out test set.
test_predictions = model.predict(test_data)

# Performance Stat

## Training Performance

In [25]:
# Metrics on the SMOTE-resampled training data the model was fitted on.
get_performance_stats("train", train_labels, train_predictions)

For train: 
	Accuracy: 0.9004897595725735
	Precision: 0.903468998158077
	Recall: 0.9004897595725735
	F1 score: 0.9003057224844179
              precision    recall  f1-score   support

           0       0.87      0.94      0.90      2246
           1       0.94      0.86      0.90      2246

    accuracy                           0.90      4492
   macro avg       0.90      0.90      0.90      4492
weighted avg       0.90      0.90      0.90      4492



## Testing Performance

In [26]:
# Metrics on the held-out test set, which was not resampled.
get_performance_stats("test", test_labels, test_predictions)

For test: 
	Accuracy: 0.55
	Precision: 0.5491686157353722
	Recall: 0.55
	F1 score: 0.5494882754293781
              precision    recall  f1-score   support

           0       0.52      0.50      0.51       631
           1       0.58      0.59      0.59       729

    accuracy                           0.55      1360
   macro avg       0.55      0.55      0.55      1360
weighted avg       0.55      0.55      0.55      1360



In [27]:
from sklearn.metrics import confusion_matrix

In [28]:
# Rows are true classes, columns are predicted classes (scikit-learn convention).
confusion_matrix(test_labels, test_predictions)

array([[316, 315],
       [297, 432]])

In [29]:
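# Bad results: test accuracy (0.55) is far below training accuracy (0.90), so the model does not generalize to the test set.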