In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.compose import ColumnTransformer
from math import cos, asin, sqrt, pi
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score, classification_report
import os

In [2]:
#helper functions
def distance(lat1, lon1, lat2, lon2):
    p = pi/180
    a = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p) * cos(lat2*p) * (1-cos((lon2-lon1)*p))/2
    return 12742000 * asin(sqrt(a))

def cumulative_distance(lat_longs):
    l=[]
    prev_lat,prev_long=lat_longs[0]

    for lat,long in lat_longs:
        l.append(distance(lat,long,prev_lat,prev_long)+1e-7)
        prev_lat=lat
        prev_long=long
        
    return l

def time_zone_cal(s):
    hour=int(s.split(':')[0])

    if 6<hour<=9:
        time_zone='Early_Morning'
    elif 9<hour<=12:
        time_zone='Morning'
    elif 12<hour<=17:
        time_zone='Afternoon'
    elif 17<hour<=23:
        time_zone='Evening'
    return time_zone

In [3]:
def get_processed_df(df, selection_columns=\
    ['time_zone','stay_duration',
     'wifi_count', 'edge_wifi_count', 'honk_duration',
     'edge_honk_duration', 'd_wifi_count', 'd_edge_wifi_count','d_honk_duration', 
     'd_edge_honk_duration', 'human_made', 'natural_land','road_exist_percent',
     'highly_populated_poi_exist', 'RSI', 'Is_Bus_stop','Is_Turn', 'Is_Signal',
     'Is_Congestion', 'Is_Adhoc']):

    #Normalizing edge values with edge distance
    df['next_hop_distance']=cumulative_distance(df[['lat','long']].values)

    df['edge_wifi_count']=df.edge_wifi_count/df.next_hop_distance
    df['d_edge_wifi_count']=df.d_edge_wifi_count/df.next_hop_distance

    df['edge_honk_duration']=df.edge_honk_duration.apply(float)/df.next_hop_distance
    df['d_edge_honk_duration']=df.d_edge_honk_duration.apply(float)/df.next_hop_distance

    df['honk_duration']=df.honk_duration.apply(float)
    df['d_honk_duration']=df.d_honk_duration.apply(float)

    #calculating timeZone
    df['time_zone']=df.start_time.apply(time_zone_cal)

    #Rebuilding meaning full features from sparse features
    df['highly_populated_poi_exist']=(df.school+df.medical+df.other_poi+df.park).apply(lambda e:'Yes' if np.ceil(e)==1 else 'No')
    df['road_exist_percent']=df.high_way+df.two_way+df.one_way

    #New dataFrame is returned
    new_df=df[selection_columns].copy()
    
    return new_df

In [4]:
def transform_categorical_features(data, one_hot_encoder, label_encoders, categorical_features, for_train=False):
    
    if len(label_encoders) != len(categorical_features):
        raise ValueError("Number of Label Encoders must be equal to number of categorical features.")
    
    #converting categorical features in to integer encoding....
    for le, feature in zip(label_encoders, categorical_features):
        data[:, feature] = le.transform(data[:, feature])

    # OneHot Encoding Categorical features
    if for_train:
        data = one_hot_encoder.fit_transform(data)
    else:
        data = one_hot_encoder.transform(data)
    #data=data.toarray()
    return data

In [5]:
# def transform_categorical_features2():
#     train_df[feature_names[0]] = train_df[feature_names[0]].apply(lambda x: le_dicts[0].get(x, x))
#     train_df[feature_names[13]] = train_df[feature_names[13]].apply(lambda x: le_dicts[1].get(x, x))

In [6]:
def get_label_encoder(data, feature):
    le = sklearn.preprocessing.LabelEncoder()
    le.fit(data[:, feature])
    return le

def get_one_hot_encoder(data, categorical_features):
    encoder = ColumnTransformer([('encoder',sklearn.preprocessing.OneHotEncoder(),categorical_features)],remainder='passthrough')
    encoder.fit(data)
    return encoder

In [7]:
# def get_le_dict(data, feature):
#     le = sklearn.preprocessing.LabelEncoder()
#     le.fit(data[:, feature])
#     le_dict = dict(zip(le.classes_, le.transform(le.classes_)))
#     return le_dict

In [8]:
def get_labels_for(poi_column, df):
    labels=df[poi_column]
    le= sklearn.preprocessing.LabelEncoder()
    le.fit(labels)
    labels = le.transform(labels)
    class_names = le.classes_
    return class_names,labels

convert_to_class_names = lambda x, class_names: class_names[x]

In [9]:
def get_performance_stats(test_or_train, true_values, predicted_values):
    print(f"For {test_or_train}: ")
    print(f"\tAccuracy: {accuracy_score(true_values, predicted_values)}")
    print(f"\tPrecision: {precision_score(true_values, predicted_values,average='weighted')}")
    print(f"\tRecall: {recall_score(true_values, predicted_values,average='weighted')}")
    print(f"\tF1 score: {f1_score(true_values, predicted_values,average='weighted')}")
    print(classification_report(true_values, predicted_values))

In [10]:
def form_result_df(original_df, predictions):
    result_df = original_df.copy()
    result_df["instance_date"] = result_df.start_date
    result_df["instance_time"] = result_df.start_time
    result_df[f"Prediction {poi_column}"] = predictions
    return result_df[["instance_date", "instance_time", f"Prediction {poi_column}"]]

In [11]:
parent_dir = r"../data/54ft/"
train_df_name = r"54ft_train.csv"
test_df_name = r"54ft_test.csv"

In [12]:
poi_column = 'Is_Bus_stop'

In [13]:
train_csv_df = pd.read_csv(os.path.join(parent_dir, train_df_name))
train_df = get_processed_df(train_csv_df)

In [14]:
train_df.head()

Unnamed: 0,time_zone,stay_duration,wifi_count,edge_wifi_count,honk_duration,edge_honk_duration,d_wifi_count,d_edge_wifi_count,d_honk_duration,d_edge_honk_duration,human_made,natural_land,road_exist_percent,highly_populated_poi_exist,RSI,Is_Bus_stop,Is_Turn,Is_Signal,Is_Congestion,Is_Adhoc
0,Afternoon,6,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.111232,0.636805,0.251963,No,0.0,Not Bus_stop,Not Turn,Not Signal,Not Congestion,Adhoc
1,Afternoon,10,3,0.015468,1.0,0.015468,3,0.015468,1.0,0.015468,0.124869,0.666587,0.208544,No,2.715045,Bus_stop,Not Turn,Not Signal,Not Congestion,Not Adhoc
2,Afternoon,130,2,0.0278,5.0,0.0,4,0.018533,5.0,0.0,0.10036,0.743668,0.155972,No,2.422751,Bus_stop,Turn,Not Signal,Not Congestion,Not Adhoc
3,Afternoon,13,0,0.0,2.0,0.0,0,0.0,2.0,0.0,0.100911,0.730013,0.169076,No,3.04042,Not Bus_stop,Not Turn,Not Signal,Not Congestion,Adhoc
4,Afternoon,9,0,0.002818,2.0,0.005635,0,0.002818,2.0,0.005635,0.117394,0.487688,0.183064,Yes,1.78086,Bus_stop,Turn,Not Signal,Not Congestion,Not Adhoc


In [15]:
feature_names=['time_zone','stay_duration','wifi_count', 'edge_wifi_count', 'honk_duration',\
               'edge_honk_duration', 'd_wifi_count','d_edge_wifi_count','d_honk_duration',\
               'd_edge_honk_duration', 'human_made','natural_land','road_exist_percent',\
               'highly_populated_poi_exist', 'RSI']

categorical_features=[0,13]

In [16]:
train_array = train_df[feature_names].values
les = [get_label_encoder(train_array, feature) for feature in categorical_features]
# le_dicts = [get_le_dict(train_array, feature)  for feature in categorical_features]
encoder = get_one_hot_encoder(train_array, categorical_features)

In [31]:
print(train_df[feature_names].shape)
print(train_array.shape)

(2480, 15)
(2480, 15)


In [17]:
# train_df[feature_names[0]] = train_df[feature_names[0]].apply(lambda x: le_dicts[0].get(x, x))
# train_df[feature_names[13]] = train_df[feature_names[13]].apply(lambda x: le_dicts[1].get(x, x))

In [18]:
# train_df.head()

In [19]:
train_data = transform_categorical_features(train_array, encoder, les, categorical_features, for_train=True)
train_class_names,train_labels = get_labels_for(poi_column, train_df)
# train_data = train_df[feature_names].values

In [32]:
print(train_data.shape)

(2480, 19)


In [41]:
print(f"Train Array: {list(train_array[1])}")
print(f"Train data: {list(train_data[1])}")
train_df[feature_names].iloc[1:2]

Train Array: [0, 10, 3, 0.015468290768428074, 1.0, 0.015468290768428074, 3, 0.015468290768428074, 1.0, 0.015468290768428074, 0.12486881980167396, 0.6665868845577432, 0.20854429564058288, 0, 2.7150453559430217]
Train data: [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 10, 3, 0.015468290768428074, 1.0, 0.015468290768428074, 3, 0.015468290768428074, 1.0, 0.015468290768428074, 0.12486881980167396, 0.6665868845577432, 0.20854429564058288, 2.7150453559430217]


Unnamed: 0,time_zone,stay_duration,wifi_count,edge_wifi_count,honk_duration,edge_honk_duration,d_wifi_count,d_edge_wifi_count,d_honk_duration,d_edge_honk_duration,human_made,natural_land,road_exist_percent,highly_populated_poi_exist,RSI
1,Afternoon,10,3,0.015468,1.0,0.015468,3,0.015468,1.0,0.015468,0.124869,0.666587,0.208544,No,2.715045


In [43]:
model = RandomForestClassifier(n_estimators=20,max_depth=8)
model.fit(train_data, train_labels)
train_predictions = model.predict(train_data) # for training performance stat

In [44]:
test_csv_df = pd.read_csv(os.path.join(parent_dir, test_df_name))
test_df = get_processed_df(test_csv_df)

In [45]:
test_df.head()

Unnamed: 0,time_zone,stay_duration,wifi_count,edge_wifi_count,honk_duration,edge_honk_duration,d_wifi_count,d_edge_wifi_count,d_honk_duration,d_edge_honk_duration,human_made,natural_land,road_exist_percent,highly_populated_poi_exist,RSI,Is_Bus_stop,Is_Turn,Is_Signal,Is_Congestion,Is_Adhoc
0,Morning,105,7,0.0,69.0,0.0,7,0.0,69.0,0.0,0.122769,0.692291,0.182053,Yes,0.0,Bus_stop,Turn,Not Signal,Congestion,Not Adhoc
1,Morning,12,2,0.024291,9.0,0.083285,2,0.024291,11.0,0.076344,0.107162,0.766882,0.125955,No,1.289747,Not Bus_stop,Not Turn,Not Signal,Congestion,Not Adhoc
2,Morning,4,0,0.005824,0.0,0.049507,0,0.005824,1.0,0.048051,0.093122,0.709518,0.132148,Yes,1.302277,Not Bus_stop,Not Turn,Not Signal,Congestion,Not Adhoc
3,Morning,11,1,0.005213,1.0,0.049528,1,0.005213,1.0,0.049528,0.0639,0.770263,0.160132,Yes,2.083196,Bus_stop,Not Turn,Not Signal,Not Congestion,Not Adhoc
4,Morning,25,3,0.024812,7.0,0.060651,4,0.022055,9.0,0.055137,0.130473,0.646021,0.220993,Yes,1.729536,Bus_stop,Not Turn,Signal,Not Congestion,Not Adhoc


In [46]:
test_array = test_df[feature_names].values

In [47]:
test_data = transform_categorical_features(test_array, encoder, les, categorical_features)
test_class_names, test_labels = get_labels_for(poi_column, test_df)

In [48]:
print(f"Train Array: {list(test_array[1])}")
print(f"Train data: {list(test_data[1])}")
test_df[feature_names].iloc[1:2]

Train Array: [3, 12, 2, 0.02429142213387779, 9.0, 0.08328487588758099, 2, 0.02429142213387779, 11.0, 0.07634446956361592, 0.10716247530839497, 0.7668821460142384, 0.12595537867736667, 0, 1.2897473552549503]
Train data: [0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 12, 2, 0.02429142213387779, 9.0, 0.08328487588758099, 2, 0.02429142213387779, 11.0, 0.07634446956361592, 0.10716247530839497, 0.7668821460142384, 0.12595537867736667, 1.2897473552549503]


Unnamed: 0,time_zone,stay_duration,wifi_count,edge_wifi_count,honk_duration,edge_honk_duration,d_wifi_count,d_edge_wifi_count,d_honk_duration,d_edge_honk_duration,human_made,natural_land,road_exist_percent,highly_populated_poi_exist,RSI
1,Morning,12,2,0.024291,9.0,0.083285,2,0.024291,11.0,0.076344,0.107162,0.766882,0.125955,No,1.289747


In [26]:
assert test_data.shape[1] == train_data.shape[1]

In [27]:
train_set = set()
test_set = set()
for i in train_df.time_zone:
    train_set.add(i)
for i in test_df.time_zone:
    test_set.add(i)
    
print(train_set)
print(test_set)
print(train_set-test_set)

{'Early_Morning', 'Evening', 'Afternoon', 'Morning'}
{'Evening', 'Afternoon', 'Morning'}
{'Early_Morning'}


In [28]:
test_predictions = model.predict(test_data)

In [29]:
get_performance_stats("test", test_labels, test_predictions)

For test: 
	Accuracy: 0.8088235294117647
	Precision: 0.8144257703081231
	Recall: 0.8088235294117647
	F1 score: 0.8025210084033613


In [30]:
get_performance_stats("train", train_labels, train_predictions)

For train: 
	Accuracy: 0.9254032258064516
	Precision: 0.9254703449749915
	Recall: 0.9254032258064516
	F1 score: 0.9249866236496553


In [None]:
test_df[poi_column]

In [None]:
dates = test_csv_df.start_date
dates

In [None]:
test_predictions

In [None]:
# convert_to_class_names(test_predictions, test_class_names)
result_df = form_result_df(test_csv_df, convert_to_class_names(test_predictions, test_class_names))
result_df

In [None]:
convert_to_class_names(test_predictions, test_class_names)