# Model Making Process

## Preprocess the data

### Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.metrics import classification_report, ConfusionMatrixDisplay

import platform
import os
import time

# Report the interpreter and library versions this notebook is running with,
# one "<label> <version>" line per component.
version_report = (
    ('Python ver:', platform.python_version()),
    ('Scikit-learn ver:', sklearn.__version__),
    ('Pandas ver:', pd.__version__),
    ('Numpy ver:', np.__version__),
)
for component_label, component_version in version_report:
    print(component_label, component_version)

### Access the Dataset

In [None]:
# Read the two transformed CSVs (per-feature means and per-feature variances).
# NOTE: paths are resolved from the current working directory, so the notebook
# must be started from its own folder for '../Transformed dataset' to be found.
# `os.getcwd()` states this directly; the quoted '__file__' trick it replaces
# was a plain string literal and resolved to the same directory.
cwd = os.getcwd()
dataset_dir = os.path.join(cwd, '..', 'Transformed dataset')
mean_path = os.path.join(dataset_dir, 'transformed_mean.csv')
var_path = os.path.join(dataset_dir, 'transformed_var.csv')

mean_csv = pd.read_csv(mean_path)
var_csv = pd.read_csv(var_path)

In [None]:
# Preview the first rows of the table loaded from transformed_mean.csv.
mean_csv.head()

In [None]:
# Per-column count of non-missing values in the mean table (a quick check for gaps).
mean_csv.count()

In [None]:
# Preview the first rows of the table loaded from transformed_var.csv.
var_csv.head()

In [None]:
# Per-column count of non-missing values in the variance table.
var_csv.count()

### Merging the dataset into a single DataFrame

In [None]:
# Prepare the mean table for merging:
#   1. drop its "label" column (the variance table's label is the one kept),
#   2. tag every remaining column with a "_mean" suffix,
#   3. give the suffixed row-id column ("Unnamed: 0_mean") the name "index",
#      which is the merge key used later.
mean_csv = mean_csv.drop(columns=["label"])
mean_csv = mean_csv.add_suffix("_mean")
# Rename by exact column name: a substring replace over all column names
# could also rewrite any other column that happens to contain the same text.
mean_csv = mean_csv.rename(columns={"Unnamed: 0_mean": "index"})
mean_csv.columns

In [None]:
# Prepare the variance table for merging: tag every column with a "_var"
# suffix, then restore the two non-feature columns to their plain names
# ("index" is the merge key, "label" is the classification target).
var_csv = var_csv.add_suffix("_var")
# Rename by exact column name: a substring replace over all column names
# could also rewrite e.g. a feature whose name merely ends in "label_var".
var_csv = var_csv.rename(
    columns={
        "Unnamed: 0_var": "index",
        "label_var": "label",
    }
)
var_csv.columns

In [None]:
# Join the mean and variance features row-by-row on the shared "index" key
# (merge's default join type keeps only keys present in both tables).
merged = pd.merge(mean_csv, var_csv, on='index')
merged.columns

In [None]:
# Per-column count of non-missing values after the merge (a quick check that no rows were lost).
merged.count()

### Optional cell

Run this cell only if you want to export the merged DataFrame to a CSV file, e.g. to continue working on another machine or in another workspace.

In [None]:
# merged.to_csv('merged.csv', index=True) # Keep in mind that the new CSV will include its index column.

## Making the model

### Assign variables

In [None]:
# Everything except the target ("label") and the row id ("index") is a model input.
features = merged.drop(columns=['label', 'index'])
features.columns

In [None]:
# Feature matrix as a NumPy array, one row per sample.
X = features.to_numpy()
X

In [None]:
# Target vector as a NumPy array, aligned row-for-row with X.
Y = merged['label'].to_numpy()
Y

### K-Fold test

In [None]:
# Candidate classifiers, all with default hyper-parameters.
# Each entry is (short name for plots/prints, estimator instance, full display name).
model_to_evaluate = [
    ('KNN', KNeighborsClassifier(), 'K-nearest Neighbor'),
    ('DT', DecisionTreeClassifier(), 'DecisionTree'),
    ('NB', GaussianNB(), 'Gaussian Naive-Bayes'),
    ('SVM', SVC(), 'Support Vector Machine'),
]

In [None]:
# Cross-validate every candidate model and collect the per-fold accuracies.
results = []  # one array of fold accuracies per model, in evaluation order
names = []    # short model names, parallel to `results`

# Build the splitter once instead of once per model: its configuration is
# loop-invariant, and the fixed integer random_state makes the shuffled folds
# reproducible, so every classifier is scored on the same 3 stratified folds.
kfold = StratifiedKFold(n_splits=3, random_state=1, shuffle=True)

for name, model, full_name in model_to_evaluate:
    cv_result = cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')
    results.append(cv_result)
    names.append(name)
    # Message is in Indonesian: "Accuracy <name>: <mean> with std dev <std>".
    print(f'Akurasi {name}: {cv_result.mean()} dengan std dev {cv_result.std()}')
    print()

In [None]:
# Box plot of the cross-validation accuracies, one box per classifier.
plt.figure(figsize=(6,6))
# Fold accuracies are fractions; scale them to percent to match the y-axis label.
plt.boxplot([fold_scores * 100 for fold_scores in results])
# Name the boxes through the x ticks rather than boxplot's `labels=` argument,
# which is deprecated in recent Matplotlib releases. Boxes are drawn at
# positions 1..N by default, hence the 1-based tick positions.
plt.xticks(range(1, len(names) + 1), names)
# Title is in Indonesian: "Validation accuracy results with 3-fold Stratified K-Fold".
plt.title('Hasil Uji Akurasi Validasi dengan Stratified K-Fold Lipat 3', fontdict={'fontweight': 'normal', 'fontsize':'14'})
plt.xlabel('Classifier')
plt.ylabel('CV Result (in %)')
# plt.savefig('CV Result.png', dpi=300)
plt.show()


### Test each of the algorithms

In [None]:
# Hold out 10% of the samples for validation; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): the split is not stratified (no `stratify=Y`), unlike the
# cross-validation above — confirm the class balance of the hold-out set is acceptable.
X_train, X_val, Y_train, Y_val = train_test_split(
    X,
    Y,
    test_size=0.10,
    random_state=1,
)

In [None]:
# Axis labels for the confusion matrices, in Indonesian:
# 'Tidak Jatuh' = "no fall", 'Jatuh' = "fall".
# NOTE(review): presumably the labels in Y sort as [no-fall, fall] (e.g. 0/1) so
# that this order matches the matrix rows/columns — verify against the dataset.
plot_label = ['Tidak Jatuh', 'Jatuh']

In [None]:
# Fit each candidate on the training split and evaluate it on the hold-out split:
# print a classification report and draw a confusion matrix per model.
for name, model, full_name in model_to_evaluate:
    # `clf` is the same object stored in model_to_evaluate, so that entry is
    # left fitted on X_train after this loop.
    clf = model
    clf.fit(X_train, Y_train)
    predictions = clf.predict(X_val)
    # Identify the model before its report; otherwise the consecutive reports
    # cannot be told apart in the cell output.
    print(f'Classification Report of {full_name}')
    print(classification_report(Y_val, predictions))
    disp = ConfusionMatrixDisplay.from_predictions(Y_val, predictions, display_labels=plot_label)
    disp.ax_.set_title(f'Confusion Matrix Result of {full_name}')

## Voting Classifier (optional)

In [None]:
# from sklearn.ensemble import VotingClassifier

In [None]:
# dt = DecisionTreeClassifier()
# knn = KNeighborsClassifier()
# nb = GaussianNB()
# svm = SVC(probability=True)  # soft voting needs predict_proba, which SVC only provides with probability=True

# final_model = VotingClassifier(
#     estimators=[
#         ("decision_tree", dt),
#         ("k-nearest_neighbor", knn),
#         ("gaussnb", nb),
#         ("svm", svm)  
#     ],
#     voting="soft",
# )

In [None]:
# final_model.fit(X_train, Y_train)

In [None]:
# start_time = time.time()
# voting_prediction = final_model.predict(X_val)
# end_time = time.time()

# print(f'Predictions took {(end_time-start_time)*1e3} ms')
# print(classification_report(Y_val, voting_prediction))
# ConfusionMatrixDisplay.from_predictions(Y_val, voting_prediction)