### Train support vector machine, decision tree, and random forest
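This notebook first splits the dataset into training and test sets (no separate validation set is created here), balancing the training set by downsampling. It then uses a few ML models (SVM, DT, RF, and naive Bayes) to classify the samples.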

In [1]:
import numpy as np
from numpy import genfromtxt
# Parse the comma-delimited dataset into a NumPy array.
# NOTE(review): the path is relative to the notebook's working directory.
my_data = genfromtxt('data/smell_dataset.csv', delimiter=',')

In [2]:
# Row 0 is skipped — presumably the CSV header line, which genfromtxt cannot
# parse as numbers (TODO confirm against the file).
# Feature matrix: columns 3..13 (11 columns).
X = my_data[1:,3:14]
# X = my_data[1:,[11]]
# Labels: column 14 onward, kept 2-D. The helpers below treat the values as
# 0/1 class labels after flattening — assumes exactly one label column.
Y = my_data[1:,14:]

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit

def get_data(X, Y):
    """Split X and Y into a single stratified train/test partition.

    One StratifiedShuffleSplit draw is taken with test_size=0.3 and
    random_state=0, stratified on Y.

    Args:
        X: indexable feature array.
        Y: indexable label array aligned with X.

    Returns:
        Tuple (x_train, y_train, x_test, y_test), each obtained by indexing
        X or Y with the index arrays of the split.
    """
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
    # n_splits=1, so the first index pair is the only one produced.
    train_idx, test_idx = next(splitter.split(X, Y))
    return X[train_idx], Y[train_idx], X[test_idx], Y[test_idx]

In [4]:
from sklearn.utils import resample, shuffle


def balance_dataset(x, y, random_state=145):
    """Return a class-balanced, shuffled copy of (x, y) by downsampling.

    Rows labelled 0 ("benign") and 1 ("smelly") are selected; rows with any
    other label are dropped. The larger of the two classes is downsampled
    without replacement to the size of the smaller one, the two groups are
    concatenated, and the result is shuffled.

    Args:
        x: feature array, indexable by a boolean mask over its rows.
        y: label array; it is flattened before comparison, so a column
            vector is accepted.
        random_state: seed passed to both the downsampling and the final
            shuffle so the output is reproducible. Pass None for an
            unseeded shuffle/resample.

    Returns:
        Tuple (x_balanced, y_balanced); y_balanced is a 1-D int16 array of
        0/1 labels aligned with the rows of x_balanced.
    """
    labels = np.asarray(y).ravel()
    x_benign = x[labels == 0]
    x_smelly = x[labels == 1]

    # resample(replace=False) raises when asked for more rows than exist, so
    # always shrink the larger class instead of assuming benign is larger.
    if len(x_benign) >= len(x_smelly):
        x_benign = resample(x_benign, replace=False,
                            n_samples=len(x_smelly), random_state=random_state)
    else:
        x_smelly = resample(x_smelly, replace=False,
                            n_samples=len(x_benign), random_state=random_state)

    # Smelly rows first, then benign rows; labels are built in the same order.
    x_balanced = np.concatenate((x_smelly, x_benign))
    y_balanced = np.concatenate((
        np.ones(len(x_smelly), dtype=np.int16),
        np.zeros(len(x_benign), dtype=np.int16),
    ))

    # Seeded shuffle: without random_state the row order changes on every run.
    x_balanced, y_balanced = shuffle(x_balanced, y_balanced,
                                     random_state=random_state)
    return x_balanced, y_balanced

def get_balanced_training_data(X, Y):
    """Stratified train/test split with a class-balanced training set.

    The split itself is delegated to get_data (same splitter settings as
    before: one stratified draw, test_size=0.3, random_state=0), so the two
    functions cannot drift apart.

    Args:
        X: indexable feature array.
        Y: indexable label array aligned with X.

    Returns:
        Tuple (x_train, y_train, x_test, y_test). x_train/y_train are the
        output of balance_dataset; x_test/y_test are returned exactly as
        produced by the split.
    """
    x_train, y_train, x_test, y_test = get_data(X, Y)
    # Only the training portion is balanced; the test portion keeps the class
    # distribution produced by the stratified split.
    x_train, y_train = balance_dataset(x_train, y_train)
    return x_train, y_train, x_test, y_test

In [5]:
from sklearn.metrics import confusion_matrix

def compute_accuracy(y_test, y_pred):
    """Print the accuracy of y_pred against y_test.

    Accuracy is the sum of the confusion-matrix diagonal divided by
    len(y_test). Nothing is returned.
    """
    matrix = confusion_matrix(y_test, y_pred)
    # The diagonal holds the samples whose predicted label equals the true one.
    n_correct = matrix.trace()
    accuracy = float(n_correct) / len(y_test)
    print("\nModel accuracy: ", accuracy)

In [6]:
from sklearn.metrics import recall_score, precision_score

def compute_precision_recall(y_test, y_pred):
    """Print precision and recall of y_pred against y_test.

    Both scores come from scikit-learn's recall_score / precision_score with
    their default arguments. Nothing is returned.
    """
    # Recall is evaluated first, then precision, matching the original order.
    recall, precision = (
        metric(y_test, y_pred) for metric in (recall_score, precision_score)
    )
    print (f"precision: {precision}, recall: {recall}")

In [7]:
from sklearn.svm import SVC

def svm(x_train, y_train, x_test, y_test):
    """Fit an RBF-kernel SVC and print its test-set metrics.

    Args:
        x_train: training features.
        y_train: training labels; flattened with .ravel() before fitting.
        x_test: test features to predict on.
        y_test: true test labels used for scoring.

    Prints accuracy, precision and recall; returns nothing.
    """
    model = SVC(kernel='rbf', random_state=1).fit(x_train, y_train.ravel())
    predictions = model.predict(x_test)
    compute_accuracy(y_test, predictions)
    compute_precision_recall(y_test, predictions)

In [8]:
from sklearn.tree import DecisionTreeClassifier

def decision_tree(x_train, y_train, x_test, y_test, random_state=1):
    """Fit a DecisionTreeClassifier and print its test-set metrics.

    Args:
        x_train: training features.
        y_train: training labels; flattened before fitting so a column
            vector is accepted, as in svm().
        x_test: test features to predict on.
        y_test: true test labels used for scoring.
        random_state: seed for the tree so repeated runs give the same
            model. Pass None for an unseeded tree.

    Prints accuracy, precision and recall; returns nothing.
    """
    # The tree permutes features at each split, so an unseeded tree can
    # differ between runs on identical data.
    clf = DecisionTreeClassifier(random_state=random_state)
    clf.fit(x_train, np.ravel(y_train))
    y_pred = clf.predict(x_test)
    compute_accuracy(y_test, y_pred)
    compute_precision_recall(y_test, y_pred)

In [9]:
from sklearn.ensemble import RandomForestClassifier

def random_forrest(x_train, y_train, x_test, y_test, random_state=1):
    """Fit a 100-tree RandomForestClassifier and print its test-set metrics.

    Args:
        x_train: training features.
        y_train: training labels; flattened before fitting so a column
            vector is accepted, as in svm().
        x_test: test features to predict on.
        y_test: true test labels used for scoring.
        random_state: seed for the forest (bootstrap sampling and feature
            selection) so repeated runs give the same model. Pass None for
            an unseeded forest.

    Prints accuracy, precision and recall; returns nothing.
    """
    clf = RandomForestClassifier(n_estimators=100, random_state=random_state)
    clf.fit(x_train, np.ravel(y_train))
    y_pred = clf.predict(x_test)
    compute_accuracy(y_test, y_pred)
    compute_precision_recall(y_test, y_pred)

In [10]:
# Build the balanced training set and the test set shared by every model cell below.
x_train, y_train, x_test, y_test = get_balanced_training_data(X, Y)

In [11]:
# Fit the RBF-kernel SVM and print its accuracy, precision and recall on the test set.
svm(x_train, y_train, x_test, y_test)


Model accuracy:  0.9198659354052408
precision: 0.011320754716981131, recall: 0.75


In [12]:
# Fit the decision tree and print its accuracy, precision and recall on the test set.
decision_tree(x_train, y_train, x_test, y_test)


Model accuracy:  0.858622790981109
precision: 0.008547008547008548, recall: 1.0


In [13]:
# Fit the random forest and print its accuracy, precision and recall on the test set.
random_forrest(x_train, y_train, x_test, y_test)


Model accuracy:  0.8863497867154174
precision: 0.010610079575596816, recall: 1.0


In [14]:
from sklearn.naive_bayes import GaussianNB

def naive_bayes(x_train, y_train, x_test, y_test):
    """Fit a Gaussian naive Bayes classifier and print its test-set metrics.

    Args:
        x_train: training features.
        y_train: training labels; flattened before fitting so a column
            vector is accepted, as in svm().
        x_test: test features to predict on.
        y_test: true test labels used for scoring.

    Prints accuracy, precision and recall; returns nothing.
    """
    nb = GaussianNB()
    # Flatten the labels: scikit-learn warns when y is passed as a column vector.
    nb.fit(x_train, np.ravel(y_train))
    y_pred = nb.predict(x_test)
    compute_accuracy(y_test, y_pred)
    compute_precision_recall(y_test, y_pred)

In [15]:
# Fit Gaussian naive Bayes and print its accuracy, precision and recall on the test set.
naive_bayes(x_train, y_train, x_test, y_test)


Model accuracy:  0.7379646556977453
precision: 0.004629629629629629, recall: 1.0
