In [None]:
import math
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from numpy import inf

from skimage import feature
from skimage import io

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import NuSVC
from sklearn.svm import SVC


In [None]:
##### LOAD CSV DATASET
# Read the CSV from the (relative) input directory into a DataFrame.
dataset = pd.read_csv('../input/dataset-aroma-tahu-berfomalin/dataset_tahu_berfomalin_yud.csv')
print(dataset.shape)
# `head` has to be *called*: printing the bare attribute only shows the bound
# method's repr. As the last expression of the cell, the first rows are
# rendered as a rich table.
dataset.head()

In [None]:
# Keep a reference to the column labels exactly as they were read from the
# file, then display them.
col_names = dataset.columns
col_names

In [None]:
# Remove leading/trailing whitespace from the column labels so the columns can
# be addressed by their exact names (e.g. 'label') below.
dataset.columns = dataset.columns.str.strip()

# A bare expression in the middle of a cell is never displayed, so print the
# cleaned labels explicitly.
print(dataset.columns.tolist())

# Number of rows per class.
dataset['label'].value_counts()

In [None]:
# Share of each class in the dataset. The deprecated `np.float` alias was
# removed in NumPy 1.24 and raises AttributeError there; true division by the
# integer row count already produces floats.
dataset['label'].value_counts() / len(dataset)

In [None]:
# Store the class label as a 64-bit integer column.
dataset['label'] = dataset['label'].astype('int64')

In [None]:
# Print the column dtypes, non-null counts and memory usage of the dataset.
dataset.info()

In [None]:
# Count the missing values in every column.
dataset.isna().sum()

In [None]:
# Summary statistics of the numeric columns, rounded to two decimals.
dataset.describe().round(2)

In [None]:
# draw boxplots to visualize outliers
# One panel per feature on a 4x2 grid. The panels are produced by a loop
# instead of eight copy-pasted stanzas, so the column list is the single place
# to edit when a feature is added or removed.
feature_columns = [
    'H2_MQ2(ppm)',
    'LPG_MQ2(ppm)',
    'CO_MQ2(ppm)',
    'Alcohol_MQ2(ppm)',
    'Propane_MQ2(ppm)',
    'CH4_MQ4(ppm)',
    'Smoke_MQ4(ppm)',
    'Temperature(C)',
]

plt.figure(figsize=(24, 20))

for position, column in enumerate(feature_columns, start=1):
    plt.subplot(4, 2, position)
    # DataFrame.boxplot draws on the current axes and returns that Axes.
    ax = dataset.boxplot(column=column)
    ax.set_title('')
    ax.set_ylabel(column)

In [None]:
# Histogram of every feature (20 bins each) on a 4x2 grid.
# The y-axis counts dataset rows; the previous label ('Number of pulsar
# stars') was left over from a different notebook and did not describe this
# data.
feature_columns = [
    'H2_MQ2(ppm)',
    'LPG_MQ2(ppm)',
    'CO_MQ2(ppm)',
    'Alcohol_MQ2(ppm)',
    'Propane_MQ2(ppm)',
    'CH4_MQ4(ppm)',
    'Smoke_MQ4(ppm)',
    'Temperature(C)',
]

plt.figure(figsize=(24, 20))

for position, column in enumerate(feature_columns, start=1):
    plt.subplot(4, 2, position)
    # Series.hist draws on the current axes and returns that Axes.
    ax = dataset[column].hist(bins=20)
    ax.set_xlabel(column)
    ax.set_ylabel('Number of samples')

In [None]:
# Feature matrix: every column except the class label.
X = dataset.drop(columns=['label'])

# Target vector: the class label.
y = dataset['label']

In [None]:

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [None]:
# (rows, columns) of the training and the test feature matrices.
(X_train.shape, X_test.shape)

In [None]:
# Remember the feature names: the scaling step further down gets plain arrays
# back from the scaler and uses `cols` to rebuild labelled DataFrames.
cols = X_train.columns

In [None]:
# Peek at the first five target values (five is the default for `head`).
print(y.head())

In [None]:
#SPLITTING VISUALIZATION
# Class counts of both splits: bar charts in the top row, pie charts in the
# bottom row of a 2x2 grid.
bar_colors = ['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6']

plt.figure(figsize=(10, 13))

plt.subplot(2, 2, 1)
y_train.value_counts().plot(kind='bar', color=bar_colors)
plt.title('training')

plt.subplot(2, 2, 2)
y_test.value_counts().plot(kind='bar', color=bar_colors)
plt.title('testing')

plt.subplot(2, 2, 3)
y_train.value_counts().plot(kind='pie')
plt.title('training')

plt.subplot(2, 2, 4)
y_test.value_counts().plot(kind='pie')
plt.title('testing')

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split, so no test-set statistics leak
# into training.
scaler = StandardScaler()

# The scaler returns plain arrays, so wrap them back into DataFrames.
# - `columns=cols` (not `[cols]`): wrapping the Index in a list makes pandas
#   build a one-level MultiIndex, turning every column label into a tuple.
# - `index=...`: keep the original row labels so the frames stay aligned with
#   y_train / y_test. The right-hand side is evaluated before the name is
#   rebound, so `X_train.index` still refers to the unscaled frame here.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=cols, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=cols, index=X_test.index)

X_train.describe()

In [None]:
# choose method
# Baseline classifier: SVC with its default hyper-parameters.
svc = SVC()

# Shuffled 5-fold splitter with a fixed seed; the later model cells reuse it so
# every model is scored on identical folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

# Hold-out evaluation: fit on the (already standardised) training split and
# predict the test split.
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

# crossvalidation
# X is the raw, unscaled feature matrix. The scaler is therefore placed inside
# a pipeline so that it is re-fitted on the training part of every fold: the
# cross-validated model then matches the scaled hold-out model above, and no
# statistics from a validation fold leak into training.
cv_model = make_pipeline(StandardScaler(), SVC())

# A single cross_validate call computes all four metrics from one fit per fold
# (four separate cross_val_score calls refit the model four times).
cv_scores = cross_validate(
    cv_model,
    X,
    y,
    cv=kfold,
    scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
)
accuracy = cv_scores['test_accuracy']
precision = cv_scores['test_precision_weighted']
recall = cv_scores['test_recall_weighted']
f1 = cv_scores['test_f1_weighted']

print('accuracy', accuracy.mean())
print('precision', precision.mean())
print('recall', recall.mean())
print('F1-Score', f1.mean())




In [None]:
# Confusion matrix of the hold-out predictions.
# scikit-learn puts the TRUE classes on the rows and the PREDICTED classes on
# the columns, both in ascending label order. The axis labels are generated
# from the classes actually present, so they cannot end up swapped or in the
# wrong order, and the cell also works for more than two classes.
class_labels = sorted(set(y_test) | set(y_pred))

cm = confusion_matrix(y_test, y_pred, labels=class_labels)
cm_matrix = pd.DataFrame(
    data=cm,
    index=[f'Actual {label}' for label in class_labels],
    columns=[f'Predicted {label}' for label in class_labels],
)

sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

# Per-class precision / recall / F1 and support on the test split.
print(classification_report(y_test, y_pred))

In [None]:
# choose method
# Linear-kernel SVM. X is unscaled, so the scaler lives inside the pipeline and
# is re-fitted on the training part of every fold (no leakage from the
# validation fold, and consistent with the scaled hold-out model).
method = make_pipeline(StandardScaler(), SVC(kernel='linear'))

# crossvalidation
# All four metrics come from one fit per fold; `kfold` is the splitter created
# in the baseline cell, so the folds are the same as for the other models.
cv_scores = cross_validate(
    method,
    X,
    y,
    cv=kfold,
    scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
)
accuracy = cv_scores['test_accuracy']
precision = cv_scores['test_precision_weighted']
recall = cv_scores['test_recall_weighted']
f1 = cv_scores['test_f1_weighted']

# The banner is Indonesian for "Using the Linear SVC method".
# This cell reports the individual fold scores rather than their means.
print('Menggunakan Metode Linear SVC')
print('accuracy', accuracy)
print('precision', precision)
print('recall', recall)
print('F1-Score', f1)


In [None]:
# choose method
# Nu-SVM. X is unscaled, so the scaler lives inside the pipeline and is
# re-fitted on the training part of every fold (no leakage from the validation
# fold, and consistent with the scaled hold-out model).
method = make_pipeline(StandardScaler(), NuSVC())

# crossvalidation
# All four metrics come from one fit per fold; `kfold` is the splitter created
# in the baseline cell, so the folds are the same as for the other models.
cv_scores = cross_validate(
    method,
    X,
    y,
    cv=kfold,
    scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
)
accuracy = cv_scores['test_accuracy']
precision = cv_scores['test_precision_weighted']
recall = cv_scores['test_recall_weighted']
f1 = cv_scores['test_f1_weighted']

print('accuracy', accuracy.mean())
print('precision', precision.mean())
print('recall', recall.mean())
print('F1-Score', f1.mean())


In [None]:
#BOXPLOT VISUALIZATION
# Spread of the per-fold scores of the most recently cross-validated model,
# one box per metric.
metric_names = ['accuracy', 'precision', 'recall', 'f1_score']
all_data = [accuracy, precision, recall, f1]
tick_positions = list(range(1, len(all_data) + 1))

# Marker style for the outlier points: red squares.
red_square = {'markerfacecolor': 'r', 'marker': 's'}

fig1, ax1 = plt.subplots(figsize=(10, 5))
ax1.set_title('performance - boxplot')
ax1.boxplot(all_data, notch=False, flierprops=red_square)

# Horizontal grid lines make the score levels easier to read.
ax1.yaxis.grid(True)
ax1.set_xlabel('performa')
ax1.set_ylabel('score')

# One tick per box, labelled with the metric name.
ax1.set_xticks(tick_positions)
ax1.set_xticklabels(metric_names)
plt.show()

In [None]:
# Per-fold scores of the most recently cross-validated model, one dashed line
# per metric.
plt.figure(figsize=(10, 7))

# Derive the fold labels from the score array instead of hard-coding five of
# them, so the plot keeps working when the number of CV splits changes.
xx = [f"cv{fold}" for fold in range(1, len(accuracy) + 1)]

plt.plot(xx, accuracy, '--')
plt.plot(xx, precision, '--')
plt.plot(xx, recall, '--')
plt.plot(xx, f1, '--')
plt.title("comparison of each crossvalidation - SVM")
plt.xlabel("Crossvalidation")
plt.ylabel("score")
plt.legend(["accuracy", "precision", "recall", "f1-score"])
plt.grid()
plt.show()