In [1]:
from skimage import feature
from imutils import paths
import numpy as np
import cv2 as cv
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score
from imutils import build_montages
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

import idx2numpy
import kagglehub


In [2]:
# Download the two Kaggle datasets through kagglehub; each call returns the
# local directory that holds the downloaded files (printed in the next cell).
mnist_asl_dir = kagglehub.dataset_download("datamunge/sign-language-mnist")
mnist_handwritten_digit_dir = kagglehub.dataset_download("hojjatk/mnist-dataset")

Downloading from https://www.kaggle.com/api/v1/datasets/download/datamunge/sign-language-mnist?dataset_version_number=1...


100%|██████████| 62.6M/62.6M [00:12<00:00, 5.30MB/s]

Extracting files...





Downloading from https://www.kaggle.com/api/v1/datasets/download/hojjatk/mnist-dataset?dataset_version_number=1...


100%|██████████| 22.0M/22.0M [00:04<00:00, 5.00MB/s]

Extracting files...





In [3]:
# Show where kagglehub stored each dataset: first the sign-language MNIST
# directory, then the handwritten-digit MNIST directory.
print("Path to dataset files:", mnist_asl_dir)
print("Path to dataset files:", mnist_handwritten_digit_dir)

Path to dataset files: C:\Users\user\.cache\kagglehub\datasets\datamunge\sign-language-mnist\versions\1
Path to dataset files: C:\Users\user\.cache\kagglehub\datasets\hojjatk\mnist-dataset\versions\1


In [6]:
# List the files shipped with each dataset to find the names needed later.
# files1: contents of the sign-language MNIST directory.
files1 = os.listdir(mnist_asl_dir)
print(files1)
# files2: contents of the handwritten-digit MNIST directory.
files2 = os.listdir(mnist_handwritten_digit_dir)
print(files2)

['american_sign_language.PNG', 'amer_sign2.png', 'amer_sign3.png', 'sign_mnist_test', 'sign_mnist_test.csv', 'sign_mnist_train', 'sign_mnist_train.csv']
['t10k-images-idx3-ubyte', 't10k-images.idx3-ubyte', 't10k-labels-idx1-ubyte', 't10k-labels.idx1-ubyte', 'train-images-idx3-ubyte', 'train-images.idx3-ubyte', 'train-labels-idx1-ubyte', 'train-labels.idx1-ubyte']


# Preprocessing

In [7]:
def preprocessing(image, image_size):
  """Turn an image into a square, inverted Otsu-binarized grayscale array.

  Args:
    image: Image array as returned by ``cv.imread`` (BGR). An image that is
      already single-channel (2-D) is accepted and used without conversion.
    image_size: Side length, in pixels, of the square output.

  Returns:
    The ``image_size`` x ``image_size`` thresholded image.

  Raises:
    ValueError: If ``image`` is None, which is what ``cv.imread`` returns
      when a file cannot be read.
  """
  if image is None:
    raise ValueError("preprocessing() received None; the image could not be read")
  # COLOR_BGR2GRAY rejects single-channel input, so only convert colour images.
  if image.ndim == 3:
    image = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
  image = cv.resize(image, (image_size, image_size))
  # With THRESH_OTSU the threshold is chosen automatically; the 0 is a placeholder.
  image = cv.threshold(image, 0, 255, cv.THRESH_BINARY_INV | cv.THRESH_OTSU)[1]
  return image

In [8]:
def load_split(path, image_size, extraction_method):
  """Load every image under ``path`` and return its features and labels.

  The label of an image is the name of the directory that contains it.

  Args:
    path: Root directory that is searched for image files.
    image_size: Side length passed to ``preprocessing``.
    extraction_method: ``'hog'`` or ``'lbp'``.

  Returns:
    Tuple ``(data, labels)`` of numpy arrays, one entry per image.

  Raises:
    ValueError: If ``extraction_method`` is not supported, or an image file
      cannot be read.
  """
  # Resolve the extractor once, up front: previously an unknown method left
  # `features` undefined (or stale from the previous iteration).
  if extraction_method == 'hog':
    extract = quantify_image_hog
  elif extraction_method == 'lbp':
    extract = quantify_image_lbp
  else:
    raise ValueError("Unsupported extraction method: " + str(extraction_method))

  data = []
  labels = []

  for image_path in paths.list_images(path):
    image = cv.imread(image_path)
    if image is None:  # cv.imread signals failure by returning None
      raise ValueError("Could not read image: " + image_path)
    image = preprocessing(image, image_size = image_size)

    data.append(extract(image))
    labels.append(image_path.split(os.path.sep)[-2])

  return (np.array(data), np.array(labels))

In [35]:
def load_split_from_csv(csv_path, image_size, extraction_method):
    """Build feature vectors and labels from a CSV of flattened 28x28 images.

    Column 0 of the CSV is read as the label; the remaining columns are read
    as the pixel values of one image per row.

    Args:
        csv_path: Path of the CSV file to read.
        image_size: Side length each image is resized to before thresholding.
        extraction_method: ``'hog'`` or ``'lbp'``.

    Returns:
        Tuple ``(data, labels)`` of numpy arrays.

    Raises:
        ValueError: If ``extraction_method`` is neither ``'hog'`` nor ``'lbp'``.
    """
    frame = pd.read_csv(csv_path)
    labels = frame.iloc[:, 0].values
    pixel_rows = frame.iloc[:, 1:].values

    def row_to_features(row):
        # Rebuild the 28x28 picture, upscale it, then binarize (inverted Otsu).
        picture = row.reshape(28, 28).astype("uint8")
        picture = cv.resize(picture, (image_size, image_size))
        _, picture = cv.threshold(picture, 0, 255, cv.THRESH_BINARY_INV | cv.THRESH_OTSU)

        if extraction_method == 'hog':
            return quantify_image_hog(picture)
        if extraction_method == 'lbp':
            return quantify_image_lbp(picture)
        raise ValueError("Unsupported extraction method: " + extraction_method)

    data = [row_to_features(row) for row in pixel_rows]
    return np.array(data), np.array(labels)

# Feature Extractor

In [9]:
# HOG
def quantify_image_hog(image): # Histogram of Oriented Gradient features
  """Return the HOG descriptor of ``image`` from ``skimage.feature.hog``.

  Uses 9 orientations, 10x10-pixel cells, 2x2-cell blocks, square-root
  compression and L1 block normalisation.
  """
  hog_options = dict(
    orientations=9,
    pixels_per_cell=(10, 10),
    cells_per_block=(2, 2),
    transform_sqrt=True,
    block_norm="L1",
  )
  return feature.hog(image, **hog_options)

In [10]:
# LBP
def quantify_image_lbp(image): # Local Binary Pattern features
  """Return a normalised histogram of uniform LBP codes for ``image``.

  The codes come from ``skimage.feature.local_binary_pattern`` called with
  24 sampling points at radius 8. They are counted in 26 unit-width bins
  (edges 0..26) and divided by their sum; the small epsilon guards against
  division by zero.
  """
  lbp_codes = feature.local_binary_pattern(image, 24, 8, method="uniform")
  bin_edges = np.arange(0, 27)
  counts = np.histogram(lbp_codes.flatten(), bins=bin_edges, range=(0, 26))[0]

  counts = counts.astype("float")
  return counts / (counts.sum() + 1e-7)

# Function

In [11]:
def scores(obj, predict, feature2, label2):
    """Print accuracy and macro-averaged F1, precision and recall.

    Args:
        obj: Fitted estimator; its ``score`` method supplies the accuracy line.
        predict: Predictions for ``feature2``.
        feature2: Test feature matrix.
        label2: True test labels.
    """
    # Each metric is computed lazily, right before its line is printed.
    report_lines = (
        ('Accuracy   on test set: {:.3f}', lambda: obj.score(feature2, label2)),
        ('F1_score   on test set: {:.3f}', lambda: f1_score(label2, predict, average='macro')),
        ('Precision  on test set: {:.3f}', lambda: precision_score(label2, predict, average='macro')),
        ('Recall     on test set: {:.3f}', lambda: recall_score(label2, predict, average='macro')),
    )
    for template, compute in report_lines:
        print(template.format(compute()))

In [12]:
def _load_test_images(path, limit):
  """Return up to ``limit`` raw images from a pixel CSV or an image directory."""
  if os.path.isfile(path) and str(path).lower().endswith(".csv"):
    # Same layout load_split_from_csv reads: label in column 0, then 28x28 pixels.
    frame = pd.read_csv(path, nrows=limit)
    return [row.reshape(28, 28).astype("uint8") for row in frame.iloc[:, 1:].values]

  images = []
  for image_path in list(paths.list_images(path))[:limit]:
    image = cv.imread(image_path)
    if image is not None:  # cv.imread returns None for unreadable files
      images.append(image)
  return images


def _features_for(image, image_size, method):
  """Binarize ``image`` the way the training loaders do and extract features."""
  if image.ndim == 2:
    # Grayscale CSV rows: mirror load_split_from_csv (no colour conversion).
    image = cv.resize(image, (image_size, image_size))
    image = cv.threshold(image, 0, 255, cv.THRESH_BINARY_INV | cv.THRESH_OTSU)[1]
  else:
    image = preprocessing(image, image_size = image_size)

  if method == 'hog':
    return quantify_image_hog(image)
  if method == 'lbp':
    return quantify_image_lbp(image)
  raise ValueError("Unsupported extraction method: " + str(method))


def test(path, resize_image_size, model, method=None, encoder=None):
  """Predict up to 25 test images and show them as a labelled 5x5 montage.

  Args:
    path: Image directory, or a CSV file of flattened 28x28 images whose
      first column is the label.
    resize_image_size: Side length used when preprocessing each image.
    model: Fitted classifier exposing ``predict``.
    method: ``'hog'`` or ``'lbp'``; defaults to the notebook-global
      ``extraction_method``.
    encoder: Fitted label encoder; defaults to the notebook-global ``le``.

  Returns:
    List of the annotated 128x128 BGR images that make up the montage.

  Raises:
    ValueError: If no image could be loaded from ``path`` or the extraction
      method is unsupported.
  """
  method = extraction_method if method is None else method
  encoder = le if encoder is None else encoder

  images = []
  for image in _load_test_images(path, 25):
    features = _features_for(image, resize_image_size, method)
    preds = model.predict([features])
    # cv.putText needs a str; inverse_transform yields numpy ints for numeric labels.
    label = str(encoder.inverse_transform(preds)[0])

    # The annotation is drawn in colour, so grayscale input is expanded to BGR.
    output = image.copy() if image.ndim == 3 else cv.cvtColor(image, cv.COLOR_GRAY2BGR)
    output = cv.resize(output, (128, 128))
    color = (0, 255, 0) if label == "healthy" else (0, 0, 255)
    cv.putText(output, label, (3, 20), cv.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
    images.append(output)

  if not images:
    raise ValueError("No test images found at: " + str(path))

  montage = build_montages(images, (128, 128), (5, 5))[0]
  # cv2_imshow is not defined in this notebook; matplotlib expects RGB order.
  plt.imshow(cv.cvtColor(montage, cv.COLOR_BGR2RGB))
  plt.axis("off")
  plt.show()
  return images

# Algorithm

In [13]:
def logistic_regression(feature_train,label_train,feature_test,label_test):
  """Fit, cross-validate and evaluate a logistic-regression classifier.

  Prints the 10-fold cross-validation scores on the training data, the test
  metrics from ``scores``, the confusion matrix and the classification report.

  Returns:
    The fitted ``LogisticRegression`` model.
  """
  print("Logistic Regression")
  model = LogisticRegression(max_iter=1000, random_state=10)
  model.fit(feature_train, label_train)

  fold_scores = cross_val_score(model, feature_train, label_train, cv=10)
  print("Cross Validation Score : "+str(fold_scores))

  predicted = model.predict(feature_test)
  scores(model, predicted, feature_test, label_test)

  # Both reports are computed before either one is printed.
  reports = (
    confusion_matrix(label_test, predicted),
    classification_report(label_test, predicted),
  )
  for report in reports:
    print(report)
  print("------------------------------------------------------------------")
  print()
  return model


In [14]:
def linear_svc(feature_train,label_train,feature_test,label_tes):
  """Fit, cross-validate and evaluate a linear SVM classifier.

  Args:
    feature_train: Training feature matrix.
    label_train: Training labels.
    feature_test: Test feature matrix.
    label_tes: Test labels. The misspelled name is kept so existing callers
      are unaffected.

  Returns:
    The fitted ``LinearSVC`` model.
  """
  # Bind the argument explicitly: the body used to read `label_test`, which
  # silently resolved to the notebook-global variable instead of the argument.
  label_test = label_tes

  print("Linear SVC")
  svc = LinearSVC(max_iter=10000,random_state=10,C=10.0)
  svc.fit(feature_train, label_train)
  cross_val= cross_val_score(svc, feature_train,label_train, cv=10)
  print("Cross Validation Score : "+str(cross_val))
  predictions = svc.predict(feature_test)
  scores(svc,predictions,feature_test,label_test)
  cm = confusion_matrix(label_test,predictions)
  cr = classification_report(label_test, predictions)
  print(cm)
  print(cr)
  print("------------------------------------------------------------------")
  print()
  return svc

In [15]:
def random_forest(feature_train, label_train, feature_test, label_test):
  """Fit, cross-validate and evaluate a 100-tree random forest.

  Prints the 10-fold cross-validation scores on the training data, the test
  metrics from ``scores``, the confusion matrix and the classification report.

  Returns:
    The fitted ``RandomForestClassifier``.
  """
  print("Random Forest")
  forest = RandomForestClassifier(n_estimators=100, random_state=10)
  forest.fit(feature_train, label_train)

  fold_scores = cross_val_score(forest, feature_train, label_train, cv=10)
  print("Cross Validation Score : "+str(fold_scores))

  predicted = forest.predict(feature_test)
  scores(forest, predicted, feature_test, label_test)

  # Both reports are computed before either one is printed.
  matrix = confusion_matrix(label_test, predicted)
  report = classification_report(label_test, predicted)
  for block in (matrix, report):
    print(block)
  print("------------------------------------------------------------------")
  print()
  return forest

In [16]:
def KNN(feature_train, label_train, feature_test, label_test):
  """Fit, cross-validate and evaluate an 11-neighbour, distance-weighted KNN.

  Prints the 10-fold cross-validation scores on the training data, the test
  metrics from ``scores``, the confusion matrix and the classification report.

  Returns:
    The fitted ``KNeighborsClassifier``.
  """
  print("KNN")
  neighbours = KNeighborsClassifier(
    n_neighbors=11,
    weights='distance',
    algorithm='auto',
  )
  neighbours.fit(feature_train, label_train)

  fold_scores = cross_val_score(neighbours, feature_train, label_train, cv=10)
  print("Cross Validation Score : "+str(fold_scores))

  predicted = neighbours.predict(feature_test)
  scores(neighbours, predicted, feature_test, label_test)

  # Both reports are computed before either one is printed.
  matrix = confusion_matrix(label_test, predicted)
  report = classification_report(label_test, predicted)
  print(matrix)
  print(report)
  print("------------------------------------------------------------------")
  print()
  return neighbours

In [17]:
def MLP(feature_train, label_train, feature_test, label_test):
  """Fit, cross-validate and evaluate a one-hidden-layer (10 unit) MLP.

  Prints the 10-fold cross-validation scores on the training data, the test
  metrics from ``scores``, the confusion matrix and the classification report.

  Returns:
    The fitted ``MLPClassifier``.
  """
  print("MLP")
  # random_state=10 matches the other trainers in this notebook; without it
  # the weight initialisation, and so every reported score, varied per run.
  # NOTE(review): per the scikit-learn docs, batch_size, learning_rate_init and
  # early_stopping only take effect with the 'sgd'/'adam' solvers, not 'lbfgs'.
  mlp = MLPClassifier(hidden_layer_sizes=10, activation='relu',solver='lbfgs',batch_size='auto', learning_rate_init=0.0001, max_iter=10000,early_stopping=False, random_state=10)
  mlp.fit(feature_train,label_train)
  cross_val= cross_val_score(mlp, feature_train,label_train, cv=10)
  print("Cross Validation Score : "+str(cross_val))
  predictions = mlp.predict(feature_test)
  scores(mlp, predictions,feature_test,label_test)
  cm = confusion_matrix(label_test,predictions)
  cr = classification_report(label_test, predictions)
  print(cm)
  print(cr)
  print("------------------------------------------------------------------")
  print()
  return mlp

# Using HOG 128

In [30]:
# Build the CSV paths with os.path.join rather than a hard-coded "\\" so the
# cell works on any operating system, not just Windows.
trainingPath = os.path.join(mnist_asl_dir, "sign_mnist_train.csv")
testingPath = os.path.join(mnist_asl_dir, "sign_mnist_test.csv")

In [31]:
# Display the two CSV paths built in the previous cell (the bare tuple is the cell output).
trainingPath,testingPath

('C:\\Users\\user\\.cache\\kagglehub\\datasets\\datamunge\\sign-language-mnist\\versions\\1\\sign_mnist_train.csv',
 'C:\\Users\\user\\.cache\\kagglehub\\datasets\\datamunge\\sign-language-mnist\\versions\\1\\sign_mnist_test.csv')

In [None]:
# (Removed a stray, incomplete expression: the bare name `load_split_from` is
# not defined anywhere and raised NameError on Restart & Run All.)

In [37]:
# Experiment configuration: 128x128 images, HOG features.
# `extraction_method` (and `le` below) are also read as globals by test().
resize_image_size = 128
extraction_method = 'hog'

# Build the feature matrices and label arrays from the train/test CSV files.
(feature_train, label_train) = load_split_from_csv(trainingPath, image_size= resize_image_size, extraction_method = extraction_method)
(feature_test, label_test) = load_split_from_csv(testingPath, image_size= resize_image_size, extraction_method = extraction_method)

print("Data loaded!")
# Fit the encoder on the training labels only, then apply the same mapping to
# the test labels.
le = LabelEncoder()
label_train = le.fit_transform(label_train)
label_test = le.transform(label_test)


Data loaded!


## Training

In [None]:
# Train and evaluate each classifier on the current features; every helper
# prints its own report and returns the fitted model.
logreg = logistic_regression(feature_train,label_train,feature_test,label_test)
svc = linear_svc(feature_train,label_train,feature_test,label_test)
rfc = random_forest(feature_train,label_train,feature_test,label_test)
knn = KNN(feature_train,label_train,feature_test,label_test)
# `mlc` holds the MLP model.
mlc = MLP(feature_train,label_train,feature_test,label_test)

Logistic Regression


## Testing

In [None]:
# Testing Logistic Regression: preview predictions with test(), which also
# reads the globals `extraction_method` and `le`.
images = test(testingPath,resize_image_size,logreg)

In [None]:
# Testing SVC (linear SVM model)
images = test(testingPath,resize_image_size,svc)

In [None]:
# Testing Random Forest
images = test(testingPath,resize_image_size,rfc)

In [None]:
# Testing KNN
images = test(testingPath,resize_image_size,knn)

In [None]:
# Testing MLP (the model is bound to `mlc`)
images = test(testingPath,resize_image_size,mlc)

# Using HOG 300

In [None]:
# Experiment configuration: 300x300 images, HOG features.
resize_image_size = 300
extraction_method = 'hog'

# trainingPath/testingPath are CSV files at this point, so the CSV loader is
# required; the directory-based load_split() finds no images in a file path
# and returns empty arrays.
(feature_train, label_train) = load_split_from_csv(trainingPath, image_size= resize_image_size, extraction_method = extraction_method)
(feature_test, label_test) = load_split_from_csv(testingPath, image_size= resize_image_size, extraction_method = extraction_method)

print("Data loaded!")
# Fit the encoder on the training labels only, then reuse it for the test labels.
le = LabelEncoder()
label_train = le.fit_transform(label_train)
label_test = le.transform(label_test)


In [None]:
# Inspect the encoded (integer) labels produced by the LabelEncoder.
print(label_train)
print(label_test)

## Training

In [None]:
# Train and evaluate each classifier on the current features; every helper
# prints its own report and returns the fitted model.
logreg = logistic_regression(feature_train,label_train,feature_test,label_test)
svc = linear_svc(feature_train,label_train,feature_test,label_test)
rfc = random_forest(feature_train,label_train,feature_test,label_test)
knn = KNN(feature_train,label_train,feature_test,label_test)
# `mlc` holds the MLP model.
mlc = MLP(feature_train,label_train,feature_test,label_test)


## Testing

In [None]:
# Testing Logistic Regression: preview predictions with test(), which also
# reads the globals `extraction_method` and `le`.
images = test(testingPath,resize_image_size,logreg)

In [None]:
# Testing SVC (linear SVM model)
images = test(testingPath,resize_image_size,svc)

In [None]:
# Testing Random Forest
images = test(testingPath,resize_image_size,rfc)

In [None]:
# Testing KNN
images = test(testingPath,resize_image_size,knn)

In [None]:
# Testing MLP (the model is bound to `mlc`)
images = test(testingPath,resize_image_size,mlc)

# Using LBP 128

In [None]:
# Experiment configuration: 128x128 images, LBP features.
resize_image_size = 128
extraction_method = 'lbp'

# trainingPath/testingPath are CSV files at this point, so the CSV loader is
# required; the directory-based load_split() finds no images in a file path
# and returns empty arrays.
(feature_train, label_train) = load_split_from_csv(trainingPath, image_size= resize_image_size, extraction_method = extraction_method)
(feature_test, label_test) = load_split_from_csv(testingPath, image_size= resize_image_size, extraction_method = extraction_method)

print("Data loaded!")
# Fit the encoder on the training labels only, then reuse it for the test labels.
le = LabelEncoder()
label_train = le.fit_transform(label_train)
label_test = le.transform(label_test)

In [None]:
# Inspect the encoded (integer) labels produced by the LabelEncoder.
print(label_train)
print(label_test)

## Training

In [None]:
# Train and evaluate each classifier on the current features; every helper
# prints its own report and returns the fitted model.
logreg = logistic_regression(feature_train,label_train,feature_test,label_test)
svc = linear_svc(feature_train,label_train,feature_test,label_test)
rfc = random_forest(feature_train,label_train,feature_test,label_test)
knn = KNN(feature_train,label_train,feature_test,label_test)
# `mlc` holds the MLP model.
mlc = MLP(feature_train,label_train,feature_test,label_test)

## Testing 

In [None]:
# Testing Logistic Regression: preview predictions with test(), which also
# reads the globals `extraction_method` and `le`.
images = test(testingPath,resize_image_size,logreg)

In [None]:
# Testing SVC (linear SVM model)
images = test(testingPath,resize_image_size,svc)

In [None]:
# Testing Random Forest
images = test(testingPath,resize_image_size,rfc)

In [None]:
# Testing KNN
images = test(testingPath,resize_image_size,knn)

In [None]:
# Testing MLP (the model is bound to `mlc`)
images = test(testingPath,resize_image_size,mlc)

# Using LBP 300

In [None]:
# Experiment configuration: 300x300 images, LBP features.
resize_image_size = 300
extraction_method = 'lbp'

# trainingPath/testingPath are CSV files at this point, so the CSV loader is
# required; the directory-based load_split() finds no images in a file path
# and returns empty arrays.
(feature_train, label_train) = load_split_from_csv(trainingPath, image_size= resize_image_size, extraction_method = extraction_method)
(feature_test, label_test) = load_split_from_csv(testingPath, image_size= resize_image_size, extraction_method = extraction_method)

print("Data loaded!")
# Fit the encoder on the training labels only, then reuse it for the test labels.
le = LabelEncoder()
label_train = le.fit_transform(label_train)
label_test = le.transform(label_test)

In [None]:
# Inspect the encoded (integer) labels produced by the LabelEncoder.
print(label_train)
print(label_test)

## Training

In [None]:
# Train and evaluate each classifier on the current features; every helper
# prints its own report and returns the fitted model.
logreg = logistic_regression(feature_train,label_train,feature_test,label_test)
svc = linear_svc(feature_train,label_train,feature_test,label_test)
rfc = random_forest(feature_train,label_train,feature_test,label_test)
knn = KNN(feature_train,label_train,feature_test,label_test)
# `mlc` holds the MLP model.
mlc = MLP(feature_train,label_train,feature_test,label_test)

## Testing

In [None]:
# Testing Logistic Regression: preview predictions with test(), which also
# reads the globals `extraction_method` and `le`.
images = test(testingPath,resize_image_size,logreg)

In [None]:
# Testing SVC (linear SVM model)
images = test(testingPath,resize_image_size,svc)

In [None]:
# Testing Random Forest
images = test(testingPath,resize_image_size,rfc)

In [None]:
# Testing KNN
images = test(testingPath,resize_image_size,knn)

In [None]:
# Testing MLP (the model is bound to `mlc`)
images = test(testingPath,resize_image_size,mlc)

# Kesimpulan
HOG 128 dengan KNN(11) menghasilkan akurasi 83%

HOG 300 dengan LinearSVC menghasilkan akurasi 73%

LBP 128 dengan KNN(11) menghasilkan akurasi 60%

LBP 300 dengan RandomForest menghasilkan akurasi 70%

Dapat disimpulkan bahwa ukuran citra yang digunakan tidak banyak memengaruhi
hasil prediksi suatu algoritma. Yang menentukan adalah bagaimana fitur pada citra tersebut diekstraksi, sehingga algoritma dapat mempelajari dan mengenali fitur dengan baik.

# Menambahkan data

## Data Sehat

In [None]:
sehat = cv.imread('/content/drive/My Drive/IMG_20200925_130819.jpg')
cv2_imshow(sehat)

In [None]:
sehat1 = cv.cvtColor(sehat,cv.COLOR_BGR2GRAY)
cv2_imshow(sehat1)
sehat1 = cv.equalizeHist(sehat1)

In [None]:
 sehat1 = cv.medianBlur(sehat1,3)
 cv2_imshow(sehat1)

In [None]:
threshold_value, threshold_result = cv.threshold(sehat1, 3, 255, cv.THRESH_BINARY_INV) 
cv2_imshow(threshold_result)

In [None]:

sehat2 = cv.dilate(threshold_result,np.ones((5,5),np.uint8),iterations = 5)
cv2_imshow(sehat2)

In [None]:
# Find the outer contours of the dilated mask and record each one's index,
# area and bounding-box size.
contours, hierarchy = cv.findContours(sehat2, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
area_array=[]
for index,contour in enumerate(contours):
  x,y,w,h = cv.boundingRect(contour)
  area = cv.contourArea(contour)
  area_array.append([index,area,w,h])

# Passing the column names to the constructor also works when no contour was
# found; assigning `.columns` afterwards fails on an empty frame.
df_area = pd.DataFrame(area_array, columns=['index','area','width','height'])
df_sort=df_area.sort_values(by=['area'],ascending=False)
df_sort.head(50)


In [None]:
# Named `contour_areas` rather than `test`, so the test() helper defined
# earlier in the notebook is not overwritten by a Series.
contour_areas = df_area['area']
directory_save = '/content/drive/My Drive/Colab Notebooks/parkinsons/wave/training/healthy'

# Row positions (= contour indices) whose area exceeds the 50000 cut-off;
# enumerate already yields them in ascending order.
array = [i for i, area in enumerate(contour_areas) if area > 50000]
print(array)

os.makedirs(directory_save, exist_ok=True)

for i, va in enumerate(array):
  x,y,w,h =cv.boundingRect(contours[va])
  wave= sehat[y:y+h, x:x+w]
  resized= cv.resize(wave,(512,220),interpolation = cv.INTER_AREA)
  # Write to an explicit path instead of os.chdir(), which changed the
  # working directory of the whole kernel.
  cv.imwrite(os.path.join(directory_save, 'wave{}.png'.format(i)),resized)

print(os.listdir(directory_save))

## Data Parkinson

In [None]:
sakit = cv.imread('/content/drive/My Drive/IMG_20200925_130639.jpg')
cv2_imshow(sakit)

In [None]:
sakit1 = cv.cvtColor(sakit,cv.COLOR_BGR2GRAY)
cv2_imshow(sakit1)
sakit1 = cv.equalizeHist(sakit1)

In [None]:
 sakit1 = cv.medianBlur(sakit1,3)
 cv2_imshow(sakit1)

In [None]:
threshold_value, threshold_result = cv.threshold(sakit1, 5, 255, cv.THRESH_BINARY_INV) 
cv2_imshow(threshold_result)

In [None]:
erosion = cv.erode(threshold_result,np.ones((1,1),np.uint8),iterations =1)
cv2_imshow(erosion)
sakit2 = cv.dilate(erosion,np.ones((5,5),np.uint8),iterations = 7)
cv2_imshow(sakit2)

In [None]:
# Find the outer contours of the dilated mask and record each one's index,
# area and bounding-box size.
contours, hierarchy = cv.findContours(sakit2, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
area_array=[]
for index,contour in enumerate(contours):
  x,y,w,h = cv.boundingRect(contour)
  area = cv.contourArea(contour)
  area_array.append([index,area,w,h])

# Passing the column names to the constructor also works when no contour was
# found; assigning `.columns` afterwards fails on an empty frame.
df_area = pd.DataFrame(area_array, columns=['index','area','width','height'])
df_sort=df_area.sort_values(by=['area'],ascending=False)
df_sort.head(50)

In [None]:
# Named `contour_areas` rather than `test`, so the test() helper defined
# earlier in the notebook is not overwritten by a Series.
contour_areas = df_area['area']
directory_save = '/content/drive/My Drive/Colab Notebooks/parkinsons/wave/training/parkinson'

# Row positions (= contour indices) whose area exceeds the 80000 cut-off;
# enumerate already yields them in ascending order.
array = [i for i, area in enumerate(contour_areas) if area > 80000]
print(array)

os.makedirs(directory_save, exist_ok=True)

for i, va in enumerate(array):
  x,y,w,h =cv.boundingRect(contours[va])
  wave= sakit[y:y+h, x:x+w]

  resized= cv.resize(wave,(512,220),interpolation = cv.INTER_AREA)
  # Write to an explicit path instead of os.chdir(), which changed the
  # working directory of the whole kernel.
  cv.imwrite(os.path.join(directory_save, 'wave{}.png'.format(i)),resized)

print(os.listdir(directory_save))

## Load data baru

In [None]:
# Root of the wave-image dataset, including the crops written by the cells above.
dataset_dir = '/content/drive/My Drive/Colab Notebooks/parkinsons/wave'

# Rebinds trainingPath/testingPath (CSV file paths earlier in the notebook) to
# image directories.
trainingPath = os.path.join(dataset_dir, "training")
testingPath = os.path.join(dataset_dir, "testing")

print(trainingPath)
print(testingPath)

In [None]:
# Experiment configuration: 128x128 images, HOG features.
# `extraction_method` (and `le` below) are also read as globals by test().
resize_image_size = 128
extraction_method = 'hog'

# The paths are image directories here, so the directory-based loader is used;
# it takes each image's label from the name of its parent folder.
(feature_train, label_train) = load_split(path = trainingPath, image_size= resize_image_size, extraction_method = extraction_method)
(feature_test, label_test) = load_split(path = testingPath, image_size= resize_image_size, extraction_method = extraction_method)

print("Data loaded!")
# Fit the encoder on the training labels only, then reuse it for the test labels.
le = LabelEncoder()
label_train = le.fit_transform(label_train)
label_test = le.transform(label_test)

## Training

In [None]:
# Train and evaluate each classifier on the current features; every helper
# prints its own report and returns the fitted model.
logreg = logistic_regression(feature_train,label_train,feature_test,label_test)
svc = linear_svc(feature_train,label_train,feature_test,label_test)
rfc = random_forest(feature_train,label_train,feature_test,label_test)
knn = KNN(feature_train,label_train,feature_test,label_test)
# `mlc` holds the MLP model.
mlc = MLP(feature_train,label_train,feature_test,label_test)

## Testing

In [None]:
# Testing Logistic Regression: preview predictions with test(), which also
# reads the globals `extraction_method` and `le`.
images = test(testingPath,resize_image_size,logreg)

In [None]:
# Testing SVC (linear SVM model)
images = test(testingPath,resize_image_size,svc)

In [None]:
# Testing Random Forest
images = test(testingPath,resize_image_size,rfc)

In [None]:
# Testing KNN
images = test(testingPath,resize_image_size,knn)

In [None]:
# Testing MLP (the model is bound to `mlc`)
images = test(testingPath,resize_image_size,mlc)

# Kesimpulan
Menambahkan data gambar baru pada dataset dapat memengaruhi hasil prediksi algoritma.

Hampir semua algoritma yang digunakan mengalami kenaikan skor akurasi prediksi, kecuali RandomForest yang justru turun.

Semakin banyak data yang digunakan untuk training, semakin banyak variasi fitur yang dapat dipelajari algoritma, sehingga tidak terjadi overfitting terhadap data latih dan algoritma dapat memprediksi data lain yang lebih kompleks.