# Actividad 3 - Clasificación rayos X

#### Andrés Espinosa, Francisco Rencoret, Raimundo Herrera, Raimundo Manterola

Comenzamos por importar las librerias que utilizaremos

In [1]:
import numpy as np
from os import listdir
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn import svm
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA, LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA

In [2]:
from pybalu.feature_extraction import lbp_features
from pybalu.feature_selection import clean, sfs
from pybalu.feature_transformation import normalize
from pybalu.io import imread

## Parte 1

Leemos los datos e importamos als imagenes

In [3]:
images = sorted([i for i in listdir('xray') if i.endswith('png')])

In [4]:
data = np.array([[np.expand_dims(imread(f'xray/{i}'), 0), int(i[5:9])] for i in images])

In [5]:
X = data[:, 0]
y = data[:, 1].astype(int)

In [6]:
X = np.concatenate(X)

Separamos y estratificamos nuestro set de datos en set de train y test.

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

## Parte 2

Calculamos los features lbp sin particiones ni invariancia rotacional para nuestro set de train y test.

In [8]:
X_lbp_train = np.array([lbp_features(img, mapping='nri_uniform', vdiv=1, hdiv=1) for img in X_train])
X_lbp_test = np.array([lbp_features(img, mapping='nri_uniform', vdiv=1, hdiv=1) for img in X_test])

Entrenamos un clasificador de K vecinos cercanos con k=5.

In [9]:
knn = KNeighborsClassifier(n_neighbors=5)

In [10]:
knn.fit(X_lbp_train, y_train)
knn.score(X_lbp_test, y_test)

0.86

Obtenemos un Accuracy de 86% sobre el set de test.

## Parte 3

In [11]:
from pybalu.feature_extraction import gabor_features, haralick_features

Comenzamos por normalizar los datos.

In [12]:
X_train_norm, a, b = normalize(X_train)
X_test = X_test * a + b

Calculamos los features de Haralick, lbp invariante a la rotacion, y lbp invariante a la rotacion con 4 particiones.

In [13]:
X_haralick_train = np.array([haralick_features(img.astype(int)) for img in X_train])
X_lbp_ri_train = np.array([lbp_features(img, mapping='uniform', vdiv=1, hdiv=1) for img in X_train])
X_lbp_ridiv_train = np.array([lbp_features(img, mapping='uniform', vdiv=4, hdiv=4) for img in X_train])

In [14]:
X_haralick_test = np.array([haralick_features(img.astype(int)) for img in X_test])
X_lbp_ri_test = np.array([lbp_features(img, mapping='uniform', vdiv=1, hdiv=1) for img in X_test])
X_lbp_ridiv_test = np.array([lbp_features(img, mapping='uniform', vdiv=4, hdiv=4) for img in X_test])

Concatenamos todos estos features en un solo arreglo.

In [15]:
X_train_features = np.concatenate(
    (X_haralick_train, X_lbp_ri_train, X_lbp_ridiv_train),
    axis=1
)

Separamos el set de train en train y validacion. Ocuparemos validacion para detectar los mejores hiperparámetros.

In [16]:
X_train_all, X_val_all, y_train_all, y_val_all = train_test_split(
    X_train_features, y_train, stratify=y_train, test_size=0.25
)

Creamos una función por cada uno de los clasificadores que entrenaremos.

In [17]:
def evaluate_knn(X_train, y_train, X_val, y_val, n_neighbours=5):
    knn = KNeighborsClassifier(n_neighbours)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_val)
    accuracy = round(accuracy_score(y_val, y_pred) * 100)
    return accuracy

In [18]:
def evaluate_mlp(X_train, y_train, X_val, y_val):
    clfNN = MLPClassifier(activation='logistic', learning_rate_init=0.0001 , hidden_layer_sizes=(1000), max_iter=4000)
    clfNN.fit(X_train, y_train)
    y_pred = clfNN.predict(X_val)
    accuracy = round(accuracy_score(y_val, y_pred) * 100) 
    return accuracy

In [19]:
def evaluate_svm(X_train, y_train, X_val, y_val):
    clfSvm = svm.SVC(kernel='poly', C=0.5, degree=4, gamma='auto')
    clfSvm.fit(X_train, y_train)
    y_pred = clfSvm.predict(X_val)
    accuracy = round(accuracy_score(y_val, y_pred) * 100)
    return accuracy


In [20]:
def evaluate_lda(X_train, y_train, X_val, y_val):
    neigh = LDA()
    neigh.fit(X_train, y_train)
    y_pred = neigh.predict(X_val)
    accuracy = round(accuracy_score(y_test, y_pred) * 100)
    return accuracy

In [21]:
def evaluate_qda(X_train, y_train, X_val, y_val):
    neigh = QDA()
    neigh.fit(X_train, y_train)
    y_pred = neigh.predict(X_val)
    accuracy = round(accuracy_score(y_test, y_pred) * 100)
    return accuracy

In [22]:
X_train_all.shape

(300, 198)

Sacamos entonces 198 features por cada muestra, por la concatenación de los distintos LBP y las features de Haralick

In [23]:
pca = PCA(n_components=20)
pca.fit(X_train_all, y_train_all)
X_train_pcaed = pca.transform(X_train_all)
X_val_pcaed = pca.transform(X_val_all)

Evaluamos cada uno de nuestros modelos sobre el set de validación para quedarnos con el mejor.

In [24]:
evaluate_knn(X_train_all, y_train_all, X_val_all, y_val_all)

81.0

In [25]:
evaluate_knn(X_train_pcaed, y_train_all, X_val_pcaed, y_val_all)

82.0

In [26]:
evaluate_mlp(X_train_all, y_train_all, X_val_all, y_val_all)

94.0

In [27]:
evaluate_mlp(X_train_pcaed, y_train_all, X_val_pcaed, y_val_all)

87.0

In [28]:
evaluate_svm(X_train_all, y_train_all, X_val_all, y_val_all)

95.0

In [29]:
evaluate_svm(X_train_pcaed, y_train_all, X_val_pcaed, y_val_all)

70.0

In [30]:
evaluate_lda(X_train_all, y_train_all, X_val_all, y_val_all)



18.0

In [31]:
evaluate_lda(X_train_pcaed, y_train_all, X_val_pcaed, y_val_all)

19.0

In [32]:
evaluate_qda(X_train_all, y_train_all, X_val_all, y_val_all)



16.0

In [33]:
evaluate_qda(X_train_pcaed, y_train_all, X_val_pcaed, y_val_all)

18.0

Podemos ver que los mejores resultados fueron el MLP y SVM, con un 94% y 95% respectivamente. Estos resultados variaron un poco de acuerdo a la ejecución, por lo que nos quedamos con ambos.

## Parte 4

No es lo suficientemente estable de acuerdo a la randomización de la división de los sets de datos, o de los estados iniciales de los clasificadores que parten con pesos aleatorios. pero en cada ejecución varían los resultados alrededor de un 2-3% en los mejores casos y 5-10% en los peores