In [1]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from time import time
import warnings; warnings.simplefilter('ignore')
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
# Read the brain dataset (significant variables only) from its CSV file and
# preview the first rows of the resulting dataframe
data = pd.read_csv(filepath_or_buffer="FAMatrixSignificantData.csv")
data.head(n=5)

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,...,Column194,Column195,Column196,Column197,Column198,Column199,Column200,Column201,Column202,Column203
0,19.78399,19.31447,19.142825,19.185161,13.499028,17.694012,8.756266,19.141286,19.187837,14.834598,...,1.486842,1.131579,1.276316,1.342105,1.605263,1.513158,1.421053,1.460526,2.978577,ms
1,16.733469,12.910102,11.812794,17.734108,11.060022,11.744588,7.687872,19.032041,14.390918,10.71753,...,1.657895,1.065789,1.289474,1.381579,1.657895,1.381579,1.552632,1.578947,3.625088,ms
2,18.424258,15.705767,16.17419,19.745516,15.631004,12.928705,9.509393,20.030737,17.674618,13.425871,...,1.631579,1.184211,1.315789,1.5,1.684211,1.407895,1.539474,1.513158,3.178369,ms
3,19.5911,22.772947,11.736065,19.648423,14.513728,15.147721,5.71118,21.832772,20.116815,14.765563,...,1.684211,1.118421,1.302632,1.486842,1.592105,1.407895,1.434211,1.539474,3.095072,ms
4,18.533265,15.463072,12.038137,16.380761,14.586402,15.148913,11.773108,16.310207,18.340773,14.91316,...,1.644737,1.157895,1.315789,1.407895,1.552632,1.578947,1.486842,1.565789,3.327974,ms


In [3]:
# Split the dataframe column-wise: every column except the last one holds the
# node properties (x), while the last column holds the class label (y)
nodes_properties = data.iloc[:, :-1]
classes = data.iloc[:, -1]

# Distinct class labels, in order of first appearance, and how many there are
classes_names = classes.unique().tolist()
n_classes = len(classes_names)

# Report the dimensions of both parts and the classes that were found
print('Properties dimensions: ', nodes_properties.shape)
print('Classes dimensions: ', classes.shape)
print('There are {} classes: {}'.format(n_classes, classes_names))

Properties dimensions:  (239, 202)
Classes dimensions:  (239,)
There are 2 classes: ['ms', 'hv']


In [4]:
# Hold out 20% of the samples as the test set and keep the remaining 80% for
# training. The split is stratified on the class labels, so both subsets keep
# the same class proportions as the original dataset; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    nodes_properties,
    classes,
    test_size=0.2,
    random_state=2019,
    stratify=classes,
)


# K-NN

#### Hyperparameter tuning

In [5]:
# Base K-NN estimator whose hyperparameters are tuned by the grid search
knn = KNeighborsClassifier()

# Hyperparameters to be optimized:
#   n_neighbors (k) = number of nearest neighbours considered
#   weights         = weight function used in prediction. With 'uniform' all
#                     the neighbours have the same importance; with 'distance'
#                     closer neighbours have more influence.
k_range = list(range(1, 21))
weight_names = ['uniform', 'distance']

param_grid = dict(n_neighbors=k_range, weights=weight_names)

# Grid search over every (k, weights) combination, using 4 folds for
# cross-validation
grid = GridSearchCV(knn, param_grid, cv=4)

# Fit the grid on the training data, measuring how long the search takes
start = time()
grid.fit(X_train, y_train)
end = time()

# Report the elapsed time
print("The search took {} seconds".format(end - start))
print(" ")

# Per-candidate cross-validation results, aligned by position
ranks = list(grid.cv_results_['rank_test_score'])
means = list(grid.cv_results_["mean_test_score"])
stds = list(grid.cv_results_["std_test_score"])
params = list(grid.cv_results_['params'])

# Print every candidate ranked first (ties are possible, so more than one line
# may be printed). The loop variable is intentionally not called `params`:
# reusing that name would overwrite the list above with the last parameter
# dict once the loop finishes.
results = zip(ranks, means, stds, params)
for rank, mean, std, param_set in results:
    if rank == 1:
        print("Rank : {}. Mean accuracy {:.4f} +/- {:.4f}. Parameters: {}".format(rank, mean*100, std*100, param_set))


The search took 2.965538501739502 seconds
 
Rank : 1. Mean accuracy 83.7696 +/- 2.3851. Parameters: {'n_neighbors': 10, 'weights': 'distance'}
Rank : 1. Mean accuracy 83.7696 +/- 4.1862. Parameters: {'n_neighbors': 15, 'weights': 'distance'}


##### Model training with best parameters

In [6]:
# Retrieve the winning hyperparameters found by the grid search
best_k = grid.best_params_["n_neighbors"]
best_weights = grid.best_params_["weights"]

# Build a fresh classifier with those hyperparameters and train it on the
# training set
knn = KNeighborsClassifier(n_neighbors=best_k, weights=best_weights)
knn.fit(X_train, y_train)

# Predict the labels of the held-out test samples
y_predicted = knn.predict(X_test)

# Accuracy of the model: fraction of test samples whose label was predicted
# correctly
accuracy = accuracy_score(y_test, y_predicted)

# Summary of the chosen configuration and its test-set performance
print("K-NN Algorithm results")
print("-----------------------")
print("Optimal k: {}".format(best_k))
print("Optimal weights: {}".format(best_weights))
print("Accuracy in the test set: {:.2f}%".format(accuracy*100))

K-NN Algorithm results
-----------------------
Optimal k: 10
Optimal weights: distance
Accuracy in the test set: 83.33%


# SVM