<a href="https://colab.research.google.com/github/rubencg195/Pytorch-Tutorials/blob/master/Confusion_Matrix_K_Fold_CrossValidation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part 2

In [0]:
#Console Parameters
import sys 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#Machine Learning
from sklearn.neighbors import KNeighborsClassifier
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
from sklearn.metrics import explained_variance_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score,  precision_recall_curve, average_precision_score, mean_squared_error
from sklearn.datasets import make_classification
from sklearn.metrics import precision_recall_curve
from sklearn.utils.fixes import signature
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
  
#Utils - Math and Plotting
import matplotlib.pyplot as plt
from random import randrange
import pandas as pd
import numpy as np

In [0]:
#Hyperparameters
debug              = False
varianceThreshold  = 0.0005
# varianceThreshold  = 0.001
K                  = 17
FoldSize           = 10

In [0]:
def loadDataset(filename):
  print("Loading Dataset (Process 1 of 7)")
  df = pd.read_csv( filename ,sep='\t', header=None)
  data = np.array( df.as_matrix() )
  if debug:
    print(df.head())
    print("\nRaw_Data_Shape\t{}\tNumber_of_Features\t{}".format(df.shape, len(df)) )
  return data

In [0]:
def OrderByVariance(data):
  print("Order and Filter By Variance (Process 2 of 7)")
  #Take Out Labels from data
  features_without_label = data[: , :-1]
  #Separate Labels in a different array
  labels                 = data[: , -1]
  #Get Variance per Column
  variance               = np.var(features_without_label,0)
  #Sort Variance per Column From Bigger To Smaller
  new_index_by_var_sort  = np.argsort(variance)[::-1]                
  selector               = VarianceThreshold(varianceThreshold) 
  #Filter Columns, Delete all columns with variance lesser than threshhold
  data                   = selector.fit_transform(data[:,new_index_by_var_sort])
  #append again the labels
  data                   = np.c_[data, labels ]
  if debug:
    print("\n\nfeatures_without_label\t{}\tlabels\t{}".format(features_without_label.shape, labels.shape))
    print("\n\nTable of Variance Per Feature\n\n",       pd.DataFrame([variance], index=["Variance"]))
    print("\nNew Order Based on Features with Higher Variance to lower\n",
    pd.DataFrame([
        variance[new_index_by_var_sort]
    ], columns=new_index_by_var_sort,
    index=["Variance"]))
    print("\nVariance Filtered Data (with Label Col.) \n", pd.DataFrame(data).head())
  return data

In [0]:
def CrossValidationSplit(dataset, folds=10):
  # Divide the dataset D pseudo-randomly into V folds
  dataset_split = list()
  dataset_copy = list(dataset)
  fold_size = int(len(dataset) / folds)
  for i in range(folds):
    fold = list()
    while len(fold) < fold_size:
      index = randrange(len(dataset_copy))
      fold.append(dataset_copy.pop(index))
    dataset_split.append(fold)
  return dataset_split

In [0]:
def EqualyDistributePositiveAndNegativeAndSplit(data):
  print("Cross Validation Split (Process 3 of 7)")
  #Separate Positives & Negatives
  p_indexes      = (data[:,-1] == 1 )
  n_indexes      = (data[:,-1] == 0 )
  data_positives = data[p_indexes , :]
  data_negatives = data[n_indexes,  :]
  #Cross Validation to positives and negatives
  p_folds         = np.array(CrossValidationSplit(data_positives, FoldSize))
  n_folds         = np.array(CrossValidationSplit(data_negatives, FoldSize))
  n_rows_per_fold = p_folds.shape[1] + n_folds.shape[1]
  #Empty Array To Store Folds
  folds = np.array([]).reshape(0,n_rows_per_fold, data.shape[1])
  #Join in equaly distributed way, positives and negatives
  for i in range(FoldSize):
    f = np.r_[p_folds[i] , n_folds[i] ] 
    #Shuffle features
    np.random.shuffle(f)
    folds = np.append( folds ,[f], axis=0 )
  if debug:
    print("\nPositives\t{}".format(    data_positives.shape) )
    print("Negatives\t{}".format(data_negatives.shape))
    print("Folds_Positives\t{}\nFolds_Negatives\t{}\nFolds_Shuffled_and_Joined\t{}".format(p_folds.shape, n_folds.shape, folds.shape))
  return folds, n_rows_per_fold

In [0]:
def CrossValidation(folds, data, K):
  print("Cross Validation-Selecting Best Model (Process 4 of 7)")
  bestPerFold = np.array([])
  bestValues  = {
      "fSelectedIndexes": np.array([0], dtype=int),  
      "Precision" :       0.0,
      "Accuracy"  :       0.0,
      "mse"       :       9999.0
  }
  if(debug):
    print("\n\nStarting Cross Validation\n\n")
  for i, f in enumerate(folds): 
    dataset_copy         =  list(folds)
    #Define set T as the I-th fold of the dataset D
    test_fold            =  dataset_copy.pop(i)  
    #Define set L as the dataset D without the I-th fold
    dataset_without_fold =  dataset_copy       
    dataset_without_fold =  np.array(dataset_without_fold).reshape((-1, data.shape[1]))
    #Iterate through all the features to select only the bests
    for feature_index in range(1, dataset_without_fold.shape[1] - 1) : 
      #Exclude the last column because is labels
      #Append the next feature index to be tested
      bestEpisodeValues = {
        "fSelectedIndexes": np.append( bestValues["fSelectedIndexes"], feature_index ),   
        "Precision" :       0.0,
        "Accuracy"  :       0.0,
        "mse"       :       9999.0
      }
      #Create Dataset Only With The Testing Feature Indexes
      X = dataset_without_fold[:, bestEpisodeValues["fSelectedIndexes"] ]
      y = dataset_without_fold[:, -1].reshape(-1)
      test_data       = test_fold[:, bestEpisodeValues["fSelectedIndexes"] ]
      test_true_label = test_fold[:, -1].reshape(-1)
      #Iterate through all the possible K values in every Feature Test and every Fold Test
      for k in range(3, K, 2):
        clf                           = KNeighborsClassifier(n_neighbors=k)
        training_acc                  = clf.fit(X, y).score(X, y)
        # predictions                   = clf.predict(test_data)
        predictions                   = clf.predict_proba(test_data)
        accuracy                      = accuracy_score(test_true_label, predictions)
        precision, recall, thresholds = precision_recall_curve( test_true_label, predictions)
        ave_precision                 = average_precision_score(test_true_label, predictions)
        mse                           = mean_squared_error(test_true_label, predictions)
        #If the accuracy for this model is better than the previous, replace
        if( accuracy > bestEpisodeValues["Accuracy"] ):
          bestEpisodeValues["k"]                = k 
          bestEpisodeValues["featureIndex"]     = feature_index 
          bestEpisodeValues["testFoldIndex"]    = i
          bestEpisodeValues["Precision"]        = ave_precision
          bestEpisodeValues["Accuracy"]         = accuracy 
          bestEpisodeValues["Recall"]           = recall
          bestEpisodeValues["mse"]              = mse 
      #If the model selected for this testing set of features is better than the previous
      #Replace K value and append new feature index, in the list of best features
      if( bestEpisodeValues["Accuracy"] > bestValues["Accuracy"] ):
        bestValues["k"]                = bestEpisodeValues["k"]  
        if bestEpisodeValues["featureIndex"] not in bestValues["fSelectedIndexes"]:
          bestValues["fSelectedIndexes"] =   np.append(bestValues["fSelectedIndexes"], bestEpisodeValues["featureIndex"] ) 
        bestValues["featureIndex"]     = bestEpisodeValues["featureIndex"] 
        bestValues["testFoldIndex"]    = bestEpisodeValues["testFoldIndex"] 
        bestValues["Precision"]        = bestEpisodeValues["Precision"] 
        bestValues["Accuracy"]         = bestEpisodeValues["Accuracy"] 
        bestValues["Recall"]           = bestEpisodeValues["Recall"] 
        bestValues["mse"]              = bestEpisodeValues["mse"] 
    if(debug):
      print("FoldCounter\t{}\tSelected_K\t{}\tSelected_Features\t{}\tModel_Acc\t{:.5f}".format(
          i,
          bestValues["k"] , 
          bestValues["fSelectedIndexes"] , 
          bestValues["Accuracy"]
      ))
    #Save the best one per fold, for debugging purposes
    bestPerFold = np.append(bestPerFold, bestValues.copy)  
  return bestValues

In [0]:
def PlotCurves(folds, bestModel, data ):
  print("Ploting Precision-Recall & ROC Curve (Process 6 of 7)")
  #Global Containers
  ave_acc = np.array([])
  ave_pre = np.array([]).reshape(0,3)
  ave_rec = np.array([]).reshape(0,3)
  ave_roc = np.array([]).reshape(0,3)
  ave_mse = np.array([])
  tot_ave_pre = np.array([])
  ave_metrics = np.array([]).reshape(0,4)
  #GraphPointers
  pre_rec_fig = plt.figure()
  roc_fig     = plt.figure()
  pre_rec_ax = pre_rec_fig.gca()
  roc_ax     = roc_fig.gca()
  #Iterate all Folds
  for i, f in enumerate(folds):
    #Prepare Data as in CV function, but this time with the Selected Best Features
    dataset_copy         =  list(folds)
    test_fold            =  dataset_copy.pop(i)  
    dataset_without_fold =  dataset_copy         
    dataset_without_fold =  np.array(dataset_without_fold).reshape((-1, data.shape[1]))
    X                    = dataset_without_fold[:, bestModel["fSelectedIndexes"] ]
    y                    = dataset_without_fold[:, -1].reshape(-1)
    test_data            = test_fold[:, bestModel["fSelectedIndexes"] ]
    test_true_label      = test_fold[:, -1].reshape(-1)
    #Train Per Fold with Best Features to get Average Scores
    clf                           = KNeighborsClassifier(n_neighbors=bestModel["k"])
    clf.fit(X, y) 
    predictions                   = clf.predict(test_data)
    #Scores
    accuracy                      = accuracy_score(test_true_label, predictions)
    precision, recall, thresholds = precision_recall_curve( test_true_label, predictions)
    fpr, tpr, thresholds          = roc_curve( test_true_label, predictions)
    roc_auc                       = auc(fpr, tpr)
    ave_precision                 = average_precision_score(test_true_label, predictions)
    mse                           = mean_squared_error(test_true_label, predictions)
    tn, fp, fn, tp                = confusion_matrix(test_true_label, predictions).ravel()
    #Plots Per Folds
    pre_rec_ax.plot(precision, recall, lw=1, alpha=0.3,
              label='Precision-Recall fold %d (Ave. Precision = %0.2f)' % (i, ave_precision))
    roc_ax.plot(fpr, tpr, lw=1, alpha=0.3,
              label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    #Append to Global Containers
    ave_acc     = np.append(ave_acc, accuracy )
    ave_pre     = np.r_[ave_pre, np.array([precision]) ]
    ave_rec     = np.r_[ave_rec, np.array([recall])    ]
    tot_ave_pre = np.append(tot_ave_pre, ave_precision )
    ave_mse     = np.append(ave_mse, mse )
    ave_metrics = np.r_[ave_metrics, np.array([[ tn, fp, fn, tp ]])] 
    if(debug):
      print("Fold#{}\taccuracy\t{:5f}\tprecision\t{}".format(i, accuracy, tot_ave_pre))
  # if(debug):
  #   print("\n\nAverage_Accuracy\t{:5f}\tAverage_Precision\t{}\tAverage_Recall\t{}\tAverage_MSE\t{}\tAve_Metrics\t{}\n\n".format(
  #       ave_acc.mean(), ave_pre.mean(0), ave_rec.mean(0), ave_mse.mean(),ave_metrics.mean(0)  ))
  pre_rec_fig.legend(loc="upper right")
  pre_rec_ax.set_xlabel('Recall')
  pre_rec_ax.set_ylabel('Precision')
  pre_rec_ax.set_title('Precision-Recall Curve')
  roc_fig.legend(loc="lower right")
  roc_ax.set_xlabel('False Positive Rate')
  roc_ax.set_ylabel('True Positive Rate')
  roc_ax.set_ylabel('ROC Curve')
  return ave_metrics, ave_acc, ave_pre, ave_rec, tot_ave_pre, ave_mse


In [0]:
def PlotKMapMetrics(folds, bestModel, data):
  print("Rendering Confusion Matrix (Process 7 of 7)")
  k_fig = plt.figure()
  k_ax     = k_fig.gca()
  #Data Preparation
  k_range = range(3, K, 2) 
  dataset_copy         =  list(folds)
  #Testing Using Fold 0 as Test Data
  test_fold            =  dataset_copy.pop(0)  
  dataset_without_fold =  dataset_copy         
  dataset_without_fold =  np.array(dataset_without_fold).reshape((-1, data.shape[1]))
  X                    = dataset_without_fold[:, bestModel["fSelectedIndexes"] ]
  y = dataset_without_fold[:, -1].reshape(-1)
  test_data            = test_fold[:, bestModel["fSelectedIndexes"] ]
  test_true_label      = test_fold[:, -1].reshape(-1)
  #Average Data Containers
  all_train_acc = np.array([])
  all_test_acc = np.array([])
  #Iterate all K's using the Selected Features and Fold 0 as Test Data
  for k in k_range:
    clf            = KNeighborsClassifier(k)
    train_acc      = clf.fit(X, y).score(X, y) 
    predictions    = clf.predict(test_data)
    test_acc       = accuracy_score(test_true_label, predictions)
    all_train_acc = np.append(all_train_acc, train_acc)
    all_test_acc  = np.append(all_test_acc , test_acc)
  k_ax.plot( k_range , all_train_acc)
  k_ax.plot( k_range , all_test_acc)
  k_fig.legend(('Train', 'Test'), loc='upper right')
  k_ax.set_ylabel('Accuracy')
  k_ax.set_xlabel('K Value')
  # k_ax.set_xlim(0, 1)
  # k_ax.set_ylim(0, 1)
  k_ax.set_title('Possible K´s with selected features: {}  Final K: {}'.format(bestModel["fSelectedIndexes"], bestModel["k"] ))


In [0]:
def ConfusionMatrix(tn, fp, fn, tp):
  conf_fig    = plt.figure()
  conf_ax     = conf_fig.gca()
  data = [ [tp, fp], [fn, tn] ]
  columns = ('Positive', 'Negative')
  rows    = ('Positive', 'Negative')
  table = conf_ax.table(
    cellText=data,
    rowLabels=rows,
    colLabels=columns,
    loc='top')
  conf_fig.tight_layout()
  conf_fig.patch.set_visible(False)
  conf_ax.axis('off')
  conf_ax.axis('tight')
  # conf_ax.set_ylabel('Predicted')
  # conf_ax.set_xlabel('True Classes')
  # conf_ax.set_title("Confusion Matrix")
  margins = { "top" : 0.863, "bottom" : 0.081, "left": 0.141, "right":0.977, "hspace":0.31, "wspace":0.255}
  conf_fig.subplots_adjust(**margins)

  print("\n\n\n\nConfusion Matrix\n")
  df = pd.DataFrame({
      "Predicted": ["Positive", "Negative" ],
      "Positive": [tp, fp],
      "Negative": [fn, tn]
  })
  df.set_index("Predicted",drop=True,inplace=True)
  print("\t\tTrue Class\n",df.to_string())

In [0]:
# filename               = sys.argv[1]                 #Import data filename
filename               = "A2_t2_dataset.tsv"                 #Import data filename

In [16]:
#Upload dataset
from google.colab import files
import os  

def upload(filename):

  if not (os.path.isfile(filename)):
    uploaded = files.upload()

    for fn in uploaded.keys():
      print('User uploaded file "{name}" with length {length} bytes'.format(
          name=fn, length=len(uploaded[fn])))
  else:
    print(filename+" on drive")

upload(filename)   

A2_t2_dataset.tsv on drive


In [17]:
data                   = loadDataset(filename)

Loading Dataset (Process 1 of 7)


In [18]:
data                   = OrderByVariance(data)

Order and Filter By Variance (Process 2 of 7)


In [19]:
folds, n_rows_per_fold = EqualyDistributePositiveAndNegativeAndSplit(data)

Cross Validation Split (Process 3 of 7)


In [20]:
bestModel              = CrossValidation(folds, data, K)

Cross Validation-Selecting Best Model (Process 4 of 7)


ValueError: ignored

In [0]:
ave_metrics,_,_,_,_,_  = PlotCurves(     folds, bestModel, data )            #Ploting
tn, fp, fn, tp         = ave_metrics.mean(0)

In [0]:
PlotKMapMetrics(folds, bestModel, data)

In [0]:
ConfusionMatrix( tn, fp, fn, tp)

In [0]:
print("\n\nBest_Model\tK_Value\t{}\tSelected_Features\t{}\tAccuracy\t{}".format(
  bestModel["k"], bestModel["fSelectedIndexes"], bestModel["Accuracy"]))
plt.show()