##ML Assignment 3 for Group-5
dataset used [link](https://www.kaggle.com/mathchi/diabetes-data-set)

In [1]:
# imports used in the code 

import pandas as pd
import numpy as np
import random as rd
import math
import statistics
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

# For splitting data
from sklearn.model_selection import train_test_split

# For normalizing data
from sklearn.preprocessing import MinMaxScaler
# PCA
from sklearn.decomposition import PCA

# LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# SVM
from sklearn.svm import SVC

# For plotting purposes
import plotly.express as px

In [2]:
# Reading the data and creating dataframe
df = pd.read_csv("diabetes.csv")

# The data has several missing values, so we replace it with the average values of that feature
df.fillna(df.mean())

# printing the df
# df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
# Slice the dataset into Features(X) and Final Outcome(Y)
X=df.iloc[:,:-1]
Y=df.iloc[:,-1]

In [4]:
# Normalize the features
X[X.columns] = MinMaxScaler().fit_transform(X)

In [5]:
# randomly split it into train, validation and test part.
# The ratio of the train, validation and test splits is 70 : 10 : 20 respectively.

def train_valid_test_split(X, Y):
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
  X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.125, random_state=1)
  return X_train, Y_train, X_val, Y_val, X_test, Y_test

Get the training, validation and Testing sets for operations:

In [6]:
# Get the training, validation and Testing sets for operations:

X_train, Y_train, X_valid, Y_valid, X_test, Y_test = train_valid_test_split(X,Y)

# since, random rows have been selected... we need to reset the indices so that the training, tests and validation X and Y
# are aligned w.r.t each other

X_train=X_train.reset_index(drop=True)
Y_train=Y_train.reset_index(drop=True)
X_valid=X_valid.reset_index(drop=True)
Y_valid=Y_valid.reset_index(drop=True)
X_test=X_test.reset_index(drop=True)
Y_test=Y_test.reset_index(drop=True)

### Using PCA for reducing Dimensions into 2-D feature space:


In [7]:
# Checking the size of X_train before applying PCA
print(f"X_train dimension before applying PCA = {X_train.shape}")

# Applying PCA such that the n_components are 2 and then to print the PCA in a 2-dimensional plane
pca = PCA(n_components=2)

# Fit the training set with PCA
X_train_pca = pca.fit_transform(X_train)

# Updated X_train after applying the PCA to reduce its dimensions
print(f"X_train dimension after applying PCA = {X_train_pca.shape}")

# See the variance ratio contributed by PC1 and PC2
print(f"Explained_variance_ratio = {pca.explained_variance_ratio_}")

# converting the PCA X back into Dataframe for computaion purposes
df_of_X_train_pca = pd.DataFrame(X_train_pca, columns = ["PC1", "PC2"])

# Concating the Y_train dataframe to get a combined dataframe with PC1, PC2 and the Y_train
df_of_train_scores_afterPCA = pd.concat([df_of_X_train_pca,Y_train], axis=1)

X_train dimension before applying PCA = (537, 8)
X_train dimension after applying PCA = (537, 2)
Explained_variance_ratio = [0.31411269 0.21153548]


In [8]:
df_of_train_scores_afterPCA

Unnamed: 0,PC1,PC2,Outcome
0,-0.095965,0.207448,0
1,-0.254032,0.068722,1
2,0.192237,-0.136709,1
3,-0.153368,-0.053380,1
4,-0.334383,-0.234876,0
...,...,...,...
532,0.163619,0.007829,1
533,-0.266531,0.078115,0
534,-0.167429,0.047636,0
535,-0.026553,-0.157012,0


Plotting the reduced dimensional data of the train split in a 2d plane:

In [9]:
PCA_plot = px.scatter(df_of_train_scores_afterPCA, x='PC1', y='PC2',
              color='Outcome')
PCA_plot.update_layout(template='plotly_white')

## Training SVM Classifier on the reduced dimension by PCA:

In [10]:
# create a list of hyperparameters 'C', 'gamma' and the 'degree' for a list of kernel values 
# to find out the combination giving the maximum accracy on the validation set

C_list=range(1,10,2)
gamma_list=range(1,6,2)
degree_list=range(1,6,2)
kernel_list=["linear", "poly", "rbf", "sigmoid"]

# Create a list to store the scores for different combintions
score_list=[]

# generating scores by iterating over the hyperparameters for different kernel types

for kernel in kernel_list:
  for C in C_list:
    for gamma in gamma_list:
        for degree in degree_list:
            dict={}
            dict["kernel"]=kernel
            dict["C"]=C
            dict["gamma"]=gamma
            dict["degree"]=degree

            # applying SVM classifier for the current hyperparameters
            model = SVC(C=C,kernel=kernel,gamma=gamma, degree = degree)
            model.fit(df_of_X_train_pca, Y_train)
            X_valid_pca = pca.transform(X_valid)

            # also storing the current model and its validation score as we will need to test the test_set 
            # on the model with the highest accuracy on X_validation
            dict["model"]=model
            dict["validation score"]=model.score(X_valid_pca, Y_valid)
            
            # update the score_list with all the current values
            score_list.append(dict)
# convert the list into a dataframe for better visualization
table=pd.DataFrame(score_list)

# getting the row with the maximum validation score
max_row = table.loc[table['validation score'].idxmax()]
print(f"Maximum validation score is obtained for \n: {max_row}")

# storing the best model in the list for testing the test_set
max_model=max_row.model

Maximum validation score is obtained for 
: kernel                                                           poly
C                                                                   3
gamma                                                               5
degree                                                              3
model               SVC(C=3, break_ties=False, cache_size=200, cla...
validation score                                             0.727273
Name: 61, dtype: object


In [11]:
# Printing all the cominations used and their scores in a table
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(table.loc[:, table.columns != "model"])

Unnamed: 0,kernel,C,gamma,degree,validation score
0,linear,1,1,1,0.688312
1,linear,1,1,3,0.688312
2,linear,1,1,5,0.688312
3,linear,1,3,1,0.688312
4,linear,1,3,3,0.688312
5,linear,1,3,5,0.688312
6,linear,1,5,1,0.688312
7,linear,1,5,3,0.688312
8,linear,1,5,5,0.688312
9,linear,3,1,1,0.701299


SVM result on the test set:

In [12]:
# Calculating accuracy on the test set based on the max validation parameters :
X_test_pca = pca.transform(X_test)
max_model.score(X_test_pca, Y_test)

0.6818181818181818

### Using LDA for reducing Dimensions:


In [13]:
lda = LinearDiscriminantAnalysis(n_components=1)

# Fit the training set with PCA
X_train_lda = lda.fit(X_train, Y_train).transform(X_train)

# Updated X_train dimensions after applying the LDA to reduce its dimensions
print(f"X_train dimension after applying LDA = {X_train_lda.shape}")

# See the variance ratio contributed by LD1
print(f"Explained_variance_ratio = {lda.explained_variance_ratio_}")

# converting the X_train_LDA back into Dataframe for computaion purposes
df_of_X_train_lda = pd.DataFrame(X_train_lda, columns = ["LD1"])

# Concating the Y_train dataframe to get a combined dataframe with LD1 and the Y_train
df_of_train_scores_afterLDA = pd.concat([df_of_X_train_lda,Y_train], axis=1)

X_train dimension after applying LDA = (537, 1)
Explained_variance_ratio = [1.]


In [14]:
df_of_train_scores_afterLDA

Unnamed: 0,LD1,Outcome
0,0.723125,0
1,-1.087577,1
2,1.454740,1
3,0.754996,1
4,-0.302462,0
...,...,...
532,-0.213644,1
533,0.170812,0
534,-0.895499,0
535,-1.080423,0


Plotting the reduced dimensional data of the train split in a 2d plane:

In [15]:
LDA_plot = px.scatter(df_of_train_scores_afterLDA, x='LD1', y='LD1',
              color='Outcome', opacity = 0.4)
LDA_plot.update_layout(template='plotly_white')

## Training SVM Classifier on the reduced dimension by LDA:

In [16]:
# create a list of hyperparameters 'C', 'gamma' and the 'degree' for a list of kernel values 
# to find out the combination giving the maximum accracy on the validation set

C_list=range(1,10,2)
gamma_list= np.arange(0.1,1,0.2)
degree_list=range(1,6,2)
kernel_list=["linear", "poly", "rbf", "sigmoid"]
# Create a list to store the scores for different combintions
score_list=[]

# generating scores by iterating over the hyperparameters for different kernel types

for kernel in kernel_list:
  for C in C_list:
    for gamma in gamma_list:
        for degree in degree_list:
            dict={}
            dict["kernel"]=kernel
            dict["C"]=C
            dict["gamma"]=gamma
            dict["degree"]=degree

            # applying SVM classifier for the current hyperparameters
            model = SVC(C=C,kernel=kernel,gamma=gamma, degree = degree)
            model.fit(df_of_X_train_lda, Y_train)
            X_valid_lda = lda.transform(X_valid)

            # also storing the current model and its validation score as we will need to test the test_set 
            # on the model with the highest accuracy on X_validation
            dict["model"]=model
            dict["validation score"]=model.score(X_valid_lda, Y_valid)

            # update the score_list with all the current values
            score_list.append(dict)
# convert the list into a dataframe for better visualization
table=pd.DataFrame(score_list)

# getting the row with the maximum validation score
max_row = table.loc[table['validation score'].idxmax()]
print(f"Maximum validation score is obtained for \n: {max_row}")

# storing the best model in the list for testing the test_set
max_model=max_row.model

Maximum validation score is obtained for 
: kernel                                                           poly
C                                                                   1
gamma                                                             0.1
degree                                                              1
model               SVC(C=1, break_ties=False, cache_size=200, cla...
validation score                                             0.792208
Name: 75, dtype: object


In [17]:
# Printing all the cominations used and their scores in a table
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(table.loc[:, table.columns != "model"])

Unnamed: 0,kernel,C,gamma,degree,validation score
0,linear,1,0.1,1,0.779221
1,linear,1,0.1,3,0.779221
2,linear,1,0.1,5,0.779221
3,linear,1,0.3,1,0.779221
4,linear,1,0.3,3,0.779221
5,linear,1,0.3,5,0.779221
6,linear,1,0.5,1,0.779221
7,linear,1,0.5,3,0.779221
8,linear,1,0.5,5,0.779221
9,linear,1,0.7,1,0.779221


SVM result on the test set:

In [18]:
# Calculating accuracy on the test set based on the max validation parameters :
X_test_lda = lda.transform(X_test)
max_model.score(X_test_lda, Y_test)

0.7857142857142857