In [1]:
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import numpy as np
import math
import matplotlib.pyplot as plt
import os


In [2]:
confusion_avg = np.zeros([2,2]) 
l1=[]
l2=[]
l3=[]

In [3]:
def normpdf(x, mean, sd):
    var = float(sd)**2
    denom = (2*math.pi*var)**.5
    num = math.exp(-(float(x)-float(mean))**2/(2*var))
    return num/denom


def reduce_dim(x_train,y_train):
    lda = LDA(n_components=1)
    x_train_lda = lda.fit_transform(x_train, y_train)
    x_test_lda = lda.transform(x_test)
    
    return x_train_lda,x_test_lda

def mean_std(x_train_lda,y_train):
    m0=0 # 0 = benign , 1 = malignant
    m1=0
    s0=0
    s1=0


    mal=ben=0
    for i in y_train:
        if i==0:
            ben+=1
        else:
            mal+=1

    benign=np.zeros(shape=(ben))
    malignant= np.zeros(shape=(mal))    
    m=0
    n=0

    for i in range(len(y_train)):
        if y_train[i]==0:
            benign[m]=x_train_lda[i]
            m+=1
        else:
            malignant[n]=x_train_lda[i]
            n+=1
    m0=np.mean(benign)
    m1=np.mean(malignant)
    s0=np.std(benign)
    s1=np.std(malignant)
    
    return m0,m1,s0,s1,ben,mal

def predict(x_test_lda,m0,s0,m1,s1,ben,mal):
    y_predicted=np.zeros(len(x_test_lda))

    for i in range(len(y_predicted)):     
        pb=normpdf(x_test_lda[i],m0,s0)*((ben)/(ben+mal))
        pm=normpdf(x_test_lda[i],m1,s1)*((mal)/(ben+mal))
      
        if pb>pm:
            y_predicted[i]=0
        else:
            y_predicted[i]=1

    return y_predicted
    
    
def confusion_matrix(y_predicted,y_test):
    confusion=np.zeros((2,2))
    tn=tp=fn=fp=0
    
    success=failure=0
    for i in range(len(y_predicted)):
        if y_predicted[i]==y_test[i]:
            success=success+1
        else:
            failure +=1
    for i in range(len(y_predicted)):
        if y_predicted[i]==0 and y_test[i]==0:
            tn+=1
        elif  y_predicted[i]==1 and y_test[i]==1:
            tp+=1
        elif  y_predicted[i]==0 and y_test[i]==1:
            fn+=1
        elif  y_predicted[i]==1 and y_test[i]==0:
            fp+=1
    confusion[0,0]=tn 
    confusion[0,1]=fp
    confusion[1,0]=fn
    confusion[1,1]=tp
    accuracy=(tp+tn)/len(y_predicted)
    precision=tp/(tp+fp)
    recall=tp/(tp+fn)
    
    return confusion, accuracy, precision, recall
                

In [4]:
N=10
for count in range(N):
    print("ITERATION :", count+1)

    working_directory = os.getcwd()
    path = working_directory + '/Data set.csv'   ##initialising path to data file, rename to fit in with your pc
    df = pd.read_csv(path)
    df.drop(columns='id',inplace = True)    #dropping column id as it might give incorrect correlations
    
    for col in df.columns:    #filling in the missing values with the mean of that column
        if col!= 'diagnosis':
            mean=df[col].mean()
            df[col] = df[col].fillna(mean)
        else:
            df.loc[df[col]== 'M',col]= 1 #benign =0 ,malignant=1
            df.loc[df[col]== 'B',col] = 0

    for column in df.columns:   #normalization
        if column != 'id' and column!= 'diagnosis':
            df[column] = (df[column] - df[column].mean()) / df[column].std()

    df = df.sample(frac=1) #shuffles the data
    df2=df   #copying the row shuffled df to df2


    ratio = 0.67
    total_rows = df.shape[0]
    train_size = int(total_rows*ratio)
    train = df[0:train_size]
    test = df[train_size:]

    target='diagnosis'
    y_train=train[target]
    y_test=test[target]
    y_train=y_train.astype('int')
    y_test=y_test.astype('int')
    train.drop(columns=target,axis=1)
    test.drop(columns=target,axis=1)
    x_train=train
    x_test=test


    y_train=np.array(y_train)
    y_test=np.array(y_test)

    x_train_lda,x_test_lda=reduce_dim(x_train,y_train)

    m0,m1,s0,s1,ben,mal= mean_std(x_train_lda,y_train)

    y_predicted= predict(x_test_lda,m0,s0,m1,s1,ben,mal)

    confusion, accuracy, precision, recall= confusion_matrix(y_predicted,y_test)
    print("FLDM 1 ")
    print("")
    print("CONFUSION MATRIX")
    print(confusion)
    print("ACCURACY =", accuracy)
    print("PRECISION =", precision)
    print("RECALL =", recall)
    print("")

    ##shuffling the columns and running the same algorithm

    #FLDM2

    print("FLDM 2 : SHUFFLING THE COLUMNS")
    print("")
    df2=df2.sample(frac=1, axis=1)
    ratio = 0.67
    total_rows = df2.shape[0]
    train_size = int(total_rows*ratio)
    train = df2[0:train_size]
    test = df2[train_size:]

    target='diagnosis'
    y_train=train[target]
    y_test=test[target]
    y_train=y_train.astype('int')
    y_test=y_test.astype('int')
    train.drop(columns=target,axis=1)
    test.drop(columns=target,axis=1)
    x_train=train
    x_test=test

    y_train=np.array(y_train)
    y_test=np.array(y_test)



    x_train_lda,x_test_lda=reduce_dim(x_train,y_train)

    m0,m1,s0,s1,ben,mal= mean_std(x_train_lda,y_train)

    y_predicted= predict(x_test_lda,m0,s0,m1,s1,ben,mal)

    confusion, accuracy, precision, recall= confusion_matrix(y_predicted,y_test)
    
    l1.append(accuracy)
    l2.append(precision)
    l3.append(recall)

    print("CONFUSION MATRIX")
    print(confusion)
    print("ACCURACY =", accuracy)
    print("PRECISION =", precision)
    print("RECALL =", recall)
    
    
    print("")
    
    
    confusion_avg+= (confusion)/N
  

ITERATION : 1
FLDM 1 

CONFUSION MATRIX
[[121.   1.]
 [  6.  60.]]
ACCURACY = 0.9627659574468085
PRECISION = 0.9836065573770492
RECALL = 0.9090909090909091

FLDM 2 : SHUFFLING THE COLUMNS

CONFUSION MATRIX
[[121.   1.]
 [  6.  60.]]
ACCURACY = 0.9627659574468085
PRECISION = 0.9836065573770492
RECALL = 0.9090909090909091

ITERATION : 2
FLDM 1 

CONFUSION MATRIX
[[115.   4.]
 [  5.  64.]]
ACCURACY = 0.9521276595744681
PRECISION = 0.9411764705882353
RECALL = 0.927536231884058

FLDM 2 : SHUFFLING THE COLUMNS

CONFUSION MATRIX
[[115.   4.]
 [  5.  64.]]
ACCURACY = 0.9521276595744681
PRECISION = 0.9411764705882353
RECALL = 0.927536231884058

ITERATION : 3
FLDM 1 

CONFUSION MATRIX
[[115.   1.]
 [  8.  64.]]
ACCURACY = 0.9521276595744681
PRECISION = 0.9846153846153847
RECALL = 0.8888888888888888

FLDM 2 : SHUFFLING THE COLUMNS

CONFUSION MATRIX
[[115.   1.]
 [  8.  64.]]
ACCURACY = 0.9521276595744681
PRECISION = 0.9846153846153847
RECALL = 0.8888888888888888

ITERATION : 4
FLDM 1 

CONFUSION 

In [5]:
##same confusion matrix as both FDLM1 and FDLM2 are have same results

accuracy_avg=  (confusion_avg[0,0]+confusion_avg[1,1])/np.sum(confusion_avg)
precision_avg= confusion_avg[0,0]/(confusion_avg[0,0]+confusion_avg[0,1])
recall_avg= confusion_avg[0,0]/(confusion_avg[0,0]+confusion_avg[1,0])
print("AVERAGE CONFUSION MATRIX",confusion_avg)
print("AVERGAE ACCURACY:",accuracy_avg)
print("AVERAGE PRECISION:",precision_avg)
print("AVERAGE RECALL",recall_avg)                                   


AVERAGE CONFUSION MATRIX [[114.9   1.7]
 [  6.6  64.8]]
AVERGAE ACCURACY: 0.9558510638297871
AVERAGE PRECISION: 0.9854202401372213
AVERAGE RECALL 0.9456790123456791


In [6]:
l1=np.array(l1)
l2=np.array(l2)
l3=np.array(l3)

std_1=np.std(l1,axis=0)
std_2=np.std(l2,axis=0)
std_3=np.std(l3,axis=0)

print("ACCURACY VARIANCE: ",std_1)
print("PRECISION VARIANCE: ",std_2)
print("RECALL VARIANCE: ",std_3)

ACCURACY VARIANCE:  0.011665804361415589
PRECISION VARIANCE:  0.014808301368943335
RECALL VARIANCE:  0.024086776605626345


# Q) The only difference between FDLM1 and FDLM2 is that in FDLM2 the columns are shuffled (we use .sample). As we can see from above results the accuracy, precision and recall are the same in both FDLM1 as well as FDLM2 for each iteration- this is because in either case all the columns are iterated through regardless because we use the same test, train row split for both of the models.