In [20]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report,confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

In [21]:
# Loading datasets

# Directory holding the imputed CSV files. Kept in one place so the notebook
# can be pointed at another machine/location by editing a single line.
DATA_DIR = r'D:\Courses\Data-Science_Tech\INeuron_Courses\MACHING_LEARNING_LIVE-CLASS\Research Papers\HealthCare_Domain\Chronic_Kidney_Disease_Prediction\WORKING\CDK_Prediction\data'


def _load_imputed(file_name):
    """Read one imputed dataset CSV located in ``DATA_DIR``.

    Parameters
    ----------
    file_name : str
        Bare file name of the CSV (for example ``'df_mean_imputed.csv'``).

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    """
    # Plain concatenation with a backslash reproduces exactly the Windows
    # paths that were previously written out in full.
    return pd.read_csv(DATA_DIR + '\\' + file_name)


df_knn_imputed_uniform_3 = _load_imputed('df_knn_imputed_uniform_3.csv')
df_knn_imputed_uniform_5 = _load_imputed('df_knn_imputed_uniform_5.csv')
df_knn_imputed_uniform_7 = _load_imputed('df_knn_imputed_uniform_7.csv')
df_knn_imputed_uniform_9 = _load_imputed('df_knn_imputed_uniform_9.csv')
df_knn_imputed_uniform_11 = _load_imputed('df_knn_imputed_uniform_11.csv')
df_mean_imputed = _load_imputed('df_mean_imputed.csv')

In [22]:
# Inspect the dtype of every column in the mean-imputed dataset
# (the bare expression is displayed as the cell output).
df_mean_imputed.dtypes

rbc      float64
pc       float64
pcc      float64
ba       float64
htn      float64
dm       float64
cad      float64
appet    float64
pe       float64
ane      float64
class      int64
age      float64
bp       float64
sg       float64
al       float64
su       float64
bgr      float64
bu       float64
sc       float64
sod      float64
pot      float64
hemo     float64
pcv      float64
wbcc     float64
rbcc     float64
dtype: object

### Applying the models (Logistic Regression, KNN, Random Forest, SVM, Naive Bayes) to the KNN-imputed and mean-imputed datasets

In [23]:
# Datasets to evaluate, each paired with the label used for it in the result
# tables. Writing them as (label, frame) pairs keeps the two lists aligned.

dfs_name, dfs = map(list, zip(
    ("knn_imputed_3", df_knn_imputed_uniform_3),
    ("knn_imputed_5", df_knn_imputed_uniform_5),
    ("knn_imputed_7", df_knn_imputed_uniform_7),
    ("knn_imputed_9", df_knn_imputed_uniform_9),
    ("knn_imputed_11", df_knn_imputed_uniform_11),
    ("mean_imputed", df_mean_imputed),
))

In [30]:
# Multiple models selection loop and respective train-test split, training,testing,metrics recording

def select_model(x, random_state=None):
    """Train one classifier on every dataset in ``dfs`` and record its metrics.

    Parameters
    ----------
    x : int
        Model selector: 0 = LogisticRegression, 1 = KNeighborsClassifier,
        2 = RandomForestClassifier (100 trees), 3 = SVC with an RBF kernel.
        Any other value selects GaussianNB.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split`` and to the random forest.
        ``None`` keeps the previous unseeded behaviour; pass an integer to
        make the reported numbers reproducible.

    Returns
    -------
    dict
        Maps each label in ``dfs_name`` to the list
        ``[train_acc, test_acc, tn, fp, fn, tp]`` computed for that dataset.

    Notes
    -----
    Reads the notebook-level lists ``dfs`` and ``dfs_name``. Unpacking the
    confusion matrix into four values requires a 2x2 matrix, i.e. a binary
    target with both classes present among the test labels/predictions.
    """
    if x == 0:
        classifier = LogisticRegression()
    elif x == 1:
        classifier = KNeighborsClassifier()
    elif x == 2:
        classifier = RandomForestClassifier(n_estimators=100, random_state=random_state)
    elif x == 3:
        classifier = SVC(kernel='rbf', random_state=None)
    else:
        classifier = GaussianNB()

    res = {}

    for name, df in zip(dfs_name, dfs):
        # Distinct local names: the selector argument `x` is no longer
        # overwritten by the feature frame.
        features = df.drop('class', axis=1)
        target = df['class']

        # Splitting data and shuffle as true as our data classification is ordered
        x_train, x_test, y_train, y_test = train_test_split(
            features, target, test_size=0.33, shuffle=True, random_state=random_state
        )

        # Applying Standard scaler: the statistics are learned from the
        # training split only and then re-used for the test split. Fitting a
        # second time on the test split would scale it differently from the
        # data the model was trained on and leak test information.
        scaler = StandardScaler()
        x_train_scaled = scaler.fit_transform(x_train)
        x_test_scaled = scaler.transform(x_test)

        # Fitting model
        classifier.fit(x_train_scaled, y_train)

        # Train and Test Prediction
        train_pred = classifier.predict(x_train_scaled)
        test_pred = classifier.predict(x_test_scaled)

        # Train and test Accuracy
        train_acc = accuracy_score(y_train, train_pred)
        test_acc = accuracy_score(y_test, test_pred)

        # Confusion matrix metrics
        tn, fp, fn, tp = confusion_matrix(y_test, test_pred).ravel()

        # Logging all above metrics into one table res
        res[name] = [train_acc, test_acc, tn, fp, fn, tp]

    return res





    



In [32]:
# Evaluate each model on all datasets and print one metrics table per model.
# The order of MODEL_NAMES must match the selector values understood by
# select_model (0..4). Iterating over this list, rather than range(6), stops
# the loop from running a sixth pass that re-trained Naive Bayes.
MODEL_NAMES = ["Logistic Regression", "KNN", "Random Forest", "SVM", "Naive Bayes"]
METRIC_COLUMNS = ['train_acc', 'test_acc', 'true_neg', 'false_pos', 'false_neg', 'true_pos']

for i, model_name in enumerate(MODEL_NAMES):
    result = select_model(i)
    res = pd.DataFrame.from_dict(data=result, orient='index', columns=METRIC_COLUMNS)

    # Bold the whole heading and reset the attribute afterwards so the table
    # that follows is not rendered in bold as well.
    print("\033[1m{}. PERFORMANCE INDICATORS for {}\033[0m\n".format(i + 1, model_name))

    print(res, '\n')

[1m 1 .PERFORMANCE INDICATORS for Logistic Regression 

                train_acc  test_acc  true_neg  false_pos  false_neg  true_pos
knn_imputed_3    0.992537  0.992424        48          0          1        83
knn_imputed_5    1.000000  0.984848        50          2          0        80
knn_imputed_7    0.996269  0.977273        45          1          2        84
knn_imputed_9    1.000000  0.977273        52          2          1        77
knn_imputed_11   0.996269  0.984848        54          2          0        76
mean_imputed     1.000000  1.000000        51          0          0        81 

2 [1m.PERFORMANCE INDICATORS for KNN 

                train_acc  test_acc  true_neg  false_pos  false_neg  true_pos
knn_imputed_3    0.988806  0.924242        44          0         10        78
knn_imputed_5    0.970149  0.969697        47          0          4        81
knn_imputed_7    0.981343  0.962121        42          0          5        85
knn_imputed_9    0.981343  0.946970        

### The metrics of the various models are logged above without using feature selection. It can be observed that model performance can still be improved. This can be done by using feature selection to pick the most important features and then building the models on those features.