In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix 
from sklearn.decomposition import PCA

from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import plot_roc_curve

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the kidney-disease CSV inside the Kaggle input directory.
path = '/kaggle/input/ckdisease/kidney_disease.csv'

# Read the raw table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

In [None]:
# Dataset clean-up: give the target column a shorter name, discard the row
# identifier, and turn the textual labels into numeric codes.
df = (
    df.rename(columns={'classification': 'class'})
      .drop('id', axis=1)
)

# 'ckd' (including the variant with a trailing tab) becomes 1.0;
# 'notckd' and 'no' become 0.0. Any other value is left untouched.
label_codes = {'ckd': 1.0, 'ckd\t': 1.0, 'notckd': 0.0, 'no': 0.0}
df['class'] = df['class'].replace(to_replace=label_codes)

In [None]:
# Impute missing numeric values with the column mean (the median would also work).

# 'rc', 'wc' and 'pcv' are handled as text here, so pull the first number out
# of each entry. The pattern keeps an optional decimal part: a bare (\d+)
# would truncate a value such as '5.2' to 5.0. Casting to str first lets this
# cell be re-run after the columns have already been converted to float
# (missing values become text without digits and therefore stay NaN).
for col in ['rc', 'wc', 'pcv']:
    df[col] = (
        df[col].astype(str)
               .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
               .astype(float)
    )

numeric_columns = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc',
                   'sod', 'pot', 'hemo', 'rc', 'wc', 'pcv']

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form operates on an intermediate object and is
# not guaranteed to update df itself.
for col in numeric_columns:
    df[col] = df[col].fillna(df[col].mean())

# Drop every column that still contains at least one missing value. Only the
# columns listed above were imputed, so any other column with gaps is removed.
# NOTE(review): the means are computed on the full table before the
# train/test split, so test rows influence the imputed values.
df = df.dropna(axis=1)

In [None]:
# Hold out 20% of the rows for testing (the dataset is small, so most of it
# is kept for training).

# Fixed seed so that Restart & Run All reproduces the same split and metrics.
RANDOM_STATE = 42

# Features are every column except the last; the target is the last column
# (presumably 'class' -- confirm the column order of the cleaned frame).
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# stratify=y keeps the class proportions the same in both partitions, which
# matters when there are few samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

In [None]:
# Candidate classifiers and the hyper-parameter grid to search for each one.
# Grid keys use the 'clf__' prefix so they address the pipeline step named
# 'clf' that wraps the estimator during the grid search.
model = {}

# multi_class is not passed: 'auto' was the default, and the argument is
# deprecated in recent scikit-learn releases.
logistic_regression = LogisticRegression(solver='liblinear')
model['logistic_regression'] = {
    'model': logistic_regression,
    'params': {'clf__C': [1, 5, 10]},
}

# random_state pins the tree's internal tie-breaking so results are repeatable.
model['decision_tree'] = {
    'model': DecisionTreeClassifier(criterion='entropy', random_state=42),
    'params': {'clf__min_samples_split': [4, 5, 6, 7, 8, 9, 10]},
}

# 100 smoothing values spaced logarithmically from 1 down to 1e-9.
model['naive_bayes'] = {
    'model': GaussianNB(),
    'params': {'clf__var_smoothing': np.logspace(0, -9, num=100)},
}

In [None]:
# Tune every candidate model inside a scale -> PCA -> classifier pipeline and
# keep the fitted search together with its best score and parameters.
CV_FOLDS = 5

# PCA cannot keep more components than min(n_samples, n_features) of the data
# it is fitted on. Requesting more makes the fit fail for that grid point, and
# with warnings silenced the failure goes unnoticed. Clamp the candidate list
# to what the training data supports (the fold size is approximate).
n_features = X_train.shape[1]
fold_train_size = len(X_train) * (CV_FOLDS - 1) // CV_FOLDS
max_components = min(n_features, fold_train_size)
pca_components = sorted({min(n, max_components) for n in [5, 15, 30, 45, 64]})

models = {}

for k, spec in model.items():

    # Search the PCA dimensionality together with the classifier's own grid.
    spec['params']['pca__n_components'] = pca_components

    pipeline = Pipeline([
        ('standard', StandardScaler()),
        ('pca', PCA()),
        ('clf', spec['model']),
    ])

    # n_jobs=-1 uses all available cores instead of a hard-coded worker count.
    clf = GridSearchCV(pipeline, spec['params'], cv=CV_FOLDS, n_jobs=-1,
                       return_train_score=False, verbose=3)
    clf.fit(X_train, y_train)

    models[k] = {
        'model': clf,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
    }

In [None]:
# Print a per-class precision/recall/F1 report on the held-out test set for
# every tuned model.
for k, result in models.items():
    print("-" * 70)
    print('Model : ', k)
    print("-" * 70)
    print()
    preds = result['model'].predict(X_test)
    # classification_report expects (y_true, y_pred); passing them the other
    # way round swaps precision and recall in the printed table.
    print(classification_report(y_test, preds))
    print()

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of df,
# drawn on a large square canvas so the cell annotations stay readable.
fig, ax = plt.subplots(figsize=(15, 15))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='Blues', ax=ax)

In [None]:
# Scatter-plot matrix of every pair of columns in df (one panel per pair).
sns.pairplot(data=df)