# importing libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder #import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set()

# Reading csv

In [None]:
# Load the raw kidney-disease table; every cleaning cell below mutates `df`.
df=pd.read_csv("../input/kidney-disease-dataset/kidney_disease.csv")

# understanding data and its features

In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df.describe()

In [None]:
# First five rows, to eyeball the raw values.
df.head()

In [None]:
# Number of missing values per column; drives the per-column decisions below.
df.isnull().sum()

In [None]:
# Column dtypes as inferred by read_csv.
df.dtypes

# cleaning data

# Age column 

In [None]:
# Only 9 of the 400 rows (2.25% of the data) have no age, so those rows are
# removed rather than imputed.
age_missing = df['age'].isnull()
indexNames = df.index[age_missing]
df.drop(index=indexNames, inplace=True)
df

# bp column

In [None]:
# bp is handled like age: the rows (not the column) with a missing bp are dropped.
bp_missing = df['bp'].isnull()
indexNames = df.index[bp_missing]
df.drop(index=indexNames, inplace=True)
df.isnull().sum()

# sg column

In [None]:
# sg is missing in 47 rows (more than 11% of the data), too many to drop.
# The column has a very small standard deviation, so the gaps are filled with
# the midpoint of its recorded mean (1.017408) and median (1.02).
newvalue=(1.017408+1.02)/2
# Assign the filled column back to the frame. `df["sg"].fillna(..., inplace=True)`
# is a chained in-place call: it may act on a temporary Series and does not
# write through to `df` when pandas Copy-on-Write is active.
df["sg"] = df["sg"].fillna(newvalue)
df.isnull().sum()

# al column

In [None]:
# al is missing in 41 rows, so the rows are kept and the gaps are imputed.
# Recorded statistics: min=0.0, median=0.0, mean=1.016949, max=5.0, std around 1.
# Fill value = average of the range midpoint ((max + min) / 2) and the mean.
newvalue=(5+0)/2
newvalue=(newvalue+1.016949)/2
# Assign the result back: a chained `df["al"].fillna(..., inplace=True)` does
# not write through to `df` when pandas Copy-on-Write is active.
df["al"] = df["al"].fillna(newvalue)
df.isnull().sum()

# su column


In [None]:
df["su"].mode()

In [None]:
(df["su"]==0).sum()

In [None]:
# as most frequent value is 0 equal to 276 out of 356
#so replace nan with 0
df["su"].fillna(0, inplace=True)
df.isnull().sum()

# rbc column

In [None]:
#missing value=142
df["rbc"].unique()

In [None]:
#rbc has two values normal, abnormal
(df["rbc"]=='normal').sum()

In [None]:
(df["rbc"]=='abnormal').sum()

In [None]:
#normal has 194 and abnormal has 43 values
#so we replace nan with normal
df["rbc"].fillna('normal', inplace=True)

In [None]:
df["rbc"].unique()

# pc column

In [None]:
print(df["pc"].unique())
(df["pc"]=='normal').sum()

In [None]:
#normal has 251 and abnormal has 72 values so replace nan with normal
df["pc"].fillna('normal', inplace=True)

# pcc, ba, bu, bgr and sc columns

In [None]:
# as pcc and ba has only 3 nan values so we delete these rows
#same with all above described names
indexNames = df[ (df['pcc'].isnull())].index
df.drop(indexNames , inplace=True)
indexNames = df[ (df['ba'].isnull())].index
df.drop(indexNames , inplace=True)
indexNames = df[ (df['bu'].isnull())].index
df.drop(indexNames , inplace=True)
indexNames = df[ (df['bgr'].isnull())].index
df.drop(indexNames , inplace=True)
indexNames = df[ (df['sc'].isnull())].index
df.drop(indexNames , inplace=True)

In [None]:
df.isnull().sum()

# sod column

In [None]:
# sod has mean 137, median (50%) 138 and mode 135,
# so missing values are replaced with the median, 138.
# Assign the result back: a chained `df["sod"].fillna(..., inplace=True)` does
# not write through to `df` when pandas Copy-on-Write is active.
df["sod"] = df["sod"].fillna(138)
df.isnull().sum()

# pot column

In [None]:
df

In [None]:
#max=47, min 4.6, 50% 3.8, mean 3.37
#we will find mode
#mode 5(28 times)
#replace with 5 nan
df["pot"].fillna(5, inplace=True)
df.isnull().sum()

# hemo

In [None]:
# Missing hemo values are replaced with the column mean, 12.53.
# The fill value must be that mean (not 5, which is the pot mode), and the
# result is assigned back because a chained `fillna(..., inplace=True)` does
# not write through to `df` when pandas Copy-on-Write is active.
df["hemo"] = df["hemo"].fillna(12.53)
df.isnull().sum()

# pcv ,wc,rc

In [None]:
df["pcv"].fillna(44, inplace=True)#mode 44
df['wc'].fillna(6700, inplace=True)#mode 6700
df['rc'].fillna(5.2, inplace=True)#mode 5.2

In [None]:
df.isnull().sum()

# htn dm cad appet pe  ane 

In [None]:
# Drop every row that still has a missing value in any column (the section
# heading names htn, dm, cad, appet, pe and ane as the columns concerned).
df.dropna(inplace=True)

In [None]:
# Per-column missing-value counts after the drop.
df.isnull().sum()

In [None]:
df


In [None]:
df.columns

# clean data to csv

In [None]:
df.reset_index(drop=True, inplace=True) 

In [None]:
df.drop("id",axis=1,inplace=True)

In [None]:
df.index.names = ['']

In [None]:
df["pcv"].unique()


In [None]:
df["pcv"].replace('?',44) #most frequent value

In [None]:
df.to_csv("missingremoved_kidney1.csv")

# Data wrangling

# categorical variables

In [None]:

# Load the cleaned table that the wrangling steps below operate on.
# NOTE(review): this reads "missingremoved_kidney.csv" from ../input/kidney,
# not the "missingremoved_kidney1.csv" written by df.to_csv earlier in this
# notebook -- confirm which file is intended.
df1=pd.read_csv("../input/kidney/missingremoved_kidney.csv")

# A few values (2 to 3) were corrected by hand directly in that CSV.

# Mapping the text to 1/0 and cleaning the dataset 


In [None]:
def catconversion(df1):
    """Encode the categorical text columns of ``df1`` as numbers, in place.

    The frame must contain the columns htn, dm, cad, pe, ane, rbc, pc, pcc,
    ba and appet.  Values that match none of the mappings are left as they
    are.

    Mappings:
        htn, dm, cad, pe, ane : 'yes' -> 1, 'no' -> 0
        dm  (padded spellings): '\\tyes', ' yes' -> 1, '\\tno' -> 0, '' -> NaN
        cad (padded spelling) : '\\tno' -> 0
        pe                    : 'good' -> 0
        rbc, pc               : 'abnormal' -> 1, 'normal' -> 0
        pcc, ba               : 'present' -> 1, 'notpresent' -> 0
        appet                 : 'good' -> 1, 'poor' -> 0, 'no' -> NaN

    Returns:
        None.  ``df1`` itself is modified.

    Raises:
        KeyError: if one of the expected columns is missing.
    """
    yes_no = {'yes': 1, 'no': 0}

    df1[['htn', 'ane']] = df1[['htn', 'ane']].replace(to_replace=yes_no)

    # dm also occurs with tab/space padding.  Handling those spellings here
    # (not only in module-level code) keeps the function correct when it is
    # reused on a fresh frame.
    df1['dm'] = df1['dm'].replace(
        to_replace={**yes_no, '\tno': 0, '\tyes': 1, ' yes': 1, '': np.nan})

    df1['cad'] = df1['cad'].replace(to_replace={**yes_no, '\tno': 0})

    # Not having pedal edema is good, so a stray 'good' counts as 0.
    df1['pe'] = df1['pe'].replace(to_replace={**yes_no, 'good': 0})

    df1[['rbc', 'pc']] = df1[['rbc', 'pc']].replace(
        to_replace={'abnormal': 1, 'normal': 0})
    df1[['pcc', 'ba']] = df1[['pcc', 'ba']].replace(
        to_replace={'present': 1, 'notpresent': 0})

    # 'no' is not a valid appetite value and is marked missing.  A second
    # replace of 'no' with 0 after this mapping could never match, so it is
    # not repeated here.
    df1['appet'] = df1['appet'].replace(
        to_replace={'good': 1, 'poor': 0, 'no': np.nan})
   
# Normalise the tab/space-padded dm spellings and blank entries.
dm_fixes = {'\tno':0,'\tyes':1,' yes':1, '':np.nan}
df1['dm'] = df1['dm'].replace(to_replace=dm_fixes)

# Encode the target as 1.0 (ckd) / 0.0 (not ckd), then shorten its name.
label_codes = {'ckd':1.0,'ckd\t':1.0,'notckd':0.0,'no':0.0}
df1['classification'] = df1['classification'].replace(to_replace=label_codes)
df1.rename(columns={'classification':'class'},inplace=True)

# Encode the remaining categorical columns in place.
catconversion(df1)


    

In [None]:
# First rows after the categorical encoding.
df1.head()

# other variables(scale)

In [None]:
# Summary statistics, used to judge the range of each column before scaling.
df1.describe()

In [None]:

# Distribution of age before scaling.
df1["age"].hist()

In [None]:
# The histogram shows extreme ages at both the low and the high end;
# age is rescaled to [0, 1] with min-max scaling.
age_low = df1["age"].min()
age_span = df1["age"].max() - age_low
df1["age"] = (df1["age"] - age_low) / age_span
df1["age"].describe()


In [None]:
plt.plot(df1["bp"])

In [None]:
df1["bp"]=(df1["bp"]-df1["bp"].min())/(df1["bp"].max()-df1["bp"].min())
df1["bp"].describe()

In [None]:
plt.plot(df1["bp"])

In [None]:
plt.plot(df1["sg"])

In [None]:
df1["sg"]=(df1["sg"]-df1["sg"].min())/(df1["sg"].max()-df1["sg"].min())
df1["sg"].describe()

In [None]:
plt.plot(df1["sg"])

In [None]:
plt.plot(df1["al"])

In [None]:
df["al"].hist()

In [None]:
df1["al"]=(df1["al"]-df1["al"].min())/(df1["al"].max())
df1["al"].describe()

In [None]:
df1["al"].hist()


In [None]:
pd.set_option('display.max_columns', None)
df1.head()

In [None]:
df1["su"]=(df1["su"]-df1["su"].min())/(df1["su"].max())
df1["su"].describe()
df1.describe()

In [None]:
df1["bgr"].hist()

In [None]:
df1["bgr"]=(df1["bgr"]-df1["bgr"].min())/(df1["bgr"].max()-df1["bgr"].min())
df1["bgr"].describe()

In [None]:
df1["bu"]=(df1["bu"]-df1["bu"].min())/(df1["bu"].max()-df1["bu"].min())
df1["bu"].describe()

In [None]:
df1["sc"]=(df1["sc"]-df1["sc"].min())/(df1["sc"].max()-df1["sc"].min())
df1["sc"].describe()

In [None]:
df1["sod"]=(df1["sod"]-df1["sod"].min())/(df1["sod"].max()-df1["sod"].min())
df1["sod"].describe()

In [None]:
df1["pot"]=(df1["pot"]-df1["pot"].min())/(df1["pot"].max()-df1["pot"].min())
df1["hemo"]=(df1["hemo"]-df1["hemo"].min())/(df1["hemo"].max()-df1["hemo"].min())
df1["wc"]=(df1["wc"]-df1["wc"].min())/(df1["wc"].max()-df1["wc"].min())
df1["pcv"]=(df1["pcv"]-df1["pcv"].min())/(df1["pcv"].max()-df1["pcv"].min())
df1["rc"]=(df1["rc"]-df1["rc"].min())/(df1["rc"].max()-df1["rc"].min())

df1.head(5)

In [None]:
# Column maxima after scaling.
df1.max()

In [None]:
# Persist the encoded and scaled table.
df1.to_csv("wranglingdone_kidney1.csv")

In [None]:
# pcv as it stands in `df`, the frame cleaned in the first half of the notebook.
df["pcv"]


In [None]:
# sc after scaling, from `df1`.
df1["sc"]

In [None]:
# Histogram of sc from `df` (the frame cleaned in the first half of the notebook).
df["sc"].hist()
# Reference table whose column minima and maxima noncatcanv uses for scaling.
# NOTE(review): this notebook only writes "missingremoved_kidney1.csv";
# confirm that "missingremoved_kidney.csv" exists in the working directory.
df3=pd.read_csv("missingremoved_kidney.csv")
def noncatcanv(dfnew, reference=None):
    """Min-max scale the numeric columns of ``dfnew`` in place.

    Each of the columns pot, hemo, wc, pcv, rc, sod, sc, bu, al, sg, bp, su,
    age and bgr is transformed to ``(value - min) / (max - min)``, where min
    and max are taken from the same column of ``reference``.  al and su use
    the same ``max - min`` divisor as every other column.

    Args:
        dfnew: frame holding the columns listed above; it is modified.
        reference: frame that supplies the per-column minimum and maximum.
            Defaults to the module-level ``df3``.

    Returns:
        None.  ``dfnew`` itself is modified.

    Raises:
        KeyError: if a listed column is missing from either frame.
    """
    if reference is None:
        # Keep the original behaviour: scale against the global table.
        reference = df3

    numeric_columns = (
        "pot", "hemo", "wc", "pcv", "rc", "sod", "sc",
        "bu", "al", "sg", "bp", "su", "age", "bgr",
    )
    for column in numeric_columns:
        low = reference[column].min()
        high = reference[column].max()
        dfnew[column] = (dfnew[column] - low) / (high - low)


# Knn algorithm applying

In [None]:
len(df1.columns)
df1

In [None]:
#split data into x and y
X = df1.iloc[:, 1:-1].values
y = df1.iloc[:, 25].values
y=y.astype('int')
# train and test data set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
#knn
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=2)
classifier.fit(X,y)

In [None]:
# Predict the held-out rows and print the confusion matrix and the
# per-class precision / recall / f1 report.
y_pred = classifier.predict(X_test)
#classification report
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# (commented-out, incomplete hand-typed sample row kept from the original)
#abc=["0.440476","0.076923","1.0000","0.000000",0.0	0	0	0	0	0.202991	0.111682	0.006289	0.627119	0.042697	0.673469	1.000000	0.214876	0.559322	0	0	0	1	0	0	0]
# Show the raw predicted labels for the test rows.
y_pred = classifier.predict(X_test)
y_pred

In [None]:
# Two hand-built samples of 24 feature values each.  They are identical
# except for the tenth value (3.2 in abc, 0.2072... in cde; presumably bgr --
# verify against the column order of df1).
abc=[[0.40476190476190477, 0.23076923076923078, 0.7500000000000056,
        0.0, 0, 0, 0, 0, 0, 3.2, 0.03465982028241335,
        0.0062893081761006275, 0.7796610169491526, 0.05393258426966293,
        0.12925170068027209, 0.7777777777777778, 0.1859504132231405,
        0.5254237288135593, 0, 0, 0, 1, 0, 0]]
cde=[[0.40476190476190477, 0.23076923076923078, 0.7500000000000056,
        0.0, 0.0, 0, 0, 0, 0, 0.20726495726495728, 0.03465982028241335,
        0.0062893081761006275, 0.7796610169491526, 0.05393258426966293,
        0.12925170068027209, 0.7777777777777778, 0.1859504132231405,
        0.5254237288135593, 0, 0, 0, 1, 0, 0]]
# Print the predicted class for each sample.
print(classifier.predict(abc))
print(classifier.predict(cde))
def result(dfnew):
    """Return a human-readable diagnosis for every row of ``dfnew``.

    The rows are encoded with ``catconversion``, scaled with ``noncatcanv``
    and classified with the module-level ``classifier``.

    Args:
        dfnew: frame of raw (unencoded, unscaled) feature rows.

    Returns:
        A list with one message per input row, in row order:
        "bad news you have ckd" when the predicted label is 1, otherwise
        "no ckd".
    """
    # Work on a private copy: both helpers modify their argument in place,
    # and the caller's frame (often an .iloc slice) must not be encoded or
    # scaled a second time by a later call.
    features = dfnew.copy()
    catconversion(features)
    noncatcanv(features)

    predictions = classifier.predict(features)
    # Test each predicted label directly; the labels are class values,
    # not positions, so they must not be used to index the prediction array.
    return [
        "bad news you have ckd" if label == 1 else "no ckd"
        for label in predictions
    ]
# Reload the cleaned (unscaled) table to use as raw input rows.
dfnew=pd.read_csv("missingremoved_kidney.csv")

# Classify rows 67-69, leaving out the first and the last column.
result(dfnew.iloc[67:70,1:-1])



# error prediction

In [None]:
dfnew

In [None]:
# Mean misclassification rate on the test split, one entry per K.
error = []

# Calculating error for K values from 1 to 39 (range(1, 40) excludes 40)
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    # Fraction of test rows whose prediction differs from the true label.
    error.append(np.mean(pred_i != y_test))

In [None]:
# Plot the mean test error against K (same K values as the loop above).
k_values = range(1, 40)
plt.figure(figsize=(12, 6))
plt.plot(
    k_values, error,
    color='red', linestyle='dashed',
    marker='o', markerfacecolor='blue', markersize=10,
)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')