# 1. Initial Setup and Overview

In [None]:
import numpy as np    #import numpy for number array handling
import pandas as pd   #import pandas for data processing (CSV file I/O)

#Visualization of data
import matplotlib.pyplot as plt
import seaborn as sns;sns.set(font_scale=1.2)

#Render matplotlib charts inline, directly below the notebook cell that produces them
%matplotlib inline

# Load the chronic kidney disease dataset (CSV) into a DataFrame for the ML workflow below
kidneyData = pd.read_csv('../input/chronic-kidney-disease/new_model.csv')

# Print the column dtypes / non-null counts, then show the number of nulls per column.
# isnull().sum() is the cell's trailing expression, so the notebook renders its result.
kidneyData.info()
kidneyData.isnull().sum()

In [None]:
# Display the number of distinct values in each column
# (pandas excludes NaN from this count by default)
kidneyData.nunique()

In [None]:
# Display summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns of the dataset
kidneyData.describe()

In [None]:
# Preview the first rows of the dataset (head() returns five rows by default)
kidneyData.head()

# 2. Visualization of the Dataset

In [None]:
#Visualization of the distribution of Outcome(target) column values
# countplot tallies the rows of each class itself, so every bar is guaranteed to
# carry its own label. Pairing unique() with value_counts() positionally is not
# safe: unique() returns labels in order of appearance while value_counts()
# orders them by frequency, which can swap the bars.
fig1, axes = plt.subplots(1,1)
sns.countplot(x='Class', data=kidneyData, ax=axes)
axes.set_title('Outcome Distribution')
axes.set_xlabel('Class')
axes.set_ylabel('Count')
plt.show()

#Visualization of all the Features against the Outcome(target)
# One tuple per row of the 5x3 subplot grid; each feature gets its own strip plot
# with the target class on the x axis.
featureRows = [
    ('Bp', 'Sg', 'Al'),
    ('Su', 'Rbc', 'Bu'),
    ('Sc', 'Sod', 'Pot'),
    ('Hemo', 'Wbcc', 'Rbcc'),
    ('Htn',),
]

fig2, axes = plt.subplots(5,3,figsize=(20,20))
fig2.suptitle('Dataset Feature Distributions',fontsize=25)
for rowIndex, rowFeatures in enumerate(featureRows):
    # Iterate the column index directly: DataFrame.iteritems() was removed in
    # pandas 2.0. Following the dataframe's own column order keeps each feature
    # in the same grid position as before, and silently skips absent columns.
    plottedColumns = [column for column in kidneyData.columns if column in rowFeatures]
    for colIndex, column in enumerate(plottedColumns):
        sns.stripplot(x="Class", y=column, data=kidneyData, ax=axes[rowIndex, colIndex])
    # Hide grid cells that received no plot (e.g. the two spare cells after 'Htn')
    # so the figure does not show empty frames.
    for unusedAxis in axes[rowIndex, len(plottedColumns):]:
        unusedAxis.set_visible(False)

plt.subplots_adjust(wspace= 0.2,hspace=0.3)
plt.show()

# Heatmap of the pairwise correlation between the feature columns
# (the target column 'Class' is left out of the matrix)
featureCorrelation = kidneyData.drop(columns=['Class']).corr()

fig3, axes = plt.subplots(figsize=(8, 6))
axes.set_title('Correlation Between Features of Dataset')
sns.heatmap(featureCorrelation, cmap="YlGnBu", ax=axes)
plt.show()

# 3. Extraction of Features and Labels from Dataset

In [None]:
# Split the dataframe into the label vector and the feature matrix.
# y: the target column 'Class'
y = kidneyData['Class']
# X: every remaining column (drop() returns a new frame, kidneyData is untouched)
X = kidneyData.drop('Class', axis='columns')

# 4. Data Preprocessing

In [None]:
# Clean the Hypertension (Htn) flag: any value that is neither 0 nor 1 is reset to 0
htnIsValid = (X['Htn'] == 0) | (X['Htn'] == 1)
X.loc[~htnIsValid, 'Htn'] = 0

# Processing of categorical data and normalization (standardization) of numerical data
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

# Columns that get one-hot encoded vs. columns that get standardized;
# anything not listed is passed through unchanged.
categoricalColumns = ['Rbc','Htn']
numericColumns = ['Bp','Sg','Al','Su','Bu','Sc','Sod','Pot','Hemo','Wbcc','Rbcc']

columnTransformer = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(categories='auto'), categoricalColumns),
        ('Standardizer', StandardScaler(), numericColumns),
    ],
    remainder='passthrough',
)
# NOTE(review): the transformer is fitted on the complete feature matrix here, i.e.
# before the train/test split performed later in the notebook, so the scaling
# statistics include the test rows.
X = columnTransformer.fit_transform(X)


# 5. Splitting Dataset into Train and Test Set

In [None]:
# Separate the CKD dataset into a training part (80%) and a testing part (20%);
# the fixed random_state keeps the split reproducible between runs
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=200, test_size=0.2)

# Report the shape of each of the four resulting pieces
for caption, subset in (
    ('Training Feature Size :- ', X_train),
    ('Training Label Size :- ', y_train),
    ('Testing Feature Size :- ', X_test),
    ('Testing Label Size :- ', y_test),
):
    print(caption, subset.shape)

# 6. Implement Logistic Regression Model

In [None]:
#Implementation of the Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression

# Create the classifier with the 'liblinear' solver and fit it on the training split.
# fit() is the cell's trailing expression, so the notebook displays what it returns.
logisticRegression = LogisticRegression(solver='liblinear')
logisticRegression.fit(X_train,y_train)


# 7. Prediction Using Trained Model

In [None]:
# Predict a class label for every row of the held-out test features.
# The bare name on the last line makes the notebook display the predictions.
kidneyPrediction = logisticRegression.predict(X_test)
kidneyPrediction

# 8. Testing Results

In [None]:
from sklearn import metrics

# Compute the accuracy of the ML model: the fraction of test samples whose
# predicted label equals the true label, printed as a percentage
accuracy = metrics.accuracy_score(y_test, kidneyPrediction)
print("Accuracy of the Model :-", accuracy * 100)


In [None]:
# Generate the Confusion Matrix of the predictions
# (rows are the actual classes, columns the predicted classes)
confusionMatrix = metrics.confusion_matrix(y_test, kidneyPrediction)

fig4, axes = plt.subplots(1,1,figsize=(5,4))
# fmt='d' prints the cell counts as plain integers; seaborn's default '.2g'
# format would show counts of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(confusionMatrix, annot=True, fmt='d', cmap='YlGn', ax=axes)
# Pin the y-limits to the full matrix height so the first and last rows are not
# cut in half on matplotlib versions affected by the heatmap clipping issue.
# Derived from the matrix shape instead of a hard-coded 2 so it also works
# when there are more than two classes.
axes.set_ylim([confusionMatrix.shape[0], 0])
axes.set_title("Confusion Matrix")
axes.set_ylabel("Actual Outcome")
axes.set_xlabel("Predicted Outcome")
plt.show()


In [None]:
# Generate the classification report for the predictions: a text table with
# precision, recall, F1-score and support for each class, then print it
classificationReport = metrics.classification_report(y_test, kidneyPrediction)
print(classificationReport)