In [None]:
# Importing the libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Read the kidney-disease table from a CSV file on a relative path.
kidney = pd.read_csv("../input/kidney_disease.csv")
# Preview the first rows as the cell output.
kidney.head()

In [None]:
# Encode the categorical string columns as numbers. Each entry pairs a group of
# columns with the value mapping applied to it; groups are processed in order.
_encodings = [
    (['htn','dm','cad','pe','ane'], {'yes':1,'no':0}),
    (['rbc','pc'], {'abnormal':1,'normal':0}),
    (['pcc','ba'], {'present':1,'notpresent':0}),
    (['appet'], {'good':1,'poor':0,'no':np.nan}),
]
for _cols, _mapping in _encodings:
    kidney[_cols] = kidney[_cols].replace(to_replace=_mapping)

# Target column: map the label spellings listed here to 1.0 / 0.0, then rename it to 'class'.
kidney['classification'] = kidney['classification'].replace(
    to_replace={'ckd':1.0,'ckd\t':1.0,'notckd':0.0,'no':0.0}
)
kidney.rename(columns={'classification':'class'},inplace=True)

# Per-column clean-up of stray tokens that the group mappings do not cover.
# NOTE(review): 'no' in 'appet' was already turned into NaN by the group mapping
# above, so the 'appet' entry below should match nothing — confirm which is intended.
_stray_token_fixes = (
    ('pe', dict(to_replace='good', value=0)),  # Not having pedal edema is good
    ('appet', dict(to_replace='no', value=0)),
    ('cad', dict(to_replace='\tno', value=0)),
    ('dm', dict(to_replace={'\tno':0,'\tyes':1,' yes':1, '':np.nan})),
)
for _col, _replace_kwargs in _stray_token_fixes:
    kidney[_col] = kidney[_col].replace(**_replace_kwargs)

# The row identifier is not a feature.
kidney.drop(columns='id', inplace=True)

In [None]:
# (rows, columns) of the frame after cleaning.
kidney.shape

In [None]:
# Summary of the frame as printed by DataFrame.info().
kidney.info()

In [None]:
# Pairwise column correlations, shown as a table.
kidney.corr()

In [None]:
# Draw the correlation matrix as an annotated heatmap on an explicit 10x7 axes.
fig, ax = plt.subplots(figsize=(10, 7))
cor_mat = kidney.corr()
sns.heatmap(data=cor_mat, annot=True, fmt=".3f", linewidths=0.5, ax=ax)

In [None]:
# Missing values per column: number of rows minus the non-null count.
len(kidney)-kidney.count()

In [None]:
# Boolean mask marking every missing cell (displayed for the whole frame).
kidney.isnull()

In [None]:
# Number of missing values in each column.
kidney.isna().sum()

In [None]:
# Displays a copy of the frame with missing values replaced by 1.
# NOTE(review): the result is not assigned, so `kidney` itself still contains
# its missing values after this cell — confirm that a preview is all that is wanted.
kidney.fillna(1)

In [None]:
# Feature matrix and target vector as plain NumPy arrays.
# The target is the 'class' label, the same column every model in this notebook
# predicts; all remaining columns are features.
X = kidney.drop('class',axis=1).values
y = kidney['class'].values

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import minmax_scale

# Seed NumPy's global generator so the sample below is reproducible.
np.random.seed(0)

# Sample 1000 points from an exponential distribution, then min-max scale
# them between 0 and 1.
original_data = np.random.exponential(size=1000)
scaled_data = minmax_scale(original_data)

# Two panels side by side: the raw sample on the left, the scaled one on the right.
fig, ax = plt.subplots(nrows=1, ncols=2)
left_panel, right_panel = ax
sns.distplot(original_data, ax=left_panel, color='y')
left_panel.set_title("Original Data")
sns.distplot(scaled_data, ax=right_panel)
right_panel.set_title("Scaled data")
plt.show()

In [None]:
# Keep only the rows that have no missing value in any column.
kidney2 = kidney.dropna()
# (rows, columns) remaining after the drop.
print(kidney2.shape)

In [None]:
# Largest 'bp' value within each 'class' group.
print(kidney2.groupby('class').bp.max())

In [None]:
# Stratified 67/33 split of the complete-case rows.
# Features are every column except the last one; the label is 'class'.
features_all = kidney2.iloc[:, :-1]
labels_all = kidney2['class']
X_train, X_test, y_train, y_test = train_test_split(
    features_all,
    labels_all,
    test_size=0.33,
    random_state=44,
    stratify=labels_all,
)

In [None]:
# (rows, columns) of the training features.
print(X_train.shape)

In [None]:
# Number of training rows per label value.
y_train.value_counts()

In [None]:
# Random forest with a fixed seed, fitted on the training split.
rfc = RandomForestClassifier(random_state = 22)
# Keep the value returned by fit() under its own name for the prediction cell.
rfc_fit = rfc.fit(X_train,y_train)

In [None]:
# Random-forest predictions for the test split.
rfc_pred = rfc_fit.predict(X_test)

In [None]:
# Confusion matrix of the random-forest predictions against the test labels.
print(confusion_matrix(y_test,rfc_pred))

In [None]:
# Classification report for the random-forest predictions.
print(classification_report(y_test,rfc_pred))

In [None]:
# Test accuracy of the random forest.
accuracy_score( y_test, rfc_pred)

In [None]:
# k-nearest-neighbours classifier that uses a single neighbour.
knn = KNeighborsClassifier(n_neighbors=1)

In [None]:
# Fit the k-NN classifier on the training split.
knn.fit(X_train,y_train)

In [None]:
# k-NN predictions for the test split.
pred = knn.predict(X_test)

In [None]:
# Confusion matrix of the k-NN predictions against the test labels.
print(confusion_matrix(y_test,pred))

In [None]:
# Classification report for the k-NN predictions.
print(classification_report(y_test,pred))

In [None]:
# Test accuracy of the k-NN classifier.
accuracy_score( y_test,pred)

In [None]:
# Logistic regression with default settings, fitted on the training split.
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)

In [None]:
# Logistic-regression predictions for the test split.
predictions = logmodel.predict(X_test)

In [None]:
# Classification report for the logistic-regression predictions.
print(classification_report(y_test,predictions))

In [None]:
# Confusion matrix of the logistic-regression predictions against the test labels.
print(confusion_matrix(y_test,predictions))

In [None]:
# Test accuracy of the logistic regression.
accuracy_score( y_test, predictions)

In [None]:
# Sweep the random forest's max_depth from 3 to 29 and record the accuracy on
# the training and test splits for every depth.
dp_list = np.arange(3, 30)
train = []
test = []

for depth in dp_list:
    forest = RandomForestClassifier(max_depth=depth, n_jobs = -1, random_state=42)
    forest.fit(X_train, y_train)
    prediction = forest.predict(X_test)
    trainpred = forest.predict(X_train)
    train_acc = accuracy_score(y_train, trainpred)
    test_acc = accuracy_score(y_test, prediction)
    train.append(train_acc)
    test.append(test_acc)

# The swept hyper-parameter is max_depth, so the column is labelled accordingly.
performance = pd.DataFrame({'max_depth':dp_list,'Train_acc':train,'Test_acc':test})

fig, ax = plt.subplots()
x_axis = dp_list
ax.plot(x_axis, performance['Train_acc'], label='Train')
ax.plot(x_axis, performance['Test_acc'], label='Test')
ax.legend()
# Label both axes so the figure can be read on its own.
ax.set_xlabel('max_depth')
ax.set_ylabel('accuracy')
ax.set_title('Forest accuracy vs depth')
plt.show()

In [None]:
# Candidate neighbour counts 1..8, plus one uninitialised accuracy slot per
# candidate for the training and the test split (filled by the loop that follows).
neighbors = np.arange(1, 9)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))

In [None]:
# Fit one k-NN model per candidate k and store its accuracy on both splits.
# After the loop `knn` is the model for the last candidate.
for i in range(len(neighbors)):
    k = neighbors[i]

    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    # Accuracy on the data the model was fitted on, then on the held-out data.
    train_accuracy[i] = knn.score(X_train, y_train)
    test_accuracy[i] = knn.score(X_test, y_test)

In [None]:
# Accuracy against number of neighbours; the test curve is drawn first, then the training curve.
plt.title('KNN varying number of neighbors')
for curve, curve_label in (
    (test_accuracy, 'Testing Accuracy'),
    (train_accuracy, 'Training Accuracy'),
):
    plt.plot(neighbors, curve, label=curve_label)
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# 70/30 split of the NumPy arrays X and y with a fixed seed.
(features_train, features_test,
 labels_train, labels_test) = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Shape of the training feature array.
features_train.shape

In [None]:
# Shape of the test feature array.
features_test.shape

In [None]:
# Shape of the training label array.
labels_train.shape

In [None]:
# Shape of the test label array.
labels_test.shape

In [None]:
# Fit a 7-neighbour k-NN classifier and report its accuracy on the test split.
knn = KNeighborsClassifier(n_neighbors = 7)

#Fit the classifier to the training data
knn.fit(X_train, y_train)

# Score once and reuse the value for both printed lines.
knn_test_accuracy = knn.score(X_test, y_test)

#Print the accuracy
print(knn_test_accuracy)
# Convert to percent before rounding: rounding the fraction and then multiplying
# by 100 can print float artefacts such as 56.99999999999999.
print("Accuracy = {:.1f}%".format(round(knn_test_accuracy * 100)))

In [None]:
# Re-create the candidate neighbour counts 1..8 and the two uninitialised
# accuracy arrays (one slot per candidate) for another sweep.
neighbors = np.arange(1, 9)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))

In [None]:
# One k-NN fit per candidate k; accuracies are written into the two arrays by
# position. `knn` is rebound on every pass, so it ends as the last candidate's model.
for i, k in zip(range(len(neighbors)), neighbors):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    train_accuracy[i] = knn.score(X_train, y_train)  # training split
    test_accuracy[i] = knn.score(X_test, y_test)     # test split

In [None]:
# Plot test accuracy, then training accuracy, against the number of neighbours.
plt.title('KNN varying number of neighbors')
accuracy_curves = [
    ('Testing Accuracy', test_accuracy),
    ('Training Accuracy', train_accuracy),
]
for curve_label, curve_values in accuracy_curves:
    plt.plot(neighbors, curve_values, label=curve_label)
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Predict the test labels with whatever model `knn` currently refers to.
# NOTE(review): the sweep loop rebinds `knn` on every iteration, so when the
# notebook runs top to bottom this is the loop's last model rather than the
# 7-neighbour one fitted earlier — confirm which model is meant to be evaluated.
y_pred = knn.predict(X_test)

# Generate the confusion matrix and classification report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Column 1 of predict_proba is used as the score for the ROC curve.
y_pred_prob = knn.predict_proba(X_test)[:, 1]

# False positive rate, true positive rate and thresholds of the ROC curve.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Dashed diagonal first as a reference line, then the classifier's curve.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

In [None]:
import numpy as np
from sklearn.metrics import roc_auc_score

In [None]:
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Display kidney2 cast to float. The builtin `float` is used because the
# `np.float` alias (which was just another name for it) no longer exists in
# current NumPy releases. The result is not assigned, so kidney2 is unchanged.
kidney2.astype(float)

In [None]:
# AUC on the test split, computed from the scores stored in y_pred_prob.
test_auc = roc_auc_score(y_test, y_pred_prob)
print("AUC: {}".format(test_auc))

# Compute cross-validated AUC scores: cv_auc (5 folds over the training split)
cv_auc = cross_val_score(estimator=knn, X=X_train, y=y_train, scoring='roc_auc', cv=5)

# Print list of AUC scores
print("AUC scores computed using 5-fold cross-validation: {}".format(cv_auc))

In [None]:
def _fill_with_most_frequent(column):
    """Return `column` with missing entries replaced by its most frequent value."""
    most_frequent = column.value_counts().index[0]
    return column.fillna(most_frequent)


# NOTE(review): kidney2 is the result of kidney.dropna(), so there should be no
# missing values left to fill here — confirm whether `kidney` was the intended input.
kidney22 = kidney2.apply(_fill_with_most_frequent)

In [None]:
# Missing-value count per column after the fill.
kidney22.isnull().sum()

In [None]:
# AUC on the test split, using the y_test and y_pred_prob values currently in memory.
# NOTE(review): this cell has the same code as the earlier AUC cell and nothing it
# reads has been reassigned in between — confirm it is still needed.
print("AUC: {}".format(roc_auc_score(y_test, y_pred_prob)))

# Compute cross-validated AUC scores: cv_auc
cv_auc = cross_val_score(knn, X_train, y_train, cv=5, scoring='roc_auc')

# Print list of AUC scores
print("AUC scores computed using 5-fold cross-validation: {}".format(cv_auc))


In [None]:
# Stratified 67/33 split of kidney22; this rebinds X_train / X_test / y_train / y_test.
# Features are every column except the last one; the label is 'class'.
filled_features = kidney22.iloc[:, :-1]
filled_labels = kidney22['class']
X_train, X_test, y_train, y_test = train_test_split(
    filled_features,
    filled_labels,
    test_size=0.33,
    random_state=44,
    stratify=filled_labels,
)

In [None]:
# Number of training rows per label value in the new split.
y_train.value_counts()

In [None]:
# Seeded random forest, fitted on the kidney22 training split.
rfc = RandomForestClassifier(random_state = 22)
rfc_fit = rfc.fit(X_train,y_train)

In [None]:
# Random-forest predictions for the test split.
rfc_pred = rfc_fit.predict(X_test)

In [None]:
# Confusion matrix of the random-forest predictions against the test labels.
print(confusion_matrix(y_test,rfc_pred))

In [None]:
# Classification report for the random-forest predictions.
print(classification_report(y_test,rfc_pred))

In [None]:
# Test accuracy of the random forest.
accuracy_score( y_test, rfc_pred)

In [None]:
# Logistic regression with default settings, fitted on the kidney22 training split.
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)

In [None]:
# Logistic-regression predictions for the test split.
predictions = logmodel.predict(X_test)

In [None]:
# Classification report for the logistic-regression predictions.
print(classification_report(y_test,predictions))

In [None]:
# Confusion matrix of the logistic-regression predictions against the test labels.
print(confusion_matrix(y_test,predictions))

In [None]:
# Test accuracy of the logistic regression: score its own predictions
# (`predictions`), not the random forest's `rfc_pred`.
accuracy_score( y_test, predictions)

In [None]:
# Decision tree with a fixed seed so the fitted tree (and the plot drawn from
# it) is the same on every run; 22 matches the seed used for the forests.
dtree=DecisionTreeClassifier(random_state=22)

In [None]:
# Fit the decision tree on the training split.
dtree.fit(X_train,y_train)

In [None]:
# Decision-tree predictions for the test split (rebinds `predictions`).
predictions=dtree.predict(X_test)

In [None]:
# Classification report for the decision-tree predictions.
print(classification_report(y_test,predictions))

In [None]:
# StringIO comes from the standard library: `sklearn.externals.six` is no longer
# shipped with current scikit-learn releases.
from io import StringIO
import os

from IPython.display import Image
from sklearn.tree import export_graphviz
import pydot

# Directory holding the Graphviz binaries. A raw string keeps the backslash
# literal without relying on '\P' being an unrecognised escape sequence.
GRAPHVIZ_BIN = r'C:\Program Files (x86)/Graphviz2.38/bin/'
# Append only once so re-running the cell does not keep growing PATH.
if GRAPHVIZ_BIN not in os.environ["PATH"].split(os.pathsep):
    os.environ["PATH"] += os.pathsep + GRAPHVIZ_BIN

# Feature names must match the columns the tree was fitted on, in the same
# order, so take them from the training frame itself.
features = list(X_train.columns)
features

In [None]:
# Write the fitted tree in Graphviz DOT format into an in-memory text buffer.
dot_data = StringIO()
export_graphviz(dtree, out_file = dot_data,feature_names = features,filled = True,rounded=True)

# Parse the DOT text with pydot and display the first graph as a PNG image.
graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph[0].create_png())

In [None]:
# Random forest with 100 trees. The seed is fixed (22, as for the other forests
# in this notebook) so the reported metrics are the same on every run.
rfc = RandomForestClassifier(n_estimators=100, random_state=22)
rfc.fit(X_train,y_train)

In [None]:
# Predictions of the 100-tree forest for the test split.
rfc_pred = rfc.predict(X_test)

In [None]:
# Confusion matrix of the forest predictions against the test labels.
print(confusion_matrix(y_test,rfc_pred))

In [None]:
# Classification report for the forest predictions.
print(classification_report(y_test,rfc_pred))

In [None]:
# Test accuracy of the 100-tree forest.
accuracy_score( y_test, rfc_pred)

In [None]:
# Single-neighbour k-NN classifier (rebinds `knn`).
knn = KNeighborsClassifier(n_neighbors=1)

In [None]:
# Fit the k-NN classifier on the training split.
knn.fit(X_train,y_train)

In [None]:
# k-NN predictions for the test split.
pred = knn.predict(X_test)

In [None]:
# Confusion matrix of the k-NN predictions against the test labels.
print(confusion_matrix(y_test,pred))

In [None]:
# Classification report for the k-NN predictions.
print(classification_report(y_test,pred))

In [None]:
# Test accuracy of the k-NN classifier.
accuracy_score( y_test,pred)

In [None]:
# Confusion-matrix heatmap and report for the k-NN model fitted just above.
# `pred` holds that model's predictions; `y_pred` belongs to an earlier model
# and an earlier split, so it is not used here.
print("confussion matrix")
knn_conf_matrix = confusion_matrix(y_test, pred)
# Reuse the matrix that was just computed instead of building it a second time.
sns.heatmap(knn_conf_matrix, annot= True, cmap='YlGnBu',fmt = 'g')
print(classification_report(y_test,pred))

In [None]:
from sklearn.svm import SVC


# RBF-kernel support vector classifier with probability estimates enabled,
# fitted on the training split.
svm_model = SVC(kernel='rbf', gamma=0.1, C=1.0, probability=True)
svm_model.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate values for each SVC hyper-parameter explored by the grid search.
params = {
    "C": (0.1, 0.5, 1, 2, 5, 10, 20),
    "gamma": (0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1),
    "kernel": ('linear', 'poly', 'rbf'),
}

# Base estimator handed to the search (rebinds `svm_model`).
svm_model = SVC(C=1.0, kernel='rbf', gamma=0.1)

# 5-fold cross-validated search scored on accuracy, using all available cores.
svm_grid = GridSearchCV(
    estimator=svm_model,
    param_grid=params,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
svm_grid.fit(X_train, y_train)

In [None]:
 # Show the estimator the grid search selected as best.
 # NOTE(review): this line carries a stray leading space; it would be an
 # IndentationError if the cell were ever run as a plain Python script.
 svm_grid.best_estimator_