In [None]:
# importing necessary libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Read the Indian Liver Patient records from the CSV file.
csv_path = "../input/indian-liver-patient-records/indian_liver_patient.csv"
data = pd.read_csv(csv_path)

# `df` is the frame every later cell works on.
df = pd.DataFrame(data)

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded data.
df.shape

In [None]:
# Per-column summary (names, non-null counts, dtypes) of the loaded data.
df.info()

In [None]:
# Print the percentage of NaN values in every column and collect the
# names of the columns that contain at least one NaN in `null_col`.
null_col = []
for col_name in df.columns:
    nan_pct = df[col_name].isna().mean()*100
    print(col_name,"\t-\t", nan_pct)
    if nan_pct > 0:
        null_col.append(col_name)

> Since no column has a significant share of missing values, there is no need to drop any column here. Instead, fill the missing values of the column *Albumin_and_Globulin_Ratio* with the column mean (it is the only column with a few NaN values).


In [None]:
# Impute: replace the NaN values of each affected column with that column's mean.
for col_name in null_col:
    col_mean = df[col_name].mean()
    df[col_name] = df[col_name].fillna(col_mean)

# Print the per-column NaN percentage again to confirm nothing is missing any more.
for col_name in df.columns:
    nan_pct = df[col_name].isna().mean()*100
    print(col_name,"\t-\t", nan_pct)

In [None]:
# Checking for unbalanced dataset: count the rows in each 'Dataset' class.

fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(x='Dataset', data=df, ax=ax)
ax.set_title('Class distribution (original data)')
ax.set_ylabel('Number of records')

# Write the count above each bar. The label is anchored at the top-centre of
# the bar and nudged upwards by a few points. A fixed offset in data units
# (e.g. height + 50) can put the anchor above the y-axis limit, and matplotlib
# does not draw an annotation whose data-coordinate anchor is outside the axes.
for p in ax.patches:
    ax.annotate('{:.0f}'.format(p.get_height()),
                (p.get_x() + p.get_width() / 2, p.get_height()),
                xytext=(0, 3), textcoords='offset points',
                ha='center', va='bottom')


> This bar graph clearly shows how imbalanced the data is: less than 30% of the records are in class __2__. So, first, we have to balance the data in order to get more precise predictions.


> For that we use oversampling.


In [None]:
from imblearn.over_sampling import RandomOverSampler

# Balance the classes by random oversampling of the 'Dataset' target.
# random_state is fixed so that the resampled frame, and every plot and score
# derived from it, is identical after a kernel restart.
# NOTE(review): this resampling runs before the train/test split further down,
# so copies of the same record may end up in both the training and the test
# set; consider resampling only the training portion.
oversample = RandomOverSampler(random_state=42)
features = df.drop(['Dataset'], axis=1)
x, y = oversample.fit_resample(features, df['Dataset'])

# Rebuild a single frame: resampled features plus the resampled target.
new_df = pd.DataFrame(x, columns=features.columns)
new_df['Dataset'] = y

new_df.head()



In [None]:
# Count the rows in each 'Dataset' class after oversampling.
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(x='Dataset', data=new_df, ax=ax)
ax.set_title('Class distribution (after oversampling)')
ax.set_ylabel('Number of records')

# Write the count above each bar. The label is anchored at the top-centre of
# the bar and nudged upwards by a few points. A fixed offset in data units
# (e.g. height + 50) can put the anchor above the y-axis limit, and matplotlib
# does not draw an annotation whose data-coordinate anchor is outside the axes.
for p in ax.patches:
    ax.annotate('{:.0f}'.format(p.get_height()),
                (p.get_x() + p.get_width() / 2, p.get_height()),
                xytext=(0, 3), textcoords='offset points',
                ha='center', va='bottom')


> Here we can see that all the classes are balanced.

> Since only one column, *Gender*, holds string values, let's encode it as numerical values.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the string labels of the 'Gender' column with integer codes.
enc = LabelEncoder()
gender_as_text = new_df['Gender'].astype('str')
new_df['Gender'] = enc.fit_transform(gender_as_text)


In [None]:
# Preview the first rows after encoding 'Gender'.
new_df.head()

In [None]:
# (number of rows, number of columns) of the resampled data.
new_df.shape

In [None]:
# Per-column summary (names, non-null counts, dtypes) of the resampled data.
new_df.info()

> Because oversampling converted some of the columns to the *object* type, let's convert them back to numerical types.

In [None]:
# Cast every column that is still of dtype object to float (via str).
object_columns = new_df.select_dtypes(include=['object']).columns
for col_name in object_columns:
    as_text = new_df[col_name].astype(str)
    new_df[col_name] = as_text.astype(float)

> Let's further see how the attributes are related to each other using a correlation heatmap and a pairplot

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap.
fig, ax = plt.subplots(figsize=(15, 15))
cormap = new_df.corr()
sns.heatmap(data=cormap, annot=True, ax=ax)

In [None]:
# Scatter/density plot for every pair of columns, coloured by the
# 'Dataset' class; corner=True draws only the lower triangle.
sns.pairplot(
    data=new_df,
    hue='Dataset',
    corner=True,
)


## **KNN**

In [None]:
# Separate the target column ('Dataset') from the feature columns.
y = new_df['Dataset']
X = new_df.drop(columns=['Dataset'])

In [None]:
# Standardise every feature to zero mean and unit variance.
# (StandardScaler does not bound the values to a fixed range such as [-1, 1].)
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test rows influence the scaling statistics; consider fitting it
# on the training portion only.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, columns=X.columns)
X.head()

In [None]:
# Split the data into a training part (90%) and a test part (10%).

from sklearn.model_selection import train_test_split

# random_state pins the shuffle so the split, and every score computed from
# it, is the same on each run. stratify=y keeps the class proportions of y
# the same in both parts, which matters for a test set this small.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

In [None]:
# Baseline model: a k-nearest-neighbours classifier created with its
# default settings and fitted on the training part.

from sklearn.neighbors import KNeighborsClassifier as KNN

model = KNN().fit(X_train, y_train)
model

In [None]:
# Show the parameters of the baseline model.
model.get_params()

> Hyperparameter tuning


In [None]:
from sklearn.model_selection import GridSearchCV
# Candidate values for each hyper-parameter explored by the grid search.
n_neighbors = list(range(5, 86, 2))  # odd k from 5 to 85
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
weights = ['uniform', 'distance']

# Parameter name -> list of candidate values.
grid = dict(n_neighbors=n_neighbors, algorithm=algorithm, weights=weights)

In [None]:
# Try every combination in `grid` with 7-fold cross-validation on the
# training part.
new_model = KNN()
knn_grid = GridSearchCV(
    estimator=new_model,
    param_grid=grid,
    cv=7,
    verbose=0,
)
knn_grid.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the grid search.
knn_grid.best_params_

In [None]:
# Predict the test rows with the best estimator found by the grid search.
best_knn = knn_grid.best_estimator_
y_pred = best_knn.predict(X_test)

# Actual and predicted class side by side, for a quick visual check.
pred_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
pred_df.head()

In [None]:
from sklearn.metrics import confusion_matrix

# Fix the class order explicitly so the tick labels drawn below are guaranteed
# to match the rows and columns of the matrix.
class_labels = sorted(set(y_test) | set(y_pred))
mat = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(5,5))
# fmt='d' prints the cell counts as plain integers.
sns.heatmap(mat, annot=True, fmt='d',
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)
# confusion_matrix(y_true, y_pred) puts the true classes on the rows and the
# predicted classes on the columns; label the axes so the plot stands alone.
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('KNN confusion matrix (test set)')
plt.show()

In [None]:
from sklearn import metrics

# Measure the accuracy score on the test set.
# The arguments follow the (y_true, y_pred) order used by sklearn.metrics and
# by the confusion-matrix cell. Accuracy gives the same value either way, but
# keeping the order right avoids a wrong result if an asymmetric metric
# (precision, recall, ...) is substituted later.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy score of the predictions: {value:.2f} %".format(value=accuracy*100))
