In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix,classification_report,roc_curve,roc_auc_score
from sklearn.ensemble import AdaBoostClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC

!pip install xgboost
from xgboost import XGBClassifier

plt.rcParams['figure.figsize']=(15,10)
plt.rcParams['figure.dpi']=300
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load The DataSet
df=pd.read_csv("project-data.csv",delimiter=";")
df

## **Exploratory Data Analysis**

In [None]:
df.describe()

In [None]:
df.describe(include=object)

In [None]:
df.info()

In [None]:
df.dtypes

In [None]:
df.columns

In [None]:
df.rename(columns={"protein   ":"protein"},inplace=True)

In [None]:
# The protein column holds numeric values but was read with the object dtype,
# so it has to be converted from the object dtype to float.
df["protein"]

In [None]:
df["protein"].unique()

In [None]:
# Convert the protein column from object datatype to float datatype

df['protein'] = pd.to_numeric(df['protein'], errors='coerce')

In [None]:
df["protein"].unique()

In [None]:
df.dtypes

In [None]:
df.describe()

In [None]:
# to find the duplicate values in the dataset
df.duplicated().sum()
# there are no duplicate values

In [None]:
# to impute the missing values
df.isnull().sum()

In [None]:
sns.heatmap(df.isnull(),cmap="viridis")
plt.tight_layout()

In the dataset, the columns albumin, alkaline_phosphatase, alanine_aminotransferase, cholesterol and protein contain missing values.

Before replacing the missing values, check these columns for outliers.

Since the columns are numerical:


*   if outliers are present, replace the missing values with the median
*   otherwise, replace the missing values with the mean





In [None]:
sns.boxplot(df,palette="rainbow")
plt.tight_layout()

In [None]:
df.hist()
plt.tight_layout()

In [None]:

# Percentage of missing values per column, labelled with the column name
# (isna().mean() is the fraction of missing rows; the previous loop printed
# unlabelled numbers, so the values could not be matched to their columns).
df.isna().mean().mul(100)

In [None]:
df["albumin"].median()

In [None]:
# Assign the filled Series back to the frame: fillna(inplace=True) on df["albumin"] is
# chained assignment, which is deprecated and does not modify df under pandas Copy-on-Write.
df["albumin"]=df["albumin"].fillna(df["albumin"].median())

In [None]:
df["alkaline_phosphatase"].median()


In [None]:
# Assign the filled Series back to the frame: fillna(inplace=True) on a selected column is
# chained assignment, which is deprecated and does not modify df under pandas Copy-on-Write.
df["alkaline_phosphatase"]=df["alkaline_phosphatase"].fillna(df["alkaline_phosphatase"].median())

In [None]:
df["alanine_aminotransferase"].median()

In [None]:
# Assign the filled Series back to the frame: fillna(inplace=True) on a selected column is
# chained assignment, which is deprecated and does not modify df under pandas Copy-on-Write.
df["alanine_aminotransferase"]=df["alanine_aminotransferase"].fillna(df["alanine_aminotransferase"].median())

In [None]:
df["cholesterol"].median()

In [None]:
# Assign the filled Series back to the frame: fillna(inplace=True) on a selected column is
# chained assignment, which is deprecated and does not modify df under pandas Copy-on-Write.
df["cholesterol"]=df["cholesterol"].fillna(df["cholesterol"].median())

In [None]:
df["protein"].median()

In [None]:
# Assign the filled Series back to the frame: fillna(inplace=True) on a selected column is
# chained assignment, which is deprecated and does not modify df under pandas Copy-on-Write.
df["protein"]=df["protein"].fillna(df["protein"].median())

In [None]:
df.isnull().sum()

There are 2 categorical columns in the dataset (sex and category); convert them to numerical values.

In [None]:
df["sex"].value_counts()

In [None]:
df['sex'] = df['sex'].map({'f': 0, 'm': 1})

In [None]:
df["category"].value_counts()

In [None]:
le=LabelEncoder()
df["category"]=le.fit_transform(df["category"])

In [None]:
df

In [None]:
sns.boxplot(df,palette="rainbow")
plt.tight_layout()

In [None]:
# Outlier treatment
# Capping: replacing outlier values with a boundary value is called capping.
# In capping, every outlier value is replaced by the upper extreme or the lower extreme.
# Outlier detection: user-defined function to calculate the upper extreme and the lower extreme.
def outlier_detection(data,colname,whisker=1.5):
  """Return the IQR-based outlier fences and the quartiles of one column.

  Parameters
  ----------
  data : object indexable by column name (e.g. a DataFrame) whose columns
      provide ``.quantile``.
  colname : name of the column to analyse.
  whisker : multiple of the inter-quartile range added above the third
      quartile / subtracted below the first quartile to place the fences.
      The default of 1.5 reproduces the previous hard-coded behaviour.

  Returns
  -------
  tuple ``(lower_extreame, upper_extreame, q1, q3)`` where ``q1`` and ``q3``
  are the 0.25 and 0.75 quantiles of the column.
  """
  column=data[colname]
  q1=column.quantile(0.25)
  q3=column.quantile(0.75)  # third quartile (previously stored under the misleading name q2)

  iqr=q3-q1
  lower_extreame=q1-(whisker*iqr)
  upper_extreame=q3+(whisker*iqr)
  return lower_extreame,upper_extreame,q1,q3


In [None]:
outlier_detection(df,"category")

In [None]:
outlier_detection(df,"age")

In [None]:
outlier_detection(df,"albumin")

In [None]:
outlier_detection(df,"alkaline_phosphatase")

In [None]:
outlier_detection(df,"alanine_aminotransferase")

In [None]:
outlier_detection(df,"aspartate_aminotransferase")

In [None]:
outlier_detection(df,"bilirubin")

In [None]:
outlier_detection(df,"cholinesterase")

In [None]:
outlier_detection(df,"cholesterol")

In [None]:
outlier_detection(df,"creatinina")

In [None]:
outlier_detection(df,"gamma_glutamyl_transferase ")

In [None]:
outlier_detection(df,"protein")

In [None]:
# Cap each column at fixed lower/upper bounds (presumably the fences printed by the
# outlier_detection calls above - verify against those outputs if the data changes).
# Series.clip replaces the repeated `df.loc[mask, col] = bound` writes: assigning a
# fractional bound (e.g. 76.5) into an integer column through .loc is a deprecated
# incompatible-dtype assignment in pandas 2.x, whereas clip upcasts the column as needed.
# NOTE(review): "category" is the label-encoded target, not a feature. Capping it at 3
# relabels every row whose encoded class is above 3 as class 3 - confirm this merge of
# classes is intended before relying on the per-class metrics below.
cap_bounds={
    # column: (lower bound or None, upper bound)
    "category":(None,3),
    "age":(None,76.5),
    "albumin":(29.199,54.8),
    "alkaline_phosphatase":(13.425,118.824),
    "alanine_aminotransferase":(None,58.0249),
    "aspartate_aminotransferase":(None,49.849),
    "bilirubin":(None,20.049),
    "cholinesterase":(2.9525,13.5724),
    "cholesterol":(2.4675,8.2075),
    "creatinina":(35.5,119.5),
    "gamma_glutamyl_transferase ":(None,76.950),  # trailing space is part of the column name
    "protein":(60.149,84.550),
}
for col,(lower,upper) in cap_bounds.items():
  df[col]=df[col].clip(lower=lower,upper=upper)

In [None]:
df

In [None]:
df["category"].value_counts()

In [None]:
sns.countplot(x='category', data=df,palette="viridis")
plt.title('Liver Disease Categories Distribution')
plt.xlabel('Diagnosis Category')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# Distribution (histogram + KDE) of every numeric feature on a 3x4 grid.
# sns.distplot is deprecated; sns.histplot(..., kde=True, stat="density") is the
# closest documented replacement. One loop replaces eleven copy-pasted cells.
dist_cols=[
    "age","albumin","alkaline_phosphatase","alanine_aminotransferase",
    "aspartate_aminotransferase","bilirubin","cholinesterase","cholesterol",
    "creatinina","gamma_glutamyl_transferase ","protein",  # trailing space is part of that column's name
]
for pos,col in enumerate(dist_cols,start=1):
  plt.subplot(3,4,pos)
  sns.histplot(df[col],kde=True,stat="density")
# a single layout pass after all panels exist
plt.tight_layout()




In [None]:
df.describe()

In [None]:
sns.pairplot(df,hue="category",palette="rainbow")
plt.tight_layout()

In [None]:
df.hist(color="yellow",edgecolor="black")
plt.tight_layout()


In [None]:
sns.boxplot(df,palette="rainbow")
plt.tight_layout()

In [None]:
# Grouped bar chart of row counts per sex, split by category.
pd.crosstab(df["sex"],df["category"]).plot(kind="bar")
plt.title("Gender Differncing based on category")
plt.xlabel("SEX")
plt.ylabel("COUNT")
# tight_layout runs last so the title and axis labels are taken into account
plt.tight_layout()

In [None]:
category_counts = df['category'].value_counts()
plt.pie(category_counts, labels=category_counts.index, autopct='%1.1f%%')
plt.title('Distribution of Categories')
plt.tight_layout()
plt.show()

In [None]:
# Scatter of age against albumin, coloured by category.
plt.figure(figsize=(10,6),facecolor='teal',edgecolor="red",)
sns.scatterplot(x="age",y="albumin",data=df,hue="category",palette="coolwarm")
plt.title("AGE VS ALBUMIN")
plt.xlabel("AGE")
plt.ylabel("ALBUMIN")
# tight_layout runs last so the title and axis labels are taken into account
plt.tight_layout()

In [None]:
plt.figure(figsize=(10,6),facecolor="blue",frameon=True)
plt.violinplot(df)

In [None]:
sns.swarmplot(x="sex",y="cholesterol",data=df,size=6,hue="category")

In [None]:
df.corr()

In [None]:
sns.heatmap(df.corr(),annot=True,cmap="RdYlGn")
plt.tight_layout()

In [None]:
x=df.drop("category",axis=1)
y=df["category"]

In [None]:
# standardization

sc=StandardScaler()
x1=sc.fit_transform(x)

In [None]:
x=pd.DataFrame(x1,columns=x.columns)
x

In [None]:
y # target column

## **MODEL BUILDING**

In [None]:
# Split the dataset into training (67%) and testing (33%) data.
# The split is taken on the unscaled features and the scaler is then fitted on the
# training rows only: a StandardScaler fitted on the full dataset (as in the earlier
# standardization cell) lets test-set means and variances leak into training.
x_train,x_test,y_train,y_test=train_test_split(df.drop("category",axis=1),y,test_size=0.33,random_state=42)
sc=StandardScaler().fit(x_train)
# keep the original row index so x_* stay aligned with y_train / y_test
x_train=pd.DataFrame(sc.transform(x_train),columns=x_train.columns,index=x_train.index)
x_test=pd.DataFrame(sc.transform(x_test),columns=x_test.columns,index=x_test.index)

In [None]:
x_train

In [None]:
x_test

In [None]:
y_train

In [None]:
y_test

### **1.Logistic Regression**

In [None]:
model1=LogisticRegression()
model1.fit(x_train,y_train)

In [None]:
y_predict1=model1.predict(x_test)
y_predict1

In [None]:
df1=pd.DataFrame({"Actual":y_test,"predicted":y_predict1})
df1

In [None]:
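# confusion matrix for the model: rows are the actual classes, columns are the predicted classes
# (for a binary problem the layout is  TN FP / FN TP; here there is one row and one column per category)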

conf_matrix=confusion_matrix(y_test,y_predict1)
print(conf_matrix)

In [None]:
print("training accuracy:", model1.score(x_train,y_train))
print("testing accuracy:", model1.score(x_test,y_test))

In [None]:
class_report1=classification_report(y_test,y_predict1)
print(class_report1)

## **2.Decision Tree Classifier**

In [None]:
model2=DecisionTreeClassifier()
model2.fit(x_train,y_train)

In [None]:
x.columns

In [None]:
y.unique()

In [None]:
fn=x.columns
# plot_tree expects class_names in ascending order of the fitted classes (model2.classes_).
# The previous hard-coded ['3','2','1','0'] showed class 0 as '3', class 1 as '2', and so on.
cn=[str(label) for label in model2.classes_]
tree.plot_tree(model2,feature_names=fn.to_list(),class_names=cn,filled=True)
plt.tight_layout()

In [None]:
y_predict2=model2.predict(x_test)
y_predict2

In [None]:
y_test.values

In [None]:
print("train_accuracy:",model2.score(x_train,y_train))
print("test_accuracy:",model2.score(x_test,y_test))

In [None]:
# hyper parameter Tuning
params={
    'criterion':['gini','entropy'],
    'max_depth':[2,3,4,5,6,7,8,9],
    'min_samples_split':[2,3,4,5,6,7,8,9],

}

In [None]:
# Exhaustive search over `params` with GridSearchCV's default cross-validation.
# The searched tree is seeded so the search is reproducible and uses the same
# random_state as the final DecisionTreeClassifier built from its result.
grid=GridSearchCV(DecisionTreeClassifier(random_state=42),param_grid=params,verbose=1)
grid.fit(x_train,y_train)

In [None]:
grid.best_params_

In [None]:
model2=DecisionTreeClassifier(criterion="gini",max_depth=3,min_samples_split=8,random_state=42)
model2.fit(x_train,y_train)

In [None]:
fn=x.columns
# plot_tree expects class_names in ascending order of the fitted classes (model2.classes_).
# The previous hard-coded ['3','2','1','0'] showed class 0 as '3', class 1 as '2', and so on.
cn=[str(label) for label in model2.classes_]
tree.plot_tree(model2,feature_names=fn.to_list(),class_names=cn,filled=True)
plt.tight_layout()

In [None]:
y_predict2=model2.predict(x_test)
y_predict2

In [None]:
y_test.values

In [None]:
print("train_accuracy:",model2.score(x_train,y_train))
print("test_accuracy:",model2.score(x_test,y_test))

In [None]:
class_report2=classification_report(y_test,y_predict2)
print(class_report2)

## **3.Random Forest Classifier**

In [None]:
model3=RandomForestClassifier(random_state=2)
model3.fit(x_train,y_train)

In [None]:
y_predict3=model3.predict(x_test)
y_predict3

In [None]:
y_test.values

In [None]:
print('train accuracy:',model3.score(x_train,y_train))
print('test accuracy:',model3.score(x_test,y_test))

In [None]:
# Hyper Parameter Tuning
params={
    'n_estimators':[90,100,150],
    'max_depth':[3,4,5,6,7,9],
    'min_samples_split':[2,4,6,8],
    'criterion':['gini','entropy']

    }


In [None]:
grid=GridSearchCV(estimator=model3,param_grid=params,cv=5)
grid.fit(x_train,y_train)

In [None]:
grid.best_params_

In [None]:
# Refit the forest with fixed hyper-parameters (presumably copied from grid.best_params_ - verify).
# random_state=2 matches the seed of the forest that was passed to the grid search,
# so re-running this cell gives the same trees and the same scores.
model3=RandomForestClassifier(criterion="gini",max_depth=6,min_samples_split=4,n_estimators=90,random_state=2)
model3.fit(x_train,y_train)

In [None]:
fn=x.columns
# plot_tree expects class_names in ascending order of the fitted classes (model3.classes_).
# The previous hard-coded ['3','2','1','0'] showed class 0 as '3', class 1 as '2', and so on.
cn=[str(label) for label in model3.classes_]
# draw only the first tree of the forest
tree.plot_tree(model3.estimators_[0],feature_names=fn.to_list(),class_names=cn,filled=True)
plt.tight_layout()

In [None]:
# Wrap the random forest currently held in model3 as the base estimator of an
# AdaBoost ensemble (100 boosting rounds, seeded).
# model3 is rebound here, so the predictions, accuracies and classification report
# in the following cells describe the AdaBoost ensemble, not the plain random forest.
model3=AdaBoostClassifier(estimator=model3,n_estimators=100,random_state=42)
model3.fit(x_train,y_train)

In [None]:
y_predict3=model3.predict(x_test)
y_predict3

In [None]:
y_test.values

In [None]:
print('train accuracy:',model3.score(x_train,y_train))
print('test accuracy:',model3.score(x_test,y_test))

In [None]:
class_report3=classification_report(y_test,y_predict3)
print(class_report3)

## 4.Gradient Boosting Classifier

In [None]:
params={
    'n_estimators':[90,110,150], # randomly select no of trees/models
    'learning_rate':[0.5,0.6,0.7,0.9,1],
    'max_depth':[3,5,7,11]
}

In [None]:
# Exhaustive search over `params`; the estimator is seeded so repeated runs
# select the same best_params_.
grid = GridSearchCV(GradientBoostingClassifier(random_state=42),param_grid=params,verbose=1)
grid.fit(x_train,y_train)

In [None]:
grid.best_params_

In [None]:
# Final gradient-boosting model with fixed hyper-parameters (presumably taken from
# grid.best_params_ - verify). Seeded so the reported accuracies are reproducible.
model4=GradientBoostingClassifier(learning_rate=0.6,max_depth=3,n_estimators=110,random_state=42)
model4.fit(x_train,y_train)

In [None]:
y_predict4=model4.predict(x_test)
y_predict4

In [None]:
y_test.values

In [None]:
print('train accuracy:',model4.score(x_train,y_train))
print('test accuracy:',model4.score(x_test,y_test))

In [None]:
class_report4=classification_report(y_test,y_predict4)
print(class_report4)

## **5.XGBM Classifier**

In [None]:
# Search space for the XGBoost classifier.
# reg_lambda is the scikit-learn wrapper's name for XGBoost's L2 regularisation
# term ("lambda"); using it here keeps grid.best_params_ consistent with the
# reg_lambda keyword passed when model5 is built.
params={
    'n_estimators':[90,110,150],
    'learning_rate':[0.5,0.7,1],
    'reg_lambda':[1,2,3],
    'max_depth':[3,7,11]
}

In [None]:
grid=GridSearchCV(XGBClassifier(),param_grid=params,verbose=1)
grid.fit(x_train,y_train)

In [None]:
grid.best_params_

In [None]:
model5 = XGBClassifier(reg_lambda=1,learning_rate=0.5,max_depth=7,n_estimators=110)
model5.fit(x_train,y_train)

In [None]:
y_predict5=model5.predict(x_test)
y_predict5

In [None]:
y_test.values

In [None]:
print('train accuracy:',model5.score(x_train,y_train))
print('test accuracy:',model5.score(x_test,y_test))

In [None]:
class_report5=classification_report(y_test,y_predict5)
print(class_report5)

## **6.K Nearest Neighbors (KNN)**

In [None]:
params={'n_neighbors':[2,3,4,5,6,7,8,9,10],
       'weights':['distance','uniform']}

In [None]:
# Cross-validated grid search over the neighbour count and weighting scheme in `params`.
# GridSearchCV clones the estimator it is given, so `model` itself is left unfitted.
model=KNeighborsClassifier()
grid=GridSearchCV(estimator=model,param_grid=params)
grid.fit(x_train,y_train)

In [None]:
grid.best_params_

In [None]:
model6=KNeighborsClassifier(n_neighbors=3,weights="distance")
model6.fit(x_train,y_train)

In [None]:
y_predict6=model6.predict(x_test)
y_predict6

In [None]:
y_test.values


In [None]:
print('train accuracy:',model6.score(x_train,y_train))
print('test accuracy:',model6.score(x_test,y_test))

In [None]:
class_report6=classification_report(y_test,y_predict6)
print(class_report6)

In [None]:
# Search for an optimal value of k for KNN using cross-validated accuracy.
k_range=range(1,41)  # candidate neighbour counts: 1..40
# mean cross-validated accuracy for each k, in the same order as k_range
k_scores=[
    cross_val_score(KNeighborsClassifier(n_neighbors=k),x_train,y_train,scoring='accuracy').mean()
    for k in k_range
]
# Index the scores by k itself: with the default 0-based index the row labelled i
# actually held the score for k=i+1, so the "best" row read as one neighbour too few.
pd.Series(k_scores,index=k_range).sort_values(ascending=False)

In [None]:
import matplotlib.pyplot as plt
plt.plot(k_range,k_scores)
plt.xlabel("value of k for knn")
plt.ylabel('Accuracy')
plt.show()

## **7.SVM**

In [None]:
params={
    'C':[0.1,0.001,1,2,3,4,5],
    'kernel':['linear','poly','rbf'],
    'gamma':[0.1,0.001,1,2,5]
}

In [None]:
grid=GridSearchCV(SVC(),param_grid=params,verbose=1)
grid.fit(x_train,y_train)

In [None]:
grid.best_params_

In [None]:
model7=SVC(C=0.1,gamma=0.1,kernel='poly')
model7.fit(x_train,y_train)

In [None]:
y_predict7=model7.predict(x_test)
y_predict7

In [None]:
y_test.values

In [None]:
print("Training accuracy:",model7.score(x_train,y_train))
print("Testing accuracy:",model7.score(x_test,y_test))

In [None]:
class_report7=classification_report(y_test,y_predict7)
print(class_report7)

In [None]:
import pickle

In [None]:
# Persist model5; the context manager closes the file handle, which the bare
# open(...) passed straight to pickle.dump never did.
# NOTE(review): model5 was trained on StandardScaler-transformed features, but the
# fitted scaler `sc` is not saved here - confirm how new data will be scaled at load time.
with open("model5.pkl","wb") as model_file:
  pickle.dump(model5,model_file)

In [None]:
# Persist model1; the context manager closes the file handle, which the bare
# open(...) passed straight to pickle.dump never did.
with open("model1.pkl","wb") as model_file:
  pickle.dump(model1,model_file)