**Attributes 1 to 4 refer to the data of the patient:**
1. X (Patient ID/No.)
2. Category (diagnosis) (values: '0=Blood Donor', '0s=suspect Blood Donor', '1=Hepatitis', '2=Fibrosis', '3=Cirrhosis')
3. Age (in years)
4. Sex (f,m)
<br>**Attributes 5 to 14 refer to laboratory data:**
5. ALB (Albumin Blood Test)
6. ALP (Alkaline phosphatase)
7. ALT (Alanine Transaminase)
8. AST (Aspartate Transaminase)
9. BIL (Bilirubin)
10. CHE (Acetylcholinesterase)
11. CHOL (Cholesterol)
12. CREA (Creatinine)
13. GGT (Gamma-Glutamyl Transferase)
14. PROT (Proteins)

The target attribute for classification is Category (2): blood donors vs. Hepatitis C patients, including the stage of disease progression ('just' Hepatitis C, Fibrosis, Cirrhosis).

In [None]:
#Modules for EDA
import numpy as np 
import pandas as pd 
import seaborn as sns
from matplotlib import pyplot as plt
plt.style.use('fivethirtyeight')
from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

from tensorflow import keras
%matplotlib inline

In [None]:
# Location of the raw CSV, relative to the notebook's working directory
DATA_PATH = '../input/hepatitis-c-dataset/HepatitisCdata.csv'

df = pd.read_csv(DATA_PATH)
# (rows, columns) of the freshly loaded frame
df.shape

In [None]:
# Print each column's dtype and non-null count to spot text columns and gaps
df.info()

# **Missing Values**

In [None]:
# Number of missing entries in each column
df.isnull().sum()

In [None]:
# Preview the first rows of the raw frame
df.head()

In [None]:
# Drop the unnamed index column that came in from the CSV.
# Rebinding (instead of inplace=True) together with errors='ignore' lets this
# cell be re-run without raising KeyError once the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Summary statistics of the columns before missing values are filled
df.describe()

In [None]:
# Fill missing lab values with the column mean.
# Category and Sex are still text columns at this point, so the mean must be
# restricted to numeric columns: pandas >= 2.0 raises TypeError otherwise.
df = df.fillna(df.mean(numeric_only=True))
df.describe()

In [None]:
# Distribution of patient age
fig, ax = plt.subplots(figsize=(7,7))
sns.histplot(df['Age'], ax=ax)
ax.set_title('Age Histogram')
plt.show()

In [None]:
# Age distribution, coloured by diagnosis category
fig, ax = plt.subplots(figsize=(7,7))
age_by_category = df[['Age','Category']]
sns.histplot(data=age_by_category, x='Age', hue='Category', ax=ax)
ax.set_title('Age Histogram with Category')
plt.show()

In [None]:
# Pairwise scatter plots of the lab values, coloured by diagnosis category.
# sns.pairplot builds its own figure, so no plt.figure() call is made here
# (it would only leave an empty extra figure in the output).
sns.pairplot(data=df.drop(['Age','Sex'],axis=1),hue='Category')
# Title the whole grid; plt.title would only label the last sub-axes.
# y > 1 lifts the title above the top row of panels.
plt.suptitle('Pairplot', y=1.02)
plt.show()

# **Category Pie Chart**

In [None]:
# Share of each diagnosis category in the raw data
category_counts = df['Category'].value_counts()

fig, ax = plt.subplots(figsize=(7,7))
category_counts.plot(kind='pie', autopct='%.2f', ax=ax)
ax.set_title('Category Pie Chart')
plt.show()

# Absolute counts behind the percentages above
category_counts

# **Imbalanced Dataset :(**

# **Encoding Category values as integers**

In [None]:
# Distinct category labels, in order of first appearance
df['Category'].unique()

In [None]:
# Assign each category label an integer code in order of first appearance.
# replace_dict maps label -> code; num_dict maps str(code) -> label so the
# original names can be recovered for plots.
category_labels = df['Category'].unique()
replace_dict = {label: code for code, label in enumerate(category_labels)}
num_dict = {str(code): label for code, label in enumerate(category_labels)}

In [None]:
# Encode the category labels as their integer codes.
# The result is assigned back to the column: calling an inplace method on
# df['Category'] is chained assignment, which does not update df when pandas
# Copy-on-Write is active. map() also returns a plain integer column without
# relying on replace()'s deprecated object-dtype downcasting.
# replace_dict is built from every unique label, so no value is left unmapped
# on a first run; re-running this cell on already-encoded values yields NaN.
df['Category'] = df['Category'].map(replace_dict)
df.Category.unique()

# **Replacing Sex Values with 0 and 1**

In [None]:
# Count of each Sex value before encoding
df['Sex'].value_counts()

In [None]:
# Encode Sex as 1 (m) / 0 (f).
# Assigned back to the column rather than using an inplace method on
# df['Sex'], which is chained assignment and leaves df unchanged when pandas
# Copy-on-Write is active. Any value other than 'm' or 'f' becomes NaN.
df['Sex'] = df['Sex'].map({'m': 1, 'f': 0})
df['Sex'].value_counts()

# **Oversampling the data using SMOTE Method**

In [None]:
# Separate the feature matrix from the classification target
x = df.drop(columns='Category')
y = df['Category']

In [None]:
# Oversample the minority classes with synthetic samples.
# A fixed random_state makes the resampled data reproducible across runs.
# NOTE(review): resampling happens before the train/test split further down,
# so synthetic rows can be derived from samples that later land in the test
# set -- consider resampling only the training portion.
smote = SMOTE(random_state=42)
x,y = smote.fit_resample(x,y)

In [None]:
# Class shares after oversampling
class_counts = y.value_counts()

fig, ax = plt.subplots(figsize=(7,7))
class_counts.plot(kind='pie', autopct='%.2f', ax=ax)
ax.set_title('Category Pie Chart')
plt.show()
class_counts

# **Feature Scaling**

In [None]:
# Every feature except the binary Sex column gets rescaled
cols_to_scale = x.columns.drop('Sex')
cols_to_scale

In [None]:
# Learn per-column min/max and rescale the selected columns.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down -- confirm whether fitting on the training rows only is wanted.
scale = MinMaxScaler()
scale.fit(x[cols_to_scale])
scalled = scale.transform(x[cols_to_scale])

In [None]:
# Write each scaled column back into x; scalled's columns follow the order of
# cols_to_scale, so its transpose lines up one row per column name.
for col, scaled_values in zip(cols_to_scale, scalled.T):
    x[col] = scaled_values

In [None]:
# Preview the feature matrix after the scaled columns were written back
x.head()

# **Train Test Split**

In [None]:
# Hold out 30% of the rows for testing.
# random_state makes the split reproducible; stratify=y keeps the class
# proportions the same in the train and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

# **Model Building and Training**

In [None]:
# Derive the layer sizes from the data instead of hard-coding 12 and 5, so the
# network stays consistent if a feature or category is added or removed.
n_features = x_train.shape[1]
n_classes = len(num_dict)  # one output unit per encoded category

# One hidden ReLU layer followed by a softmax over the categories.
# An explicit Input layer replaces the deprecated `input_shape` layer argument.
model = keras.Sequential([
    keras.Input(shape=(n_features,)),
    keras.layers.Dense(12,activation='relu'),
    keras.layers.Dense(n_classes,activation='softmax')
])

# sparse_categorical_crossentropy takes integer class labels directly,
# which is how y is encoded.
model.compile(
    optimizer='adam',
    metrics=['accuracy'],
    loss='sparse_categorical_crossentropy'
)

In [None]:
# Train on the training split for 200 passes over the data
model.fit(x=x_train, y=y_train, epochs=200)

In [None]:
# Loss and accuracy (the metric set in compile) on the held-out test split
model.evaluate(x_test,y_test)

# **Predictions**

In [None]:
def predict(model,x):
    """Return the predicted class index for every sample in ``x``.

    Parameters
    ----------
    model : object with a ``predict(x)`` method
        Expected to return one row of per-class scores per sample, i.e. a
        2-D array-like of shape (n_samples, n_classes).
    x : array-like
        Samples passed straight through to ``model.predict``.

    Returns
    -------
    numpy.ndarray
        1-D integer array holding, for each sample, the index of its
        highest-scoring class. Empty input yields an empty integer array.
    """
    pred = np.asarray(model.predict(x))
    # Vectorized argmax across the class axis replaces a per-row Python loop
    return np.argmax(pred, axis=1)

def plot_actual_vs_predicted(y_true,y_pred,title):
    """Show a confusion-matrix heatmap and print a classification report.

    Parameters
    ----------
    y_true : array-like of int
        Actual encoded class labels.
    y_pred : array-like of int
        Predicted encoded class labels.
    title : str
        Title drawn above the heatmap.

    Reads the module-level ``num_dict`` (str(code) -> category name) for the
    class codes and their display names.
    """
    # Fix the row/column order to every known class code. Without `labels`,
    # a class missing from both y_true and y_pred would shrink the matrix and
    # put the tick labels against the wrong rows/columns.
    class_ids = [int(code) for code in num_dict]
    class_names = list(num_dict.values())
    cm = confusion_matrix(y_true,y_pred,labels=class_ids)
    plt.figure(figsize=(7,7))
    sns.heatmap(cm,annot=True,fmt='g',
                xticklabels=class_names,yticklabels=class_names)
    # confusion_matrix puts true labels on rows and predictions on columns
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(title)
    plt.show()
    print("Classification Report")
    print(classification_report(y_true,y_pred))

# **Test Data Predictions**

In [None]:
# Predicted classes for the held-out rows, compared against the actual labels
y_test_pred = predict(model=model, x=x_test)
plot_actual_vs_predicted(
    y_true=y_test,
    y_pred=y_test_pred,
    title="Test Data Predictions",
)

# **Train Data Predictions**

In [None]:
# Same comparison on the rows the model was trained on
y_train_pred = predict(model=model, x=x_train)
plot_actual_vs_predicted(
    y_true=y_train,
    y_pred=y_train_pred,
    title="Train Data Predictions",
)