In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

# EDA & Preprocessing

In [None]:
# Load the hepatitis C table from the Kaggle input directory and preview it.
data = pd.read_csv(
    filepath_or_buffer='../input/hepatitis-c-dataset/HepatitisCdata.csv'
)
data.head()

In [None]:
# Show the last rows of the frame.
data.tail()

In [None]:
# Summary statistics for the frame's columns.
data.describe()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Number of missing values per column.
data.isnull().sum()

In [None]:
# Drop the leftover 'Unnamed: 0' column. `errors='ignore'` keeps the cell
# re-runnable: executing it a second time is a no-op instead of a KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Binarise the target (any liver-disease stage -> 1, donors -> 0) and encode
# sex as m -> 1, f -> 0.
#
# The previous `data[col].loc[mask] = value` form is chained assignment: it
# writes through an intermediate Series, emits SettingWithCopyWarning and is a
# silent no-op under pandas Copy-on-Write. Building the new columns explicitly
# avoids that and lets us cast to int, so the columns no longer stay `object`.
category_codes = {
    "0=Blood Donor": 0,
    "0s=suspect Blood Donor": 0,
    "1=Hepatitis": 1,
    "2=Fibrosis": 1,
    "3=Cirrhosis": 1,
}
sex_codes = {'m': 1, 'f': 0}

# `.get(v, v)` leaves already-encoded values untouched (cell stays re-runnable);
# an unexpected label survives as a string and makes `astype(int)` fail loudly.
data = data.assign(
    Category=data['Category'].map(lambda v: category_codes.get(v, v)).astype(int),
    Sex=data['Sex'].map(lambda v: sex_codes.get(v, v)).astype(int),
)

In [None]:
# Preview the frame after the label encoding step.
data.head()

In [None]:
# Fill missing values with each column's median.
# `numeric_only=True` keeps the reduction well-defined if any non-numeric
# column is present; returning a new frame (no `inplace`) keeps the cell
# idempotent.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split further down, so test rows influence the imputation values.
data = data.fillna(data.median(numeric_only=True))

In [None]:
# Re-count missing values per column after the median fill.
data.isnull().sum()

In [None]:
# Histogram + KDE for each lab measurement on a 5x2 grid.
# One loop replaces ten copy-pasted calls; `axes.flat` walks the grid row by
# row, which reproduces the original panel order.
hist_columns = ["ALB", "ALP", "ALT", "AST", "BIL",
                "CHE", "CHOL", "CREA", "GGT", "PROT"]
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(18, 18))
for column, ax in zip(hist_columns, axes.flat):
    sns.histplot(data=data, x=column, kde=True, ax=ax)
plt.show()  # render the figure without echoing the last Axes repr

In [None]:
# Pie chart of the class balance of the binarised target.
sizes = data['Category'].value_counts(sort=True)
labels = sizes.index
colors = ["Red", "Blue"]

plt.figure(figsize=(7, 7))
plt.pie(
    sizes,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
)
plt.title('Category pie')
plt.show()

In [None]:
# Pairwise correlation matrix of the columns.
data.corr()

In [None]:
# Scatter-matrix of all column pairs, coloured by the target class, with KDE
# curves on the diagonal.
sns.pairplot(data, diag_kind="kde",hue="Category")

In [None]:
# One-hot encode `Sex`; drop_first=True drops the first level's indicator.
data = pd.get_dummies(
    data,
    columns=['Sex'],
    drop_first=True,
)
data.head()

In [None]:
# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(16, 8))
sns.heatmap(
    data=data.corr(),
    annot=True,
)

In [None]:
# Candidate feature scalers compared further down.
robust_sc, standard_sc, minmax_sc = (
    preprocessing.RobustScaler(),
    preprocessing.StandardScaler(),
    preprocessing.MinMaxScaler(),
)

In [None]:
# Feature matrix: every column except the target.
X = data.drop(columns=['Category'])
# Target vector. Cast to int explicitly: the column is built by overwriting
# string labels with 0/1 and can still carry `object` dtype, which
# scikit-learn classifiers reject as an unknown label type.
y = data["Category"].astype(int)

In [None]:
# Compare the three scalers by the weighted F1 of a fixed decision tree.
#
# Changes from the previous version of this cell:
# * `resultado` is created once, so it holds one score per scaler instead of
#   being reset on every iteration.
# * The score is computed on a held-out split. Scoring the same rows the tree
#   was fitted on lets a depth-25 tree memorise them, which makes the scalers
#   indistinguishable.
# * The bare `%time` (it timed an empty statement), the unused `scaler`
#   binding and the unused `rauc` tuple were removed.
resultado = []
X_fit, X_val, y_fit, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
for scaler in [robust_sc, standard_sc, minmax_sc]:
    # Fit the scaler on the fitting split only, then reuse it for validation.
    X_fit_scaled = scaler.fit_transform(X_fit)
    X_val_scaled = scaler.transform(X_val)
    tree = DecisionTreeClassifier(max_depth=25, random_state=42)
    tree.fit(X_fit_scaled, y_fit)
    y_pred = tree.predict(X_val_scaled)
    f1sc = f1_score(y_val, y_pred, average='weighted')
    resultado.append(f1sc)
    print("El escalado Utilizado--->", scaler)
    print("f1 segun el tipo de estrategia:", f1sc)
    print("----------------------------------------")

In [None]:
# Stratified hold-out split: 30% of the rows go to the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# Resampling chain applied to the training split: SMOTE, then random
# over-sampling, then random under-sampling.
# Every sampler is seeded so a Restart & Run All reproduces the same training
# set (they were previously unseeded).
# NOTE(review): with default sampling strategies SMOTE presumably already
# equalises the classes, leaving the next two steps with nothing to do —
# confirm whether explicit `sampling_strategy` ratios were intended.
over = SMOTE(random_state=42)
overs = RandomOverSampler(random_state=42)
under = RandomUnderSampler(random_state=42)
steps = [('o', over), ('os', overs), ('u', under)]
pipeline = Pipeline(steps=steps)

In [None]:
# Rebalance the training split only; X_test / y_test are left untouched.
X_train, y_train = pipeline.fit_resample(X_train, y_train)

In [None]:
# Fit the standard scaler on the resampled training data and apply the same
# fitted transform to the test data.
# NOTE(review): scaling happens after resampling, so SMOTE ran on unscaled
# features — confirm this ordering is intended.
X_train=standard_sc.fit_transform(X_train)
X_test=standard_sc.transform(X_test)

In [None]:
def confusion(y_test, y_test_pred, X):
    """Plot a labelled confusion-matrix heatmap for the binary target.

    Parameters
    ----------
    y_test : array-like
        True labels, encoded as 0 (non hepatitis) / 1 (hepatitis).
    y_test_pred : array-like
        Predicted labels, same encoding as ``y_test``.
    X : str
        Text used as the plot title (callers pass the model name).

    Returns
    -------
    None
        The figure is shown as a side effect.
    """
    names = ['Non Hepatitis', 'Hepatitis']
    # Pin the label order: without `labels`, the matrix shrinks to 1x1 when
    # only one class occurs in the inputs and the two tick labels below no
    # longer match the axes.
    cm = confusion_matrix(y_test, y_test_pred, labels=[0, 1])
    _, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(cm, annot=True, linewidth=.5, linecolor="r", fmt=".0f", ax=ax)
    ax.set_title(X, size=25)
    ax.set_xlabel("y_pred")
    ax.set_ylabel("y_true")
    ax.set_xticklabels(names)
    ax.set_yticklabels(names)
    plt.show()

# Machine Learning

In [None]:
# Baseline: random forest with default hyper-parameters and a fixed seed.
RF = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Mean accuracy and class predictions on the held-out test split.
score = RF.score(X_test, y_test)
pred = RF.predict(X_test)

In [None]:
# Display the baseline forest's score on the test split.
score

In [None]:
# Confusion matrix of the baseline forest's test predictions.
confusion(y_test,pred,"RF")

In [None]:
# Top-5 features of the baseline forest by impurity-based importance.
feature_names = data.drop('Category', axis=1).columns
feat_importances = pd.Series(RF.feature_importances_, index=feature_names)
feat_importances.nlargest(5).plot.barh()

## XGB

In [None]:
# XGBoost estimator and the hyper-parameter grid to search.
#
# Changes from the previous version of this cell:
# * Settings that are not being searched live on the estimator, not in the
#   grid as one-element lists.
# * The duplicated "updater" dict key is gone (the second entry silently
#   overwrote the first).
# * The GPU-only options (`gpu_id`, `predictor`, `gpu_hist`, `grow_gpu_hist`,
#   gradient-based sampling) are replaced by the CPU `hist` tree method. Those
#   options are deprecated or removed in recent XGBoost releases and fail on
#   hosts without a CUDA device. To train on a GPU with XGBoost >= 2.0, pass
#   `device="cuda"` here instead.
# * A fixed seed makes the search reproducible.
gbm = XGBClassifier(
    verbosity=1,
    tree_method="hist",
    random_state=42,
)
params_xgb = {
    "n_estimators": [500, 1000, 1500],
    "learning_rate": [0.1, 0.3, 0.6],
}

In [None]:
# Exhaustive 5-fold grid search over params_xgb, parallelised with n_jobs=-1.
# NOTE(review): X_train was resampled before this point, so CV folds presumably
# share synthetic/duplicated rows and best_score_ may be optimistic — confirm.
model_xgb = GridSearchCV(gbm,param_grid=params_xgb, cv=5,n_jobs=-1)
model_xgb.fit(X_train,y_train)

In [None]:
# Report the winning parameter combination and its mean CV score.
print(f"Best params: {model_xgb.best_params_}")
print(f"Best Score: {model_xgb.best_score_}\n")

In [None]:
# Full cross-validation results, sorted by rank_test_score.
scores = pd.DataFrame(model_xgb.cv_results_)
scores.sort_values(by="rank_test_score")

In [None]:
# Predictions of the fitted search object on both splits.
y_train_pred_xgb = model_xgb.predict(X_train)
y_test_pred_xgb = model_xgb.predict(X_test)

In [None]:
# Classification report for the test split.
print(classification_report(y_test, y_test_pred_xgb))

In [None]:
# Confusion matrix of the XGBoost test predictions.
confusion(y_test,y_test_pred_xgb,"XGB")

## MLP

In [None]:
# MLP estimator and the hyper-parameter grid to search.
#
# `learning_rate` (the schedule) is only used by the 'sgd' solver, so crossing
# it with 'lbfgs' and 'adam' tripled their fits without changing any model.
# The grid is split in two so the schedule is searched for 'sgd' only.
# `warm_start` was dropped from the grid: GridSearchCV fits a fresh clone for
# every candidate, so the flag had no effect.
clf = MLPClassifier(random_state=42)
mlp_shared_grid = {
    "hidden_layer_sizes": [64, 128, 256],
    "activation": ["identity", "logistic", "tanh", "relu"],
    "max_iter": [100, 200],
}
params_MLP = [
    {
        **mlp_shared_grid,
        "solver": ["sgd"],
        "learning_rate": ["constant", "invscaling", "adaptive"],
    },
    {
        **mlp_shared_grid,
        "solver": ["lbfgs", "adam"],
    },
]

In [None]:
# Exhaustive 3-fold grid search over params_MLP, parallelised with n_jobs=-1.
# NOTE(review): X_train was resampled before this point, so CV folds presumably
# share synthetic/duplicated rows and best_score_ may be optimistic — confirm.
model_MLP = GridSearchCV(clf,param_grid=params_MLP, cv=3,n_jobs=-1)
model_MLP.fit(X_train,y_train)


In [None]:
# Report the winning parameter combination and its mean CV score.
print(f"Best params: {model_MLP.best_params_}")
print(f"Best Score: {model_MLP.best_score_}\n")

In [None]:
# Full cross-validation results, sorted by rank_test_score.
scores = pd.DataFrame(model_MLP.cv_results_)
scores.sort_values(by="rank_test_score")

In [None]:
# Predictions of the fitted search object on both splits.
y_train_pred_MLP = model_MLP.predict(X_train)
y_test_pred_MLP = model_MLP.predict(X_test)

In [None]:
# Classification report for the test split.
print(classification_report(y_test, y_test_pred_MLP))

In [None]:
# Confusion matrix of the MLP test predictions.
confusion(y_test,y_test_pred_MLP,"MLP")

## Random Forest

In [None]:
# Random-forest estimator and the hyper-parameter grid to search.
#
# * 'auto' was removed from `max_features`: scikit-learn 1.3 rejects it, and
#   for classifiers it was an alias of 'sqrt', so it only duplicated fits.
# * `warm_start` was dropped from the grid: GridSearchCV fits a fresh clone for
#   every candidate, so the flag had no effect.
# NOTE(review): depths of 250/500/1000 are presumably all deeper than any tree
# grown on this data, which would make the three values equivalent — consider
# smaller depths or None.
clf = RandomForestClassifier(random_state=42)
params_RF = {
    "max_depth": [250, 500, 1000],
    "criterion": ["gini", "entropy"],
    "min_samples_split": [2, 4, 6],
    "min_samples_leaf": [1, 2, 3],
    "max_features": ["sqrt", "log2"],
    "class_weight": ["balanced", "balanced_subsample"],
}

In [None]:
# Exhaustive 3-fold grid search over params_RF, parallelised with n_jobs=-1.
# NOTE(review): X_train was resampled before this point, so CV folds presumably
# share synthetic/duplicated rows and best_score_ may be optimistic — confirm.
model_RF = GridSearchCV(clf,param_grid=params_RF, cv=3,n_jobs=-1)
model_RF.fit(X_train,y_train)

In [None]:
# Report the winning parameter combination and its mean CV score.
print(f"Best params: {model_RF.best_params_}")
print(f"Best Score: {model_RF.best_score_}\n")

In [None]:
# Full cross-validation results, sorted by rank_test_score.
scores = pd.DataFrame(model_RF.cv_results_)
scores.sort_values(by="rank_test_score")

In [None]:
# Predictions of the fitted search object on both splits.
y_train_pred_RF = model_RF.predict(X_train)
y_test_pred_RF = model_RF.predict(X_test)

In [None]:
# Classification report for the test split.
print(classification_report(y_test, y_test_pred_RF))

In [None]:
# Confusion matrix of the tuned random-forest test predictions.
confusion(y_test,y_test_pred_RF,"RF")