In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# List every file found under the Kaggle input directory so the dataset path is visible.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

### Importing all the libraries I will make use of:

In [None]:
from matplotlib import pyplot as plt
import seaborn as sn
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)

### A first check of the data is performed by simply looking at the DataFrame

In [None]:
# Load the Hepatitis C dataset, discard the column named 'Unnamed: 0',
# and keep only the rows without missing values.
df = (
    pd.read_csv('/kaggle/input/hepatitis-c-dataset/HepatitisCdata.csv')
    .drop(columns=['Unnamed: 0'])
    .dropna()
)
df

### Before moving on to the predictive models, I would like to answer a few simple questions:
* #### How many people are there per category and sex?
* #### What is the age distribution of the sample population?
* #### Does the disease affect more males or females in the data?

In [None]:
# Fraction of samples falling in each diagnosis category.
df['Category'].value_counts(normalize=True)

In [None]:
# Count the samples per sex once and read both values from the same result.
sex_counts = df['Sex'].value_counts()
num_male = sex_counts.loc['m']
num_female = sex_counts.loc['f']
print('Number of males: {}'.format(num_male))
print('Number of females: {}'.format(num_female))

### The largest part of my DataFrame is made up of healthy blood donors, and only roughly 10% are people with the disease. This might be an issue for the accuracy later on in the model fitting, since that 10% is split across all 3 possible stages of the disease. Males and females, however, are in a roughly 3-to-2 ratio.

In [None]:
# Group the samples by sex and diagnosis category so per-group sizes can be looked up.
df_grp = df.groupby(by=['Sex', 'Category'])

In [None]:
# Share of each sex that is NOT a confirmed healthy blood donor. Every category other
# than '0=Blood Donor' (suspect donors included) is counted as diseased.
# The denominator is the total number of people of that sex, so the printed value is a
# true percentage of the group rather than a diseased-to-healthy ratio.
healthy_female = len(df_grp.get_group(('f', '0=Blood Donor')))
healthy_male = len(df_grp.get_group(('m', '0=Blood Donor')))
print('Percentage of diseased females: {:.2%}'.format((num_female - healthy_female) / num_female))
print('Percentage of diseased males: {:.2%}'.format((num_male - healthy_male) / num_male))

### In the above calculation I considered the worst-case scenario, where a "suspect blood donor" is counted as diseased. However, they account for only about 1% of the total, so they are negligible for the purpose of the study.
### The disease occurs more in males than in females in the dataset.

In [None]:
df.describe() # check the summary statistics of df: a first glimpse of how the data are distributed

### From the table above I check the statistics for each of the dataset features. There surely are some outliers, but I am not going to remove them, for the following reasons:

* #### I am not a medical doctor, so I do not have the expertise to discern which features should be considered important for the study.

* #### Blood analysis values for each feature can differ by orders of magnitude between healthy and unhealthy individuals, and the outliers in this DataFrame may contain important information that the models need in order to predict the disease.

* #### I can count on only a limited amount of data, namely 589 usable records. That is very few, and most of them refer to healthy people, so I intend to exploit every single one of them.

In [None]:
### Encode the categorical data. There are only two categorical columns, "Sex" and "Category"; the latter is also the target for the models.

from sklearn.preprocessing import LabelEncoder

# One encoder per column so each mapping stays available afterwards.
le_Category = LabelEncoder()
le_Sex = LabelEncoder()

# Work on a copy so the original string labels in df stay untouched.
dfle = df.copy()
dfle['Category'] = le_Category.fit_transform(dfle['Category'])
dfle['Sex'] = le_Sex.fit_transform(dfle['Sex'])

In [None]:
### Build the feature matrix "X" and the target vector "y" used by the models.

y = dfle['Category']
X = dfle.drop(columns=['Category'])
X

In [None]:
# Display the encoded target column.
y

### Data Modelling
#### I will create a dictionary of models and parameters to iterate over with GridSearchCV, in order to rank the chosen models and find the best one

In [None]:
## Data modelling
# Candidate estimators, each paired with the hyper-parameter grid to search.
# An empty grid means the estimator is evaluated with its own settings only.

model_param = {
    'Randomforest': {
        'model': RandomForestClassifier(),
        'param': {'n_estimators': [1, 5, 10, 15, 20, 25, 30, 40, 50, 60, 80, 100]},
    },
    'LogisticRegression': {
        'model': LogisticRegression(solver='liblinear', multi_class='auto'),
        'param': {'C': [1, 5, 10, 15, 20]},
    },
    'GaussianNB': {
        'model': GaussianNB(),
        'param': {},
    },
    'MultinomialNB': {
        'model': MultinomialNB(),
        'param': {},
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(),
        'param': {'criterion': ['gini', 'entropy']},
    },
    'SVM': {
        'model': SVC(gamma='auto'),
        'param': {
            'C': [0.001, 0.1, 1],
            'kernel': ['rbf', 'linear'],
        },
    },
}

In [None]:
from sklearn.model_selection import GridSearchCV

# Run a 5-fold cross-validated grid search for every candidate model and record the
# best mean score together with the parameters that produced it.
scores = []

for model_name, mp in model_param.items():
    # return_train_score must be a real boolean: recent scikit-learn releases validate
    # estimator parameters when fit() is called and reject None. False keeps the
    # original intent of not computing training scores.
    clf = GridSearchCV(mp['model'], mp['param'], cv=5, return_train_score=False)
    clf.fit(X, y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
    })

In [None]:
# Rank the models from the best to the worst cross-validated score.
dataframe = pd.DataFrame(scores).sort_values(by=['best_score'], ascending=False)
dataframe

### The models in the first three positions have roughly the same accuracy score, which changes slightly when the cells are run again. I decide to use the Logistic Regression algorithm for predictions, and I want to see where it fails by using the confusion matrix.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing.
# random_state makes the split (and therefore the reported accuracy and confusion
# matrix) reproducible across runs; stratify=y keeps the class proportions of the full
# dataset in both parts, so the rare disease classes are not left out of the test set
# by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the logistic regression on the training part and evaluate it on the held-out part.
lr = LogisticRegression(solver='liblinear', multi_class='auto', C=1)
lr.fit(X_train, y_train)
lr_prediction = lr.predict(X_test)
score = lr.score(X_test, y_test)
print('Logistic Regression model has {:.2%} accuracy'.format(score))

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the held-out predictions: rows are the true classes, columns the
# predicted ones.
cm = confusion_matrix(y_test, lr_prediction)
plt.figure(figsize=(10, 7))
# fmt='d' prints the cell counts as plain integers; the default annotation format
# switches to scientific notation (e.g. 1e+02) for counts of 100 or more.
sn.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')
# Trailing semicolon suppresses the Text(...) repr below the figure.
plt.title('LR Confusion matrix');
## Conclusions
#### From the results above, I can see that all of the methods adopted in this study predict with a similar level of accuracy (>>90%). Logistic regression, the random forest classifier and SVM perform slightly better in terms of score than the other approaches do.
#### However, that level of accuracy reflects mostly the blood donors rather than the hepatitis cases, because most of our samples belong to healthy individuals. A few errors appear when it comes to the disease data, as seen in the confusion matrix. That means that, in order to properly predict the disease, we need more samples with that particular disease and fewer NaN values within the dataset.