# <center>Hepatitis C Virus (HCV) for Egyptian patients<br>Clustering with 3 models</center>

### Libraries 

In [None]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import seaborn as sns
# Apply seaborn's "whitegrid" style to the plots drawn later in the notebook.
sns.set_style("whitegrid")

## Data exploration 

In [None]:
# Read the HCV patient table from the Kaggle input directory into a DataFrame.
csv_path = "../input/hepatitis-c-virus-for-egyptian-patients-data-set/HCV-Egy-Data.csv"
data = pd.read_csv(csv_path)

In [None]:
# (rows, columns) of the loaded table.
data.shape

In [None]:
# Preview the first rows to get a feel for the columns and value ranges.
data.head()

In [None]:
# Column labels available in the dataset.
data.columns

In [None]:
# Per-column dtypes; StandardScaler below is applied to every column, so all
# of them need to be numeric.
data.dtypes

In [None]:
# Count of missing values per column.
data.isna().sum()

In [None]:
# Standardize the features (zero mean, unit variance per column) so that no
# single column dominates the distance-based clustering below.
# NOTE(review): every column of `data` is scaled, including any binary or
# categorical-coded ones — confirm that this is intended.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data)
data_scale = pd.DataFrame(scaled_values, columns=data.columns)

## Unsupervised machine learning models

### KMeans model

In [None]:
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans

# Elbow search for the number of clusters, scanning k = 4 .. 13.
# The search runs on the standardized features (data_scale), i.e. the same
# matrix the final KMeans model is trained on; fitting the elbow on the raw
# table would pick k for a different feature space.
# random_state makes the elbow curve reproducible between runs.
model = KMeans(random_state=42)
visualizer = KElbowVisualizer(model, k=(4,14))
visualizer.fit(data_scale)
# show() is the current name of the deprecated poof().
visualizer.show()

In [None]:
# Fit KMeans with 8 clusters on the standardized features.
# random_state pins the centroid initialisation so that cluster ids are stable
# across runs (the written cluster descriptions in this notebook refer to
# specific ids); n_init=10 pins the number of restarts regardless of the
# installed scikit-learn default.
kmeans = KMeans(n_clusters=8, n_init=10, random_state=42)
clusters = kmeans.fit(data_scale)  # fit() returns the estimator itself

# Predict once and attach the same label vector to both the raw and the
# scaled frame (the raw frame is used for per-cluster medians, the scaled one
# for counts and plotting).
cluster_labels = clusters.predict(data_scale)
data['Cluster'] = cluster_labels
data_scale['Cluster'] = cluster_labels

In [None]:
# Number of patients assigned to each KMeans cluster (not sorted by count).
data_scale['Cluster'].value_counts(sort=False)

In [None]:
from sklearn.decomposition import PCA
import numpy as np

# Project the standardized features onto their principal components and keep
# the first two for 2-D visualisation.
# data_scale carries the KMeans 'Cluster' label column at this point; it must
# be excluded, otherwise the labels leak into the projection as a feature and
# artificially separate the clusters in the plot.
pca_features = data_scale.drop(columns=['Cluster'], errors='ignore')
pca = PCA()
pca.fit(pca_features)
pca_x = pca.transform(pca_features)
# Keep the original row index so label columns can be aligned onto pca_df.
pca_df = pd.DataFrame(pca_x[:, :2], columns=['PC1', 'PC2'], index=pca_features.index)

In [None]:
# Attach the KMeans labels to the 2-D projection. assign() overwrites an
# existing 'Cluster' column, so re-running this cell does not create duplicate
# columns the way concat would.
pca_df = pca_df.assign(Cluster=data_scale['Cluster'])
# x / y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
sns.lmplot(x='PC1', y='PC2', data=pca_df, hue='Cluster', fit_reg=False)

In [None]:
# Per-cluster median of every original (unscaled) column, used to interpret
# what distinguishes the KMeans clusters.
data.groupby('Cluster').median()

In [None]:
# Remove the KMeans label column from both frames so that it is not treated
# as a feature by the steps that follow.
data_scale = data_scale.drop(columns=['Cluster'])
data = data.drop(columns=['Cluster'])

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of the KMeans partition on the standardized features
# (ranges from -1 to 1; higher means better-separated clusters).
kmeans_labels = kmeans.predict(data_scale)
silhouette_score(data_scale, kmeans_labels)

The KMeans model grouped the patients into 8 clusters with differing symptom profiles.<br><br>
Cluster 0 : Men with fever, headache, diarrhea, fatigue and epigastric pain<br>
Cluster 1 : Men with headache, fatigue, jaundice and epigastric pain<br>
Cluster 2 : Women with fever, diarrhea, fatigue and jaundice<br>
Cluster 3 : Women with fever and nausea/vomiting<br>
Cluster 4 : Men with headache, fatigue, jaundice and epigastric pain<br>
Cluster 5 : Men with fever, nausea/vomiting and headache<br>
Cluster 6 : Women and men with fever and epigastric pain<br>
Cluster 7 : Men with nausea/vomiting, headache, diarrhea and jaundice<br>

### DBSCAN model

In [None]:
from sklearn.cluster import DBSCAN

# Density-based clustering on the standardized features.
# NOTE(review): eps=0.01 is a very small neighbourhood radius for
# standardized, multi-dimensional data — presumably most or all points end up
# labelled as noise (-1); confirm, e.g. with a k-distance plot, before drawing
# conclusions from this model.
dbscan = DBSCAN(eps=0.01).fit(data_scale)
data_scale['Cluster2'] = dbscan.labels_

In [None]:
# Attach the DBSCAN labels to the 2-D projection. assign() overwrites an
# existing 'Cluster2' column, so re-running this cell does not create
# duplicate columns the way concat would.
pca_df = pca_df.assign(Cluster2=data_scale['Cluster2'])
# x / y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
sns.lmplot(x='PC1', y='PC2', data=pca_df, hue='Cluster2', fit_reg=False)

In [None]:
# Remove the DBSCAN label column so that it is not treated as a feature by the
# next model.
data_scale = data_scale.drop(columns=['Cluster2'])

Conclusion: with `eps=0.01`, the DBSCAN model failed to cluster the data.

### Gaussian mixture model 

In [None]:
from sklearn.mixture import GaussianMixture

# Gaussian mixture with 8 components on the standardized features.
# fit_predict() both fits the model and returns the labels, so a separate
# fit() beforehand would only train the model twice. random_state makes the
# component assignment reproducible between runs.
gmm = GaussianMixture(n_components=8, random_state=42)
data['Cluster3'] = gmm.fit_predict(data_scale)

In [None]:
# Attach the Gaussian-mixture labels to the 2-D projection. assign()
# overwrites an existing 'Cluster3' column, so re-running this cell does not
# create duplicate columns the way concat would.
pca_df = pca_df.assign(Cluster3=data['Cluster3'])
# x / y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
sns.lmplot(x='PC1', y='PC2', data=pca_df, hue='Cluster3', fit_reg=False)

In [None]:
# Per-component median of every original (unscaled) column, used to interpret
# the Gaussian-mixture clusters.
data.groupby('Cluster3').median()

In [None]:
# Remove the Gaussian-mixture label column from the raw frame again.
data = data.drop(columns=['Cluster3'])

In [None]:
# Silhouette score (Euclidean distance) of the Gaussian-mixture partition on
# the standardized features.
gmm_labels = gmm.predict(data_scale)
silhouette_score(data_scale, gmm_labels, metric='euclidean')

Conclusion: the silhouette score is low and the clusters overlap; the visualization shows that this clustering is not meaningful.