In [2]:
print("""
PROBLEM STATEMENT: TASK 2
The primary objective of this study is to employ K-means clustering to group patients based on their health profiles, utilizing variables such as age, sex, and various health indicators (ALB, ALP, AST, BIL, CHE, CHOL, CREA). The target variable for clustering is the "Category," representing different health conditions with values '0=Blood Donor', '0s=Suspect Blood Donor', '1=Hepatitis', '2=Fibrosis', '3=Cirrhosis'. The goal is to identify clusters of patients with similar health characteristics and observe if these clusters align with the predefined health categories. This analysis aims to reveal patterns and associations between health indicators and specific health conditions, providing insights into potential risk factors and aiding in the development of targeted healthcare strategies.
""")


PROBLEM STATEMENT: TASK 2
The primary objective of this study is to employ K-means clustering to group patients based on their health profiles, utilizing variables such as age, sex, and various health indicators (ALB, ALP, AST, BIL, CHE, CHOL, CREA). The target variable for clustering is the "Category," representing different health conditions with values '0=Blood Donor', '0s=Suspect Blood Donor', '1=Hepatitis', '2=Fibrosis', '3=Cirrhosis'. The goal is to identify clusters of patients with similar health characteristics and observe if these clusters align with the predefined health categories. This analysis aims to reveal patterns and associations between health indicators and specific health conditions, providing insights into potential risk factors and aiding in the development of targeted healthcare strategies.



In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv("/hcv+data.zip")

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,1,0=Blood Donor,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,2,0=Blood Donor,32,m,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,3,0=Blood Donor,32,m,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,4,0=Blood Donor,32,m,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,5,0=Blood Donor,32,m,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7


In [None]:
df["Category"].unique()

array(['0=Blood Donor', '0s=suspect Blood Donor', '1=Hepatitis',
       '2=Fibrosis', '3=Cirrhosis'], dtype=object)

In [None]:
df.isna().sum()

Unnamed: 0     0
Category       0
Age            0
Sex            0
ALB            1
ALP           18
ALT            1
AST            0
BIL            0
CHE            0
CHOL          10
CREA           0
GGT            0
PROT           1
dtype: int64

In [None]:
df=df.dropna()

In [None]:
df

Unnamed: 0.1,Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,1,0=Blood Donor,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,2,0=Blood Donor,32,m,38.5,70.3,18.0,24.7,3.9,11.17,4.80,74.0,15.6,76.5
2,3,0=Blood Donor,32,m,46.9,74.7,36.2,52.6,6.1,8.84,5.20,86.0,33.2,79.3
3,4,0=Blood Donor,32,m,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,5,0=Blood Donor,32,m,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
608,609,3=Cirrhosis,58,f,34.0,46.4,15.0,150.0,8.0,6.26,3.98,56.0,49.7,80.6
609,610,3=Cirrhosis,59,f,39.0,51.3,19.6,285.8,40.0,5.77,4.51,136.1,101.1,70.5
610,611,3=Cirrhosis,62,f,32.0,416.6,5.9,110.3,50.0,5.57,6.30,55.7,650.9,68.5
611,612,3=Cirrhosis,64,f,24.0,102.8,2.9,44.4,20.0,1.54,3.02,63.0,35.9,71.3


In [None]:
df.columns

Index(['Unnamed: 0', 'Category', 'Age', 'Sex', 'ALB', 'ALP', 'ALT', 'AST',
       'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT'],
      dtype='object')

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

numeric_features = ['Age', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT']
categorical_features = ['Category', 'Sex']
scaler = StandardScaler()
onehot = OneHotEncoder()
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numeric_features),
        ('cat', onehot, categorical_features)
    ])
best_k = None
best_score = -1
for k in range(2, 40):
    kmeans = KMeans(n_clusters=k, random_state=42)
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('kmeans', kmeans)
    ])
    pipeline.fit(df)
    labels = pipeline.named_steps['kmeans'].labels_
    score = silhouette_score(df[numeric_features], labels)

    if score > best_score:
        best_k = k
        best_score = score

final_kmeans = KMeans(n_clusters=best_k, random_state=42)
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('kmeans', final_kmeans)
])

final_pipeline.fit(df)
print(df)
df['cluster_labels'] = final_pipeline.named_steps['kmeans'].labels_
print(df)
print(f"The best k value is: {best_k}")




     Unnamed: 0       Category  Age Sex   ALB    ALP   ALT    AST   BIL  \
0             1  0=Blood Donor   32   m  38.5   52.5   7.7   22.1   7.5   
1             2  0=Blood Donor   32   m  38.5   70.3  18.0   24.7   3.9   
2             3  0=Blood Donor   32   m  46.9   74.7  36.2   52.6   6.1   
3             4  0=Blood Donor   32   m  43.2   52.0  30.6   22.6  18.9   
4             5  0=Blood Donor   32   m  39.2   74.1  32.6   24.8   9.6   
..          ...            ...  ...  ..   ...    ...   ...    ...   ...   
608         609    3=Cirrhosis   58   f  34.0   46.4  15.0  150.0   8.0   
609         610    3=Cirrhosis   59   f  39.0   51.3  19.6  285.8  40.0   
610         611    3=Cirrhosis   62   f  32.0  416.6   5.9  110.3  50.0   
611         612    3=Cirrhosis   64   f  24.0  102.8   2.9   44.4  20.0   
612         613    3=Cirrhosis   64   f  29.0   87.3   3.5   99.0  48.0   

       CHE  CHOL   CREA    GGT  PROT  cluster_labels  
0     6.93  3.23  106.0   12.1  69.0        

In [42]:
df

Unnamed: 0.1,Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT,cluster_labels
0,1,0=Blood Donor,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0,0
1,2,0=Blood Donor,32,m,38.5,70.3,18.0,24.7,3.9,11.17,4.80,74.0,15.6,76.5,0
2,3,0=Blood Donor,32,m,46.9,74.7,36.2,52.6,6.1,8.84,5.20,86.0,33.2,79.3,0
3,4,0=Blood Donor,32,m,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7,0
4,5,0=Blood Donor,32,m,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
608,609,3=Cirrhosis,58,f,34.0,46.4,15.0,150.0,8.0,6.26,3.98,56.0,49.7,80.6,0
609,610,3=Cirrhosis,59,f,39.0,51.3,19.6,285.8,40.0,5.77,4.51,136.1,101.1,70.5,1
610,611,3=Cirrhosis,62,f,32.0,416.6,5.9,110.3,50.0,5.57,6.30,55.7,650.9,68.5,1
611,612,3=Cirrhosis,64,f,24.0,102.8,2.9,44.4,20.0,1.54,3.02,63.0,35.9,71.3,1


In [44]:
new = pd.read_excel("/content/new.xlsx")
new

Unnamed: 0.1,Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,9,0=Blood Donor,32,m,50.9,65.5,23.2,21.2,6.9,8.69,4.1,83,13.7,71.3
1,10,0=Blood Donor,32,m,42.4,86.3,20.3,20.0,35.2,5.46,4.45,81,15.9,69.9
2,11,0=Blood Donor,32,m,44.3,52.3,21.7,22.4,17.2,4.15,3.57,78,24.1,75.4


In [46]:
df.columns

Index(['Unnamed: 0', 'Category', 'Age', 'Sex', 'ALB', 'ALP', 'ALT', 'AST',
       'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT', 'cluster_labels'],
      dtype='object')

In [48]:
column1 = df[['Age', 'Sex']]

In [49]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in column1:
    new[i]=le.fit_transform(new[i])

In [55]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
columns_to_scale = ['ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT']

# Assuming 'new' is your DataFrame
new[columns_to_scale] = scaler.fit_transform(new[columns_to_scale])


In [56]:
new

Unnamed: 0.1,Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,9,0=Blood Donor,0,0,1.381778,-0.18101,1.238577,0.0,-1.100175,1.357516,0.165858,1.13555,-0.938523,-0.385636
1,10,0=Blood Donor,0,0,-0.951688,1.305176,-1.210427,-1.224745,1.31964,-0.335448,1.133364,0.162221,-0.446916,-0.985513
2,11,0=Blood Donor,0,0,-0.43009,-1.124166,-0.028149,1.224745,-0.219465,-1.022068,-1.299222,-1.297771,1.385439,1.371149


In [58]:
final_pipeline.fit(new)
final_pipeline.predict(new)



array([1, 1, 0], dtype=int32)

In [5]:
print("""
Interpretation:-

Cluster 0
Cluster 0 appears to consist of individuals who share characteristics indicative of a healthy state or blood donors. The biomarker values within this cluster, including albumin (ALB), alkaline phosphatase (ALP), and total protein (PROT), fall within moderate to high normal ranges. Liver enzyme levels such as alanine transaminase (ALT), aspartate transaminase (AST), and gamma-glutamyl transferase (GGT) are generally within acceptable limits. These individuals may represent a group of healthy donors or individuals with no significant liver-related health issues.

Cluster 1
Cluster 1 suggests a population with potential liver-related health concerns, possibly indicative of cirrhosis. Individuals in this cluster exhibit elevated levels of liver enzymes (ALT, AST, GGT), pointing to liver stress or damage. Bilirubin (BIL) levels are also higher than normal, suggesting impaired liver function. While cholesterol (CHOL) levels vary, some individuals may have abnormal values. This cluster likely represents a cohort with liver-related health issues, and further medical investigation and confirmation are advisable.
""")



Interpretation:-

Cluster 0
Cluster 0 appears to consist of individuals who share characteristics indicative of a healthy state or blood donors. The biomarker values within this cluster, including albumin (ALB), alkaline phosphatase (ALP), and total protein (PROT), fall within moderate to high normal ranges. Liver enzyme levels such as alanine transaminase (ALT), aspartate transaminase (AST), and gamma-glutamyl transferase (GGT) are generally within acceptable limits. These individuals may represent a group of healthy donors or individuals with no significant liver-related health issues.

Cluster 1
Cluster 1 suggests a population with potential liver-related health concerns, possibly indicative of cirrhosis. Individuals in this cluster exhibit elevated levels of liver enzymes (ALT, AST, GGT), pointing to liver stress or damage. Bilirubin (BIL) levels are also higher than normal, suggesting impaired liver function. While cholesterol (CHOL) levels vary, some individuals may have abnorma