In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from scipy.optimize import linear_sum_assignment
from scipy.spatial.distance import squareform
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline  
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from ucimlrepo import fetch_ucirepo

In [None]:
# Download the HCV dataset (repository id 571) through ucimlrepo
hcv_data = fetch_ucirepo(id=571)

# Unpack the payload into the feature table and the target table (pandas DataFrames)
X, y = hcv_data.data.features, hcv_data.data.targets

# Show the dataset-level metadata
print(hcv_data.metadata)

In [None]:
# Variable information. Left as the cell's last expression so the notebook renders
# the table with its rich display instead of print()'s truncated plain-text dump.
hcv_data.variables

In [None]:
# Column roles: 'Sex' is treated as categorical; every remaining column except
# the 'ID' identifier is treated as numerical.
categorical_cols = ['Sex']
non_numerical_cols = categorical_cols + ['ID']
numerical_cols = X.columns.difference(non_numerical_cols)

In [None]:
 # Define the preprocessing pipelines for both numerical and categorical data
# Numerical features: fill missing values with the column mean, then standardize
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill missing values with the most frequent category, then
# one-hot encode (drop='if_binary' keeps a single column for two-category features)
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='if_binary')),
])

In [None]:
# Route each column group through its own pipeline; the transformed groups are
# concatenated in the order listed here (numerical first, categorical second)
column_pipelines = [
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_pipelines)

In [None]:
# Fit the preprocessor on the feature table and transform it in the same call
X_preprocessed = preprocessor.fit_transform(X)

In [None]:
# Rebuild a labelled DataFrame from the transformed array: numerical column names
# first, followed by the names the fitted encoder generated for the categorical columns
fitted_encoder = preprocessor.named_transformers_['cat'].named_steps['encoder']
encoded_cat_names = fitted_encoder.get_feature_names_out(categorical_cols).tolist()
preprocessed_columns = numerical_cols.tolist() + encoded_cat_names
X_preprocessed_df = pd.DataFrame(X_preprocessed, index=X.index, columns=preprocessed_columns)

In [None]:
# Save the preprocessed features to a CSV file in the working directory;
# index=False leaves the row index out of the file
X_preprocessed_df.to_csv('preprocessed_hcv_data.csv', index=False)

In [None]:
# Apply PCA: project the preprocessed features onto their first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_preprocessed_df)

In [None]:
# Targets as a flat numpy array when y is a DataFrame; any other type is used unchanged
if isinstance(y, pd.DataFrame):
    y_array = np.array(y).flatten()
else:
    y_array = y

In [None]:
# Scatter plot of the first two principal components, coloured by class.
# The figure is created in the same cell as the plotting calls: with the inline
# backend every cell renders and closes its own figures, so a figure opened in an
# earlier cell is shown empty and the points end up on a new default-size figure.
colors = ['navy', 'turquoise', 'darkorange', 'red', 'purple']

# Sorted unique labels give a stable label -> colour assignment
unique_labels = sorted(np.unique(y_array))
color_map = dict(zip(unique_labels, colors))

plt.figure(figsize=(12, 6))
for label in unique_labels:
    index = y_array == label  # boolean mask selecting the samples of this class
    plt.scatter(X_pca[index, 0], X_pca[index, 1], color=color_map[label], alpha=.8, label=label)

plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('PCA of HCV dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

In [None]:
# Scree plot: cumulative explained variance against the number of components kept
pca_variance = pca.explained_variance_ratio_

# 1-based x values so the axis really reads as "number of components"
# (plotting the cumulative sum alone would put the first component at x=0)
component_counts = range(1, len(pca_variance) + 1)

plt.figure(figsize=(6, 4))
plt.plot(component_counts, np.cumsum(pca_variance), marker='o')
plt.xticks(component_counts)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Scree Plot')

plt.show()

In [None]:
# Explained-variance ratio of each component of the 2-component PCA
print(pca_variance)

In [None]:
# Two components were noted to reach only about 35 percent of the variance,
# so repeat the PCA with 4 components
pca_4 = PCA(n_components=4)
X_pca_4 = pca_4.fit_transform(X_preprocessed_df)

# Share of the total variance explained by each of the 4 components
pca_variance_4 = pca_4.explained_variance_ratio_
print("Explained variance for each component:", pca_variance_4)

# Scree plot: cumulative explained variance against a 1-based component count
component_numbers = range(1, len(pca_variance_4) + 1)
cumulative_variance_4 = np.cumsum(pca_variance_4)

plt.figure(figsize=(8, 5))
plt.plot(component_numbers, cumulative_variance_4, marker='o')
plt.title('Scree Plot with 4 Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.xticks(component_numbers)
plt.show()


In [None]:
# With 4 components the cumulative explained variance reaches around 60 percent, so this projection could be more useful for further work with PCA.

In [None]:
#t-SNE

from sklearn.manifold import TSNE

# Reduce the targets to a 1-D array when y is a DataFrame; otherwise use y as given
if isinstance(y, pd.DataFrame):
    y_flat = y.values.ravel()
else:
    y_flat = y

# Integer-encode the class labels; the encoder is kept to recover the class names
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_flat)

# 2-D t-SNE embedding of the preprocessed data (fixed random_state for repeatable runs)
tsne = TSNE(n_components=2, perplexity=36, random_state=42)
X_tsne = tsne.fit_transform(X_preprocessed_df)

# One scatter series per encoded class, labelled with the original class name
plt.figure(figsize=(10, 10))
unique_labels = np.unique(y_encoded)
colors = ['navy', 'turquoise', 'darkorange', 'red', 'purple', 'green']

for i, label in enumerate(unique_labels):
    class_mask = y_encoded == label
    class_name = label_encoder.inverse_transform([label])[0]
    plt.scatter(X_tsne[class_mask, 0], X_tsne[class_mask, 1],
                color=colors[i], label=class_name)

plt.legend()
plt.title('t-SNE projection of HCV dataset')
plt.xlabel('t-SNE feature 1')
plt.ylabel('t-SNE feature 2')
plt.show()


In [None]:
# k-means
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

max_k = 5

# Number of true classes; loop-invariant, so computed once
n_classes = len(np.unique(y_encoded))

# Fit k-means for every k from 2 to max_k and colour the PCA projection by cluster
for k in range(2, max_k + 1):
    # n_init is set explicitly because its default differs between scikit-learn
    # versions; pinning it keeps the clustering identical across environments
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_preprocessed_df)
    cluster_labels = kmeans.labels_

    # Clusters are fitted on the full preprocessed data; PCA is used only for display
    plt.figure(figsize=(12, 6))
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='viridis', alpha=0.6)
    plt.colorbar(ticks=range(k))
    plt.title(f'k-Means Clustering with k={k}')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.show()

    # When k equals the number of true classes, compare clusters with classes
    if k == n_classes:
        cm = confusion_matrix(y_encoded, cluster_labels)

        # Cluster indices are arbitrary, so the raw diagonal carries no meaning.
        # Pair every class with one cluster so that the total overlap is maximal,
        # then reorder the columns so each pair sits on the diagonal.
        _, cluster_order = linear_sum_assignment(cm, maximize=True)
        cm_aligned = cm[:, cluster_order]

        plt.figure(figsize=(8, 8))
        # Column ticks show the original k-means cluster index of each column
        sns.heatmap(cm_aligned, annot=True, fmt='d', cmap='Blues', xticklabels=cluster_order)
        plt.title(f'Confusion Matrix for k={k}')
        plt.xlabel('Cluster (reordered to best match the classes)')
        plt.ylabel('True Class')
        plt.show()



In [None]:
# k-means analysis/interpretation:

# The diagonal cells of the confusion matrix count the points whose cluster matches their actual class (true positives).
# Note that k-means cluster indices are arbitrary, so a diagonal cell only counts genuine matches when the clusters are ordered to correspond to the classes.
# Off-diagonal cells show where points from one class were placed into a different cluster (false positives/negatives).
# From the matrix, it seems that one class (likely the majority class) dominates two clusters (0 and 1), suggesting that k-means has difficulty distinguishing between some of the classes. The other classes appear to be more evenly distributed across clusters, but with no clear one-to-one correspondence, indicating some degree of misclassification.

In [None]:
# Let's advance with AHC (agglomerative hierarchical clustering)

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt

# Compute the pairwise distances of the preprocessed data (square n x n matrix)
dist_matrix = pairwise_distances(X_preprocessed_df)

# linkage() treats any 2-D input as raw observation vectors, not as distances, so
# passing the square matrix would cluster the rows of the distance matrix instead.
# Convert it to the condensed 1-D form that linkage() reads as distances.
# checks=False skips the exact symmetry / zero-diagonal validation, which
# floating-point round-off in the distance computation can otherwise trip.
condensed_dist = squareform(dist_matrix, checks=False)

# Perform AHC with UPGMA (average) linkage
upgma_linkage = linkage(condensed_dist, method='average')

# Perform AHC with complete linkage
complete_linkage = linkage(condensed_dist, method='complete')

# Display settings shared by both truncated dendrograms
dendrogram_options = dict(
    truncate_mode='lastp',    # show only the last p merged clusters
    p=10,                     # p = 10 merged clusters
    show_leaf_counts=False,   # do not print bracketed observation counts on the leaves
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,     # mark the heights of merges hidden inside truncated branches
)

# One figure per linkage method: UPGMA first, complete linkage second
dendrogram_specs = (
    (upgma_linkage, 'Truncated Dendrogram for AHC using UPGMA Linkage'),
    (complete_linkage, 'Truncated Dendrogram for AHC using Complete Linkage'),
)
for linkage_matrix, figure_title in dendrogram_specs:
    plt.figure(figsize=(12, 6))
    dendrogram(linkage_matrix, **dendrogram_options)
    plt.title(figure_title)
    plt.xlabel('Cluster size')
    plt.ylabel('Distance')
    plt.show()


In [None]:
#Let's advance with the last part on this dataset: SOM

In [None]:
from minisom import MiniSom

# Initialize and train the SOM on a 10x10 grid.
# random_seed is fixed so the weight initialisation and the random sample order of
# train_random() are repeatable, in line with the fixed seeds used for t-SNE and k-means.
som_input = X_preprocessed_df.values
som = MiniSom(x=10, y=10, input_len=X_preprocessed_df.shape[1], sigma=0.7, learning_rate=0.35,
              random_seed=42)
som.train_random(som_input, num_iteration=5000)

# Visualize the SOM with a U-matrix (inter-neuron distance map) as background
plt.figure(figsize=(10, 10))
plt.pcolor(som.distance_map().T, cmap='bone_r')
plt.colorbar()

# Overlay with class labels: one marker/colour pair per encoded class
markers = ['o', 's', 'D', '+', 'x']  # as many markers as there are classes
colors = ['r', 'g', 'b', 'c', 'm']   # colors for the markers
for cnt, xx in enumerate(som_input):
    w = som.winner(xx)  # grid coordinates of the best-matching unit for sample xx
    class_idx = y_encoded[cnt]
    # +.5 centres the marker inside the winning cell of the pcolor grid
    plt.plot(w[0]+.5, w[1]+.5, markers[class_idx], markerfacecolor='None',
             markeredgecolor=colors[class_idx], markersize=12, markeredgewidth=2)

plt.title('SOM U-matrix with class markers')
plt.show()


In [None]:
# Plot the component planes: one heat map of the SOM weights per input feature
feature_names = list(X_preprocessed_df.columns)
num_features = X_preprocessed_df.shape[1]
som_weights = som.get_weights()

# Lay the planes out on a grid of modest panels instead of one very tall column;
# squeeze=False keeps axs two-dimensional even for a single row or a single feature
n_cols = 4
n_rows = -(-num_features // n_cols)  # ceiling division
fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols * 4, n_rows * 4), squeeze=False)
flat_axes = axs.ravel()

for i, (ax, feature_name) in enumerate(zip(flat_axes, feature_names)):
    # Transposed so the planes share the orientation of the U-matrix plot
    mesh = ax.pcolor(som_weights[:, :, i].T, cmap='coolwarm')
    fig.colorbar(mesh, ax=ax)
    ax.set_title(f'Component plane for feature {i}: {feature_name}')

# Hide the unused panels of the last grid row
for ax in flat_axes[num_features:]:
    ax.axis('off')

fig.tight_layout()
plt.show()