In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import re

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/projet_bio_info/data/genes_for_clustering.csv')

In [None]:
df.head()

Unnamed: 0,GeneID,desc_length,chromosome,arm_encoded,chrom_encoded,type_biological-region,type_ncRNA,type_other,type_protein-coding,type_pseudo,type_rRNA,type_scRNA,type_snRNA,type_snoRNA,type_tRNA,type_unknown,Symbol,type_of_gene,description
0,1,-0.881502,1.584148,0.702056,1.322565,-0.3855,-0.645132,-0.10092,1.591189,-0.562545,-0.02635,-0.007526,-0.046444,-0.131086,-0.085961,-0.12752,A1BG,protein-coding,alpha-1-B glycoprotein
1,2,-0.963013,0.429485,-1.407337,0.28444,-0.3855,-0.645132,-0.10092,1.591189,-0.562545,-0.02635,-0.007526,-0.046444,-0.131086,-0.085961,-0.12752,A2M,protein-coding,alpha-2-macroglobulin
2,9,-0.963013,-0.230323,-1.407337,-0.308775,-0.3855,-0.645132,-0.10092,1.591189,-0.562545,-0.02635,-0.007526,-0.046444,-0.131086,-0.085961,-0.12752,NAT1,protein-coding,N-acetyltransferase 1
3,10,-0.963013,-0.230323,-1.407337,-0.308775,-0.3855,-0.645132,-0.10092,1.591189,-0.562545,-0.02635,-0.007526,-0.046444,-0.131086,-0.085961,-0.12752,NAT2,protein-coding,N-acetyltransferase 2
4,11,-0.229413,-0.230323,-1.407337,-0.308775,-0.3855,-0.645132,-0.10092,-0.628461,1.777636,-0.02635,-0.007526,-0.046444,-0.131086,-0.085961,-0.12752,NATP,pseudo,N-acetyltransferase pseudogene


In [None]:
categorical_maps = {}
for col in df.select_dtypes(include="object").columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    categorical_maps[col] = dict(zip(le.transform(le.classes_), le.classes_))
print(categorical_maps)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
#Elle enregistre l‚Äôobjet Python categorical_maps (un dictionnaire contenant les correspondances entre les valeurs num√©riques et les cat√©gories d‚Äôorigine) dans un fichier nomm√© "categorical_maps.pkl".
#joblib.dump(categorical_maps, "/content/drive/MyDrive/Colab_Notebooks/projet_bio_info/categorical_maps.pkl")

['/content/drive/MyDrive/Colab_Notebooks/projet_bio_info/categorical_maps.pkl']

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)
#joblib.dump(scaler, "/content/drive/MyDrive/Colab_Notebooks/projet_bio_info/scaler.pkl")

In [None]:
def gap_statistic(X, n_refs=20, max_clusters=10):
    gaps = np.zeros(max_clusters-1)
    for k in range(1, max_clusters):
        kmeans = KMeans(n_clusters=k)
        kmeans.fit(X)
        inertia_k = kmeans.inertia_

        # G√©n√©rer des donn√©es de r√©f√©rence uniformes
        ref_inertia = np.zeros(n_refs)
        for i in range(n_refs):
            random_data = np.random.rand(*X.shape)  # Donn√©es de r√©f√©rence al√©atoires
            kmeans_ref = KMeans(n_clusters=k)
            kmeans_ref.fit(random_data)
            ref_inertia[i] = kmeans_ref.inertia_

        # Calculer la statistique Gap
        gaps[k-1] = np.log(np.mean(ref_inertia)) - np.log(inertia_k)

    # Trouver le "gap" maximal
    optimal_k = np.argmax(gaps) + 1
    return optimal_k

# Utiliser la Gap Statistic
optimal_clusters = gap_statistic(df)
print(f"Le nombre optimal de clusters est : {optimal_clusters}")

Le nombre optimal de clusters est : 9


In [None]:
from sklearn.mixture import GaussianMixture

# Gaussian Mixture Model
print(f"Appliquer Gaussian Mixture Model avec {optimal_clusters} clusters...")
gmm = GaussianMixture(n_components=optimal_clusters, random_state=42)
clusters_gmm = gmm.fit_predict(X_scaled)

# Ajouter les clusters au dataframe
df["gmm_cluster"] = clusters_gmm

# Sauvegarder le mod√®le
#joblib.dump(gmm, "/content/drive/MyDrive/Colab_Notebooks/projet_bio_info/les_autres/gmm_model.pkl")
#print(f"Mod√®le Gaussian Mixture Model sauvegard√©.")


Appliquer Gaussian Mixture Model avec 9 clusters...
Mod√®le Gaussian Mixture Model sauvegard√©.


In [None]:

# √âVALUATION DU MOD√àLE GMM
print("\n" + "="*50)
print("üìä √âVALUATION DU MOD√àLE GAUSSIAN MIXTURE")
print("="*50)

# Silhouette Score
silhouette = silhouette_score(X_scaled, clusters_gmm)

if silhouette < 0.2:
    silhouette_eval = "üî¥ Mauvais regroupement"
elif silhouette < 0.5:
    silhouette_eval = "üü° S√©paration moyenne"
else:
    silhouette_eval = "üü¢ Bonne s√©paration"

print(f"Silhouette Score : {silhouette:.4f}  ({silhouette_eval})")

# Score BIC (Bayesian Information Criterion) - plus bas = mieux
bic_score = gmm.bic(X_scaled)
print(f"BIC Score : {bic_score:.2f}")



üìä √âVALUATION DU MOD√àLE GAUSSIAN MIXTURE
Silhouette Score : 0.2669  (üü° S√©paration moyenne)
BIC Score : -8229898.04
