<table>
<tr>                                                                                   
     <th>
         <div style='padding:15px;color:#030aa7;font-size:240%;text-align: center;font-style: italic;font-weight: bold;font-family: Georgia, serif'><a href="https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data">Breast Cancer Wisconsin<br>Analyse exploratoire des données</a></div>
     </th>
     <th><img src="https://raw.githubusercontent.com/rbizoi/MachineLearning/refs/heads/master/images/breast_cancer_logo.png" width="96"></th>
 </tr>
</table>

<div style='text-align: center'>
<img src="https://raw.githubusercontent.com/rbizoi/MachineLearning/refs/heads/master/images/breast_cancer.png" width="512">
</div>

<div style='padding:15px;color:#030aa7;font-size:100%;text-align: left;font-family: Georgia, serif'>Les variables sont calculées à partir d'une image numérisée d'une ponction d'aspiration à l'aiguille fine (FNA) utilisée pour obtenir des échantillons à partir de ganglions lymphatiques. Elles décrivent les caractéristiques des noyaux cellulaires présents sur l'image.</div>

<div style='padding:15px;color:#030aa7;font-size:100%;text-align: left;font-family: Georgia, serif'><a href="https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic">Veuillez vous référer à la page <span style="font-weight: bold; color: blue">UC Irvine Machine Learning Repository</span>
 officielle pour plus de détails.</a></div>

# <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Introduction</div></b>
## <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Import libraries </div></b>

In [None]:
import pandas as pd, numpy as np, seaborn as sns, warnings, os
from datetime import datetime as dt
from matplotlib import pyplot as plt
import matplotlib.font_manager as fm
import matplotlib.patheffects as path_effects

import plotly.express as px
import plotly.graph_objs as go

# Font presets reused for axis labels and titles.
font1 = fm.FontProperties(size=20)
font2 = fm.FontProperties(size=24)

# Silence every warning for the rest of the notebook (this also hides
# deprecation warnings raised by seaborn / matplotlib).
warnings.filterwarnings(action="ignore")

# The seaborn style sheets bundled with matplotlib were renamed to
# 'seaborn-v0_8-*' by matplotlib, so the right name depends on the installed
# matplotlib and not on the seaborn version. Pick whichever one is available.
styleGrille = ('seaborn-v0_8-darkgrid'
               if 'seaborn-v0_8-darkgrid' in plt.style.available
               else 'seaborn-darkgrid')
plt.style.use(styleGrille)
sns.set(font_scale=3)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

## <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Outils du document</div></b>

In [None]:
# Colour palette shared by the figures of this notebook (18 hex colours),
# previewed as a strip right after its definition.
palette = [
    "#030aa7", "#e50000", "#d8863b", "#005f6a", "#6b7c85", "#751973",
    "#0485d1", "#ff7855", "#fbeeac", "#0cb577", "#95a3a6", "#c071fe",
    "#d1e5f0", "#fddbc7", "#ffffcb", "#12e193", "#d8dcd6", "#dfc5fe",
]
apercuPalette = sns.color_palette(palette)
sns.palplot(apercuPalette)

In [None]:
# Project layout: <root>/<project name>/ holds one sub-directory for the data
# and one for the exported figures.
repertoireRacine = '.'
nomProjet = 'Breast Cancer Wisconsin'

repertoireProjet = os.path.join(repertoireRacine, nomProjet)
repertoireDonnees, repertoireImages = (
    os.path.join(repertoireProjet, sousRepertoire)
    for sousRepertoire in ('repertoire.donnees', 'repertoire.images')
)


def controleExistenceRepertoire( repertoire, create_if_needed=True):
    """Check that a directory exists, creating it when requested.

    Parameters
    ----------
    repertoire : str
        Path of the directory to check.
    create_if_needed : bool, default True
        When True, a missing directory (and its missing parents) is created.

    Returns
    -------
    bool
        True when the directory exists on return (it was already there or has
        just been created), False when it is missing and was not created.

    Raises
    ------
    NotADirectoryError
        When the path exists but is not a directory.
    """
    if os.path.isdir(repertoire):
        return True
    if os.path.exists(repertoire):
        # NotADirectoryError derives from Exception, so callers that caught
        # the former bare Exception keep working.
        raise NotADirectoryError("Trouvé le nom  "+repertoire +" mais c'est un fichier, pas un répertoire")
    if not create_if_needed:
        return False
    # exist_ok guards against another process creating the directory between
    # the check above and this call.
    os.makedirs(repertoire, exist_ok=True)
    return True

def sauvegarderImage( fichier):
    """Save the current matplotlib figure as a timestamped PNG.

    The file is written in ``repertoireImages`` (created if missing) under the
    name ``<fichier>--<YYYY_MM_DD_HH.MM.SS>.png``, at 600 dpi with a tight
    bounding box. Call it just before ``plt.show()``.
    """
    controleExistenceRepertoire(repertoireImages)
    horodatage = dt.now().strftime('%Y_%m_%d_%H.%M.%S')
    destination = os.path.join(repertoireImages, fichier+f"--{horodatage}.png")
    plt.savefig(destination, dpi=600, bbox_inches='tight')

def sauvegarderImageSNS( sns_plot, fichier):
    """Save the figure owning ``sns_plot`` as ``<fichier>.png``.

    ``sns_plot`` must expose ``get_figure()``; the PNG is written in
    ``repertoireImages``, which is created if missing. Unlike
    ``sauvegarderImage`` the file name carries no timestamp and the default
    ``savefig`` settings are used.
    """
    controleExistenceRepertoire(repertoireImages)
    figure = sns_plot.get_figure()
    destination = os.path.join(repertoireImages, fichier+'.png')
    figure.savefig(destination)
    
# Make sure the project, data and image directories all exist before use.
for repertoireRequis in (repertoireProjet, repertoireDonnees, repertoireImages):
    controleExistenceRepertoire(repertoireRequis)

In [None]:
def formatPct(pct, allvals):
    """Build a pie-wedge label showing a percentage and the matching count.

    Parameters
    ----------
    pct : float
        Share of the wedge, in percent.
    allvals : array-like of numbers
        Every value of the pie; their sum is the total the share applies to.

    Returns
    -------
    str
        ``"<pct with 2 decimals>%\\n(<rounded count>)"``.
    """
    effectif = int(round(pct/100. * np.sum(allvals)))
    return f"{pct:.2f}%\n({effectif:d})"

In [None]:
def affichageDistribution(colonne,couleur,ax):
    """Draw the distribution of ``colonne`` on ``ax`` with mean/median markers.

    Parameters
    ----------
    colonne : object exposing ``mean()`` and ``median()`` (e.g. a pandas Series)
        Values whose distribution is plotted with ``sns.distplot``.
    couleur : colour passed to ``sns.distplot``.
    ax : matplotlib Axes receiving the plot.

    The y-axis label is removed, the mean is drawn as a solid green vertical
    line, the median as a dashed blue one, and a legend is placed top right.
    """
    moyenne = float(colonne.mean())
    mediane = float(colonne.median())

    graph = sns.distplot(colonne, color=couleur, ax=ax)
    graph.set(ylabel=None)

    reperes = (
        (moyenne, 'g', '-',  "mean   = {0:0.1f}"),
        (mediane, 'b', '--', "median = {0:0.1f}"),
    )
    for valeur, couleurTrait, styleTrait, gabarit in reperes:
        ax.axvline(valeur, color=couleurTrait, linestyle=styleTrait,
                   label=gabarit.format(valeur), lw=2)
    graph.legend(loc="upper right")

In [None]:
def afficheDendrogram(*args, **kwargs):
    """Wrap ``dendrogram`` to add merge-distance annotations and a cut line.

    All positional and keyword arguments are forwarded to ``dendrogram``
    except two extra keywords consumed here:

    max_d : number, optional
        Cut distance. When truthy it is used as ``color_threshold`` (unless one
        was given explicitly) and drawn as a black horizontal line.
    annotate_above : number, default 0
        Only merges whose height is greater than this value are annotated.

    Returns
    -------
    dict
        The dictionary returned by ``dendrogram``.
    """
    max_d = kwargs.pop('max_d', None)
    annotate_above = kwargs.pop('annotate_above', 0)
    if max_d:
        kwargs.setdefault('color_threshold', max_d)

    ddata = dendrogram(*args, **kwargs)

    # Nothing to decorate when the caller asked for the data only.
    if kwargs.get('no_plot', False):
        return ddata

    plt.title('Classification Hiérarchique Ascendante')
    plt.ylabel('Distance')
    liens = zip(ddata['icoord'], ddata['dcoord'], ddata['color_list'])
    for abscisses, hauteurs, couleur in liens:
        milieu = 0.5 * sum(abscisses[1:3])   # centre of the horizontal bar
        hauteur = hauteurs[1]                # height of the merge
        if not hauteur > annotate_above:
            continue
        plt.plot(milieu, hauteur, 'o', c=couleur)
        plt.annotate("%.3g" % hauteur, (milieu, hauteur), xytext=(0, -5),
                     textcoords='offset points',
                     va='top', ha='center')
    if max_d:
        plt.axhline(y=max_d, c='k')
    return ddata

In [None]:
def add_median_labels(ax, precision='.1f'):
    """Write the median value on top of each box of a boxplot drawn on ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes holding the boxplot.
    precision : str, default '.1f'
        Format spec applied to the median value.

    Returns None. Does nothing when ``ax`` holds no box patch, or fewer lines
    than boxes.
    """
    lines = ax.get_lines()
    boxes = [c for c in ax.get_children() if type(c).__name__ == 'PathPatch']
    if not boxes or len(lines) < len(boxes):
        # Nothing to label; also avoids a division by zero / a zero slice step.
        return
    # number of lines per box (this varies with/without fliers)
    lines_per_box = len(lines) // len(boxes)
    # NOTE(review): assumes the median is the 5th line drawn for each box,
    # confirm for the installed matplotlib/seaborn.
    for median in lines[4:len(lines):lines_per_box]:
        # centre of the median segment
        x, y = (data.mean() for data in median.get_data())
        # a vertical median segment (constant x) means a horizontal boxplot,
        # whose value is read on the x axis
        value = x if (median.get_xdata()[1]-median.get_xdata()[0]) == 0 else y
        text = ax.text(
                       x,
                       y,
                       f'{value:{precision}}',
                       verticalalignment='center',
                       horizontalalignment='center',
                       fontweight='bold',
                       color='black',
                       bbox=dict(boxstyle='round', facecolor='white', alpha=0.6),
                      )
        # outline the text with the median line colour for contrast
        text.set_path_effects([
            path_effects.Stroke(linewidth=3, foreground=median.get_color()),
            path_effects.Normal(),
        ])

# <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Lecture des données</div></b>
<table>
        <tr>                                                                                   
             <th style='padding:15px;color:#030aa7;font-size:150%;text-align: left;font-weight: bold;font-family: Georgia, serif'>data.csv</th>
             <th><img src="https://raw.githubusercontent.com/rbizoi/MachineLearning/refs/heads/master/images/breast_cancer_logo.png" width="128"></th>
        </tr>  
</table>
<table>
        <tr>                                                                                   
             <th  style="text-align:left;background-color:#053061;color:white;"> </th>
             <th  style="text-align:left;background-color:#053061;color:white;">Colonne initiale </th>
             <th  style="text-align:left;background-color:#053061;color:white;">Description</th>
        </tr>    
    <tr>
        <th  style="text-align:left"> </th>                            
        <th  style="text-align:left;font-style: italic">ID number</th> 
        <th  style="text-align:left;font-style: italic">index</th>
    </tr>    
    <tr>
        <th  style="text-align:left">0 </th>                            
        <th  style="text-align:left;color:red;font-style: italic">Diagnosis</th> 
        <th  style="text-align:left;color:red;font-style: italic">M = malignant, B = benign</th>
    </tr>    
    <tr>
        <th  style="text-align:left">1 </th>                            
        <th  style="text-align:left">radius </th>                            
        <th  style="text-align:left"></th>
    </tr>    
    <tr>
        <th  style="text-align:left">2 </th>                            
        <th  style="text-align:left">texture </th>                          
        <th  style="text-align:left">standard deviation of gray-scale values</th>
    </tr>    
    <tr>
        <th  style="text-align:left">3 </th>                            
        <th  style="text-align:left">perimeter </th>                         
        <th  style="text-align:left"></th>
    </tr>    
    <tr>
        <th  style="text-align:left">4 </th>                            
        <th  style="text-align:left">area</th>                               
        <th  style="text-align:left"></th>
    </tr> 
    <tr>
        <th  style="text-align:left">5 </th>                            
        <th  style="text-align:left">smoothness </th>                            
        <th  style="text-align:left">local variation in radius lengths</th>
    </tr>  
    <tr>
        <th  style="text-align:left">6 </th>                            
        <th  style="text-align:left">compactness </th>                      
        <th  style="text-align:left">$\frac{perimeter^2}{area} - 1.0$</th>
    </tr>    
    <tr>
        <th  style="text-align:left">7 </th>                            
        <th  style="text-align:left">concavity </th>              
        <th  style="text-align:left">severity of concave portions of the contour</th>
    </tr>  
    <tr>
        <th  style="text-align:left">8 </th>                            
        <th  style="text-align:left">concave points </th>                            
        <th  style="text-align:left">number of concave portions of the contour</th>
    </tr>   
    <tr>
        <th  style="text-align:left">9 </th>                            
        <th  style="text-align:left">symmetry </th>                           
        <th  style="text-align:left"></th>
    </tr> 
    <tr>
        <th  style="text-align:left">10 </th>                            
        <th  style="text-align:left">fractal dimension </th>                 
        <th  style="text-align:left">"coastline approximation" - 1</th>
    </tr>    
</table>

<div style='padding:15px;color:#030aa7;font-size:100%;text-align: left;font-family: Georgia, serif'>La moyenne, l’écart type et la « pire valeur » ou la plus grande (moyenne des trois valeurs les plus grandes) de ces variables ont été calculées pour chaque image, ce qui a donné lieu à 30 variables.</div>

In [None]:
# Lookup tables for the target: letter -> code, code -> letter, code -> label.
dictDiagnosis,dictRDiagnosis,dictLabels = {'M':1,'B':0},{1:'M',0:'B'},{1:'malignant',0:'benign'}
# Display the three dictionaries as the cell output.
dictDiagnosis,dictRDiagnosis,dictLabels

In [None]:
# Read the CSV indexed by 'id' and sort the columns alphabetically.
# errors='ignore' keeps the load working when the file has no 'Unnamed: 32'
# column (the drop would otherwise raise KeyError).
# The diagnosis dictionaries are defined once, in the previous cell.
donnees = (
    pd.read_csv('../donnees/data.csv', index_col='id')
      .drop(columns='Unnamed: 32', errors='ignore')
      .sort_index(axis=1)
)

# Name of the target column.
cible = 'diagnosis'

In [None]:
# Peek at 5 randomly drawn rows (no random_state is set, so the rows differ between runs).
donnees.sample(5)

>> <div style='padding:15px;color:#030aa7;background-color:#d8dcd6;font-size:120%;text-align: left;font-family: Georgia, serif'>Il n'y a pas de valeurs non renseignées</div>

In [None]:
# Total number of missing values over the whole frame.
donnees.isna().sum().sum()

In [None]:
# Summary statistics as produced by pandas' describe() with its defaults.
donnees.describe() #.style.format("{:0.2f}") 

# <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Statistiques descriptives et analyse de données</div></b>

## <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Structure de l’échantillon des données </div></b>

In [None]:
# Share (in %) of the rows falling in each diagnosis class.
effectifsParDiagnostic = donnees.groupby(['diagnosis'])['area_mean'].count()
effectifsParDiagnostic * 100 / len(donnees)

In [None]:
# Pie chart of the diagnosis classes: each wedge shows its share and its count.
radius,size=0.8,0.3

# One row per diagnosis with its number of rows and its percentage.
affichage = (
    donnees.groupby('diagnosis')
           .area_mean.count()
           .reset_index()
           .rename(columns={'area_mean':'nombre'})
)
affichage['%'] = affichage.nombre * 100 / affichage.nombre.sum()

fig,ax = plt.subplots(ncols=1,figsize=(16,16), subplot_kw=dict(aspect="equal"))
wedges, texts, autotexts = ax.pie(
    affichage['nombre'],
    labels=affichage.diagnosis.values,
    autopct=lambda pct: formatPct(pct, affichage.nombre.values),
    colors=palette,
    startangle=90,
    counterclock=False,
    labeldistance=1.1,
    explode=[0.01] * len(affichage),       # small gap between the wedges
    textprops=dict(color="#030aa7"),
);
plt.setp(autotexts, size=24, weight="bold",color="w")
plt.setp(texts, size=32, weight="bold");

### <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Distribution variables quantitatives</div></b>

In [None]:
# Same 18-colour list as the `palette` defined in the "Outils du document"
# section; it is re-assigned and previewed again here.
palette = [ "#030aa7", "#e50000", "#d8863b", "#005f6a", "#6b7c85", "#751973", 
            "#0485d1", "#ff7855", "#fbeeac", "#0cb577", "#95a3a6", "#c071fe", 
            "#d1e5f0", "#fddbc7", "#ffffcb", "#12e193", "#d8dcd6", "#dfc5fe", 
          ]
sns.palplot(sns.color_palette(palette))

In [None]:
# One distribution panel per column whose name ends with 'mean'.
colonnes = donnees.filter(regex='(mean)$', axis=1).columns
nbPanneaux = len(colonnes)
fig,axes = plt.subplots(1, nbPanneaux, figsize=(nbPanneaux*12, 12));

for ax,colonne in zip(axes.flat, colonnes):
    affichageDistribution(donnees[colonne], "#6b7c85", ax)

In [None]:
# Same panels as above, after a log(x + 1) transform of each column.
colonnes = donnees.filter(regex='(mean)$', axis=1).columns
nbPanneaux = len(colonnes)
fig,axes = plt.subplots(1, nbPanneaux, figsize=(nbPanneaux*12, 12));

for ax,colonne in zip(axes.flat, colonnes):
    valeursLog = np.log(donnees[colonne]+1)
    affichageDistribution(valeursLog, "#6b7c85", ax)

In [None]:
# For each family of columns (names ending with mean / se / worst), draw one
# row of panels overlaying the distributions of the malignant ('M') and
# benign ('B') rows.
for suffixe in ('mean', 'se', 'worst'):
    colonnes = donnees.filter(regex=f'({suffixe})$', axis=1).columns
    fig,ax = plt.subplots(1, len(colonnes), figsize=(len(colonnes)*12, 12));

    for i,colonne in enumerate(colonnes):
        for rang,modalite in enumerate(('M', 'B')):
            sns.distplot(donnees[colonne][donnees.diagnosis == modalite],
                         color=palette[rang],
                         label=modalite,
                         hist_kws=dict(alpha=0.4),
                         bins=30,
                         ax=ax[i])
        ax[i].legend()


In [None]:
# For each family of columns (names ending with mean / se / worst), draw one
# row of boxplots split by diagnosis.
for suffixe in ('mean', 'se', 'worst'):
    colonnes = donnees.filter(regex=f'({suffixe})$', axis=1).columns
    fig,ax = plt.subplots(1, len(colonnes), figsize=(len(colonnes)*12, 12));

    for i,colonne in enumerate(colonnes):
        sns.boxplot(data=donnees, y=colonne, hue='diagnosis', palette=palette, ax=ax[i]);


## <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Corrélation de Pearson</div></b>
<table>        
<tr>                                                                                   
     <th><img src="https://raw.githubusercontent.com/rbizoi/MachineLearning/refs/heads/master/images/rmse.png" ></th>
     <th><img src="https://raw.githubusercontent.com/rbizoi/MachineLearning/refs/heads/master/images/rse.png" ><img src="https://raw.githubusercontent.com/rbizoi/MachineLearning/refs/heads/master/images/correlation_pearson.png" ></th>
</tr> 
</table>
<img src="https://raw.githubusercontent.com/rbizoi/MachineLearning/refs/heads/master/images/correlation_pearson_graphs.png" width="1024">

In [None]:
# Annotated heatmap of the Pearson correlations between every feature
# (target column excluded).
plt.figure(figsize=(56,56))
sns.set(font_scale=3)
plt.title('Correlation Pearson des variables', y=1.05, size=82)

matriceCorrelation = donnees.drop(columns=cible).corr()
sns.heatmap(matriceCorrelation,
            annot=True,
            fmt='.1f',
            cmap='coolwarm',
            vmax=1.0,
            square=True,
            linewidths=0.3,
            linecolor='white');

## <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Distribution de l’échantillon des données </div></b>

In [None]:
# Pairwise scatter plots of the '*mean' features, coloured by diagnosis.
graph = sns.pairplot(
             donnees.filter(regex='(mean|diagnosis)$', axis=1),
             hue='diagnosis',
             height=8,                 # `size` is the deprecated name of this parameter
             aspect=1,
             palette=palette,
             plot_kws={"s": 1200,"alpha":0.6},
             markers=["o", "s"],       # one marker per diagnosis class (M, B)
             corner=True,
             # diag_kind="kde"
)
# NOTE(review): with corner=True the upper-triangle axes are not created, so
# this call probably draws nothing; confirm before relying on it.
graph.map_upper(sns.kdeplot, levels=24, color=".2");
# Replace the default legend with a larger one.
graph._legend.remove()
graph.add_legend(fontsize='xx-large', title_fontsize='xx-large');
# sauvegarderImage('Distribution de l’échantillon des données')

In [None]:
# Pairwise scatter plots of the '*worst' features, coloured by diagnosis.
graph = sns.pairplot(
             donnees.filter(regex='(worst|diagnosis)$', axis=1),
             hue='diagnosis',
             height=8,                 # `size` is the deprecated name of this parameter
             aspect=1,
             palette=palette,
             plot_kws={"s": 1200,"alpha":0.6},
             markers=["o", "s"],       # one marker per diagnosis class (M, B)
             corner=True,
             # diag_kind="kde"
)
# NOTE(review): with corner=True the upper-triangle axes are not created, so
# this call probably draws nothing; confirm before relying on it.
graph.map_upper(sns.kdeplot, levels=24, color=".2");
# Replace the default legend with a larger one.
graph._legend.remove()
graph.add_legend(fontsize='xx-large', title_fontsize='xx-large');

## <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Centrage et réduction des données</div></b>
<table>
<tr>
<th><img src="https://raw.githubusercontent.com/rbizoi/MachineLearning/refs/heads/master/images/moyenne.png"></th>
<th><img src="https://raw.githubusercontent.com/rbizoi/MachineLearning/refs/heads/master/images/ecart_type.png"></th>
<th><img src="https://raw.githubusercontent.com/rbizoi/MachineLearning/refs/heads/master/images/centrage_reduction.png"></th>
</tr>
</table>

In [None]:
# Standardise every feature (all columns but the target). The scaled values
# overwrite the original ones in `donnees`, so every later cell works on
# standardised data.
modelStd = StandardScaler()
colonnesNumeriques = donnees.columns.drop(cible)
donnees[colonnesNumeriques] = modelStd.fit_transform(donnees[colonnesNumeriques])

In [None]:
# Same overlays as before standardisation: for each family of columns (names
# ending with mean / se / worst), one row of panels comparing the malignant
# ('M') and benign ('B') distributions.
for suffixe in ('mean', 'se', 'worst'):
    colonnes = donnees.filter(regex=f'({suffixe})$', axis=1).columns
    fig,ax = plt.subplots(1, len(colonnes), figsize=(len(colonnes)*12, 12));

    for i,colonne in enumerate(colonnes):
        for rang,modalite in enumerate(('M', 'B')):
            sns.distplot(donnees[colonne][donnees.diagnosis == modalite],
                         color=palette[rang],
                         label=modalite,
                         hist_kws=dict(alpha=0.4),
                         bins=30,
                         ax=ax[i])
        ax[i].legend()


In [None]:
# Same boxplots as before standardisation: one row per family of columns
# (names ending with mean / se / worst), split by diagnosis.
for suffixe in ('mean', 'se', 'worst'):
    colonnes = donnees.filter(regex=f'({suffixe})$', axis=1).columns
    fig,ax = plt.subplots(1, len(colonnes), figsize=(len(colonnes)*12, 12));

    for i,colonne in enumerate(colonnes):
        sns.boxplot(data=donnees, y=colonne, hue='diagnosis', palette=palette, ax=ax[i]);

In [None]:
# First rows of the frame after the standardisation cell above.
donnees.head()

## <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Analyse en composantes principales</div></b>

<table>
<tr>
<th><div style='padding:15px;color:#030aa7;font-size:120%;text-align: center;font-family: Georgia, serif'>Quelle est la meilleure représentation simplifiée ?</div></th>
<th><div style='padding:15px;color:#030aa7;font-size:120%;text-align: center;font-family: Georgia, serif'>Recherche du meilleur axe de projection</div></th>
</tr>    
<tr>
<th><img src="https://raw.githubusercontent.com/rbizoi/MachineLearning/refs/heads/master/images/acp_analogie_photo.png" width="512"></th>
<th><img src="https://raw.githubusercontent.com/rbizoi/MachineLearning/refs/heads/master/images/acp_projection_axe.png" width="512"></th>
</tr>
</table>    
<div style='padding:15px;color:#030aa7;font-size:120%;text-align: left;font-family: Georgia, serif'>À l'évidence, c'est la vue de profil. La raison est que l'image projetée du chameau dans ce plan est plus proche de l'image initiale dans le sens où la variance des points servant à sa représentation est plus grande et donc restitue mieux la variance des points d'origine.</div>
<table>
<tr>
<th><img src="https://raw.githubusercontent.com/rbizoi/MachineLearning/refs/heads/master/images/axe_variance_max.png"></th>
<th><img src="https://raw.githubusercontent.com/rbizoi/MachineLearning/refs/heads/master/images/max_dist.png"></th>
</tr>
</table>

In [None]:
# Fit a PCA (no n_components given, full SVD solver) on every column except
# the target.
modelPCA = PCA(svd_solver='full')
modelPCA.fit(donnees.drop(columns='diagnosis'))

In [None]:
# Cumulative share of explained variance, in percent.
modelPCA.explained_variance_ratio_.cumsum()*100

In [None]:
# Explained variance of each component, multiplied by 100.
modelPCA.explained_variance_*100

In [None]:
# Number of components estimated by the fitted PCA.
modelPCA.n_components_

In [None]:
# Per-dimension inertia table:
#   Inertie   - share of explained variance of the dimension, in % (2 decimals)
#   Label     - cumulative share up to the dimension, in % (2 decimals)
#   Dimension - 1-based rank of the dimension
ratios = modelPCA.explained_variance_ratio_
inertie = pd.DataFrame({
    'Inertie'  : np.round(ratios*100, 2),
    'Label'    : np.round(ratios.cumsum()*100, 2),
    'Dimension': range(1, len(modelPCA.explained_variance_)+1),
})

# Keep only the dimensions actually produced by the PCA.
dim = modelPCA.n_components_
inertie = inertie[inertie['Dimension'] <= dim]

### <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Choix du nombre des dimensions</div></b>
<div style='color:#030aa7;font-size:120%;text-align: left'>La qualité globale est mesurée par la part d'inertie expliquée. La valeur est choisie de sorte que cette part d'inertie expliquée soit supérieure à une valeur seuil fixée a priori par l'utilisateur.<br>C'est souvent le seul critère employé.<br><b>min 75%</b><br><br><b>95%</b></div>

In [None]:
# Count the leading dimensions whose cumulative inertia is <= 95 %, plus one:
# the number of dimensions needed for the cumulative inertia to exceed 95 %.
sum(modelPCA.explained_variance_ratio_.cumsum()*100 <= 95) + 1

In [None]:
# Inertia table restricted to the first 10 dimensions.
inertie[inertie.Dimension <= 10]

### <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Pourcentage d'inertie(variance) associée à chaque dimension</div></b>

In [None]:
# Scree plot: one bar per dimension, labelled with its share of inertia.
fig, ax = plt.subplots(figsize=(24,12));

graph = sns.barplot(x="Dimension",y='Inertie', data=inertie, color="#030764", ax=ax)

# Labels are placed by bar position, so iterate in bar order (Dimension)
# rather than by decreasing inertia: each bar then always gets its own value.
for i,valeur in enumerate(inertie.sort_values('Dimension').Inertie):
    graph.text(
                i ,
                10,                      # fixed height of the label on the y axis
                f'{valeur:0.2f}%',
                color='black',
                rotation='vertical',
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.6),
                verticalalignment='center',
                horizontalalignment='center',
               )

ax.set_xlabel('Dimensions');
ax.set_ylabel('Inertie');
ax.set_title('Éboulis des valeurs propres', fontproperties=fm.FontProperties(size=32))
sauvegarderImage('Choix du nombre des dimensions01')

In [None]:
# Scree plot with the inertia curve overlaid; each bar is labelled with the
# cumulative inertia reached at that dimension.
fig, ax = plt.subplots(figsize=(24,12));
graph = sns.barplot(x="Dimension",y='Inertie', data=inertie, color="#030764", ax=ax)

inertieTriee = inertie.sort_values('Dimension')
for i,(nom,valeur) in enumerate(zip(inertieTriee.Label, inertieTriee.Inertie)):
    # Put the label 10 units below the top of the bar, but never lower than 10.
    valeur = 10 if valeur - 10 < 10 else valeur - 10
    graph.text(
                i ,
                valeur,
                f'{nom:0.2f}%',
                color='black',
                rotation='vertical',
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.6),
                verticalalignment='center',
                horizontalalignment='center',
               )

# Bars sit at positions 0..n-1, hence the `Dimension - 1` abscissa.
# estimator=None draws every observation, so no confidence interval is
# computed and the deprecated `ci` argument is not needed.
graph = sns.lineplot( x=inertie.Dimension - 1,
                      y='Inertie',
                      data=inertie,
                      estimator = None,
                      lw        = 2,
                      color     = "#030764",
                      ax    = ax);

sns.scatterplot(x=inertie.Dimension - 1,
                y     = 'Inertie',
                data  = inertie,
                s     = 200,
                color = "#e50000",
                ax    = ax);

ax.set_xlabel('Dimension', fontproperties=font1)
ax.set_ylabel('Variance / Inertie', fontproperties=font1)
ax.set_title('Éboulis des valeurs propres', fontproperties=fm.FontProperties(size=32))

sauvegarderImage('Choix du nombre des dimensions02')
# Font scale applied to the figures drawn after this cell.
sns.set(font_scale=2)

In [None]:
# Cumulative inertia per dimension: a line with a red dot on every dimension.
fig, ax = plt.subplots(figsize=(24,12));

abscisses = inertie.Dimension-1
sns.lineplot(data=inertie, x=abscisses, y='Label',
             estimator=None, lw=2, color="#030764", ax=ax);
sns.scatterplot(data=inertie, x=abscisses, y='Label',
                s=200, color="#e50000", ax=ax);

ax.set_xlabel('Dimensions');
ax.set_ylabel('Inertie');
ax.set_title('Somme cumulée des inerties', fontproperties=fm.FontProperties(size=32));
sauvegarderImage('Choix du nombre des dimensions03')

### <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Règle de Kaiser</div></b>

<div style='color:#030aa7;font-size:120%;text-align: left'>On ne conserve que les valeurs propres supérieures à leur moyenne car les autres représentent moins de variabilité qu'une seule variable initiale.</div>

In [None]:
# Kaiser threshold in percent: the average share of inertia per component.
print(f'{100 / modelPCA.n_components_}')

In [None]:
# Dimensions whose share of inertia reaches the Kaiser threshold.
inertie[inertie.Inertie >= (100 / modelPCA.n_components_)]

In [None]:
# Number of dimensions retained by the Kaiser rule (row count of the filtered
# table). NOTE(review): this also rebinds the name `_` to the column count.
n_components,_ = inertie[inertie.Inertie >= (100 / modelPCA.n_components_)].shape

### <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Transformation des données</div></b>

In [None]:
# Refit the PCA with the retained number of components and project the
# features; the output is requested as a pandas DataFrame.
caracteristiques = donnees.drop(columns='diagnosis')
modelPCA = PCA(n_components=n_components).set_output(transform="pandas")
donneesACP = modelPCA.fit_transform(caracteristiques)

# Rename the components Dimension01, Dimension02, ... and attach the target.
donneesACP.columns = [f'Dimension{i:02d}' for i in range(1, n_components+1)]
donneesACP['diagnosis'] = donnees['diagnosis']

<div style='padding:15px;color:#030aa7;font-size:120%;text-align: left;font-family: Georgia, serif'>Les nouvelles dimensions doivent être indépendantes deux à deux</div>

In [None]:
# Correlation matrix of the principal components, shown in scientific
# notation on a blue gradient.
donneesACP.drop(columns='diagnosis').corr().style.format("{:0.1e}").background_gradient(cmap=plt.get_cmap('Blues'),axis=0)

### <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Représentation des individus</div></b>

In [None]:
# NOTE(review): `layout` is built here but never passed to the figure in this cell.
layout = go.Layout({"showlegend": False})

# 3D scatter of the first three principal components, coloured by diagnosis;
# the marker size encodes the 4th component shifted so that its minimum is 0.
fig = px.scatter_3d(donneesACP.reset_index(), 
                    x='Dimension01', 
                    y='Dimension02', 
                    z='Dimension03',
                    color='diagnosis',
                    size=(donneesACP.Dimension04 - donneesACP.Dimension04.min()),
                    # symbol='Dimension04',
                    # text='Nom',
                    width=1024,
                    height=1024,
                   )

fig.show()

In [None]:
# Pairwise scatter plots of the principal components, coloured by diagnosis,
# with kernel density estimates on the diagonal.
sns.set(font_scale=3)
graph = sns.pairplot(
             donneesACP,
             hue='diagnosis',
             height=12,                # `size` is the deprecated name of this parameter
             aspect=1,
             palette=palette,
             plot_kws={"s": 1200,"alpha":0.6},
             markers=["o", "s"],
             corner=True,
             diag_kind="kde")
# NOTE(review): with corner=True the upper-triangle axes are not created, so
# this call probably draws nothing; confirm before relying on it.
graph.map_upper(sns.kdeplot, levels=24, color=".2");
# Replace the default legend with a larger one.
graph._legend.remove()
graph.add_legend(fontsize='xx-large', title_fontsize='xx-large');

# <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Classification hiérarchique ascendante</div></b>

In [None]:
# Hierarchical clustering (Ward linkage) on the principal components, target excluded.
Z = linkage(donneesACP.drop(columns='diagnosis'), 'ward')

## <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Affichage des arbres par défaut</div></b>

In [None]:
# Full dendrogram of the linkage Z; each leaf is labelled with its diagnosis.
plt.figure(figsize=(25, 10))
plt.title('Classification Hiérarchique Ascendante')
plt.ylabel('Distance')
dendrogram(Z,
           leaf_rotation=90.,
           leaf_font_size=20.,
           labels = donneesACP.diagnosis.values
          );

## <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Affichage des arbres 10 classes</div></b>

In [None]:
# Truncated dendrogram drawn with the afficheDendrogram helper, which adds
# the merge-distance annotations and the horizontal cut line.
plt.figure(figsize=(25, 10))
afficheDendrogram(
    Z,
    truncate_mode='lastp',
    p=50,                          # number of classes
    leaf_rotation=90.,
    leaf_font_size=20.,
    show_contracted=True,
    annotate_above=10,             # annotate only the merges above this distance
    max_d=80,                      # distance at which the tree is cut
    labels = donneesACP.diagnosis.values
);