## <span style="color:rgb(25,25,112)">Import des bibliotheques</span>


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import seaborn as sns
%matplotlib inline

## <span style="color:rgb(25,25,112)">Import du fichier CSV</span>


In [2]:
df = pd.read_csv('music_genre.csv', header=0)

**Premières 5 lignes de notre DataFrame**

In [3]:
df.head()

Unnamed: 0,instance_id,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,obtained_date,valence,music_genre
0,32894.0,Röyksopp,Röyksopp's Night Out,27.0,0.00468,0.652,-1.0,0.941,0.792,A#,0.115,-5.201,Minor,0.0748,100.889,4-Apr,0.759,Electronic
1,46652.0,Thievery Corporation,The Shining Path,31.0,0.0127,0.622,218293.0,0.89,0.95,D,0.124,-7.043,Minor,0.03,115.002,4-Apr,0.531,Electronic
2,30097.0,Dillon Francis,Hurricane,28.0,0.00306,0.62,215613.0,0.755,0.0118,G#,0.534,-4.617,Major,0.0345,127.994,4-Apr,0.333,Electronic
3,62177.0,Dubloadz,Nitro,34.0,0.0254,0.774,166875.0,0.7,0.00253,C#,0.157,-4.498,Major,0.239,128.014,4-Apr,0.27,Electronic
4,24907.0,What So Not,Divide & Conquer,32.0,0.00465,0.638,222369.0,0.587,0.909,F#,0.157,-6.266,Major,0.0413,145.036,4-Apr,0.323,Electronic


**Info sur les colonnes**

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50005 entries, 0 to 50004
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   instance_id       50000 non-null  float64
 1   artist_name       50000 non-null  object 
 2   track_name        50000 non-null  object 
 3   popularity        50000 non-null  float64
 4   acousticness      50000 non-null  float64
 5   danceability      50000 non-null  float64
 6   duration_ms       50000 non-null  float64
 7   energy            50000 non-null  float64
 8   instrumentalness  50000 non-null  float64
 9   key               50000 non-null  object 
 10  liveness          50000 non-null  float64
 11  loudness          50000 non-null  float64
 12  mode              50000 non-null  object 
 13  speechiness       50000 non-null  float64
 14  tempo             50000 non-null  object 
 15  obtained_date     50000 non-null  object 
 16  valence           50000 non-null  float6

**Obtenir des statistiques de base sur les données <span style="color:red">numériques</span> :**

In [5]:
df.describe()

Unnamed: 0,instance_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,valence
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,55888.39636,44.22042,0.306383,0.558241,221252.6,0.599755,0.181601,0.193896,-9.133761,0.093586,0.456264
std,20725.256253,15.542008,0.34134,0.178632,128672.0,0.264559,0.325409,0.161637,6.16299,0.101373,0.247119
min,20002.0,0.0,0.0,0.0596,-1.0,0.000792,0.0,0.00967,-47.046,0.0223,0.0
25%,37973.5,34.0,0.02,0.442,174800.0,0.433,0.0,0.0969,-10.86,0.0361,0.257
50%,55913.5,45.0,0.144,0.568,219281.0,0.643,0.000158,0.126,-7.2765,0.0489,0.448
75%,73863.25,56.0,0.552,0.687,268612.2,0.815,0.155,0.244,-5.173,0.098525,0.648
max,91759.0,99.0,0.996,0.986,4830606.0,0.999,0.996,1.0,3.744,0.942,0.992


**Obtenir des statistiques de base sur les données <span style="color:red">qualitatives</span> :**

In [6]:
df.describe(include = object)

Unnamed: 0,artist_name,track_name,key,mode,tempo,obtained_date,music_genre
count,50000,50000,50000,50000,50000,50000,50000
unique,6863,41699,12,2,29394,5,10
top,empty_field,Home,G,Major,?,4-Apr,Electronic
freq,2489,16,5727,32099,4980,44748,5000


> Les valeurs `empty_field` et `?` dans le dataset sont traitées comme des chaînes valides au lieu de valeurs manquantes, nécessitant leur conversion en `NaN` pour une gestion correcte.

## <span style="color:rgb(25,25,112)">Préparation de données</span>


**Compte du nombre de valeurs manquantes par colonne (<span style="color:green">avant</span> remplacement de <span style="color:red">empty_field</span> et <span style="color:red">?</span>)**

In [7]:
df.isnull().sum()

instance_id         5
artist_name         5
track_name          5
popularity          5
acousticness        5
danceability        5
duration_ms         5
energy              5
instrumentalness    5
key                 5
liveness            5
loudness            5
mode                5
speechiness         5
tempo               5
obtained_date       5
valence             5
music_genre         5
dtype: int64

**Remplacement de <span style="color:red">empty_field</span> et <span style="color:red">?</span>**

In [8]:
df.replace(['empty_field', '?'], np.nan, inplace=True)

**Compte du nombre de valeurs manquantes par colonne (<span style="color:green">après</span> remplacement de <span style="color:red">empty_field</span> et <span style="color:red">?</span>)**

In [9]:
df.isnull().sum()

instance_id            5
artist_name         2494
track_name             5
popularity             5
acousticness           5
danceability           5
duration_ms            5
energy                 5
instrumentalness       5
key                    5
liveness               5
loudness               5
mode                   5
speechiness            5
tempo               4985
obtained_date          5
valence                5
music_genre            5
dtype: int64

**Remplacement sur les données <span style="color:red">numériques</span> :**

In [10]:
# Création d'un impute pour remplacer les valeurs manquantes des colonnes numériques
imputer_num = SimpleImputer(strategy="median")

# Sélection
numerical_cols = df.select_dtypes(include=['float64']).columns

# Application de l'impute sur les colonnes numériques
df[numerical_cols] = imputer_num.fit_transform(df[numerical_cols])


**Remplacement sur les données <span style="color:red">qualitatives/catégoriels</span> :**

In [11]:
# Création d'un impute pour les attributs catégoriels
imputer_cat = SimpleImputer(strategy="most_frequent")

# Sélection
categorical_cols = df.select_dtypes(include=['object']).columns

# Application de l'impute sur les colonnes catégorielles
df[categorical_cols] = imputer_cat.fit_transform(df[categorical_cols])


**Supprimer `music_genre`, pour que le modèle ne soit pas influencé par cette colonne**

In [12]:
# Supprimer la colonne 'music_genre' et la stocker dans une variable séparée
labels = df['music_genre'].copy()
df = df.drop('music_genre', axis=1)


**Suppression des Colonnes Inutiles**


In [13]:
df.drop(['artist_name', 'track_name', 'obtained_date', 'instance_id'], axis=1, inplace=True)

**Convertir tempo en Type Numérique**

In [14]:
df['tempo'] = pd.to_numeric(df['tempo'], errors='coerce')

>La colonne `tempo` contient des valeurs numériques, mais elle est stockée en tant que type objet,

**Encodage des Données Catégorielles**

In [15]:
df = pd.get_dummies(df, drop_first=True)
df.head()


Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,...,key_C,key_C#,key_D,key_D#,key_E,key_F,key_F#,key_G,key_G#,mode_Minor
0,27.0,0.00468,0.652,-1.0,0.941,0.792,0.115,-5.201,0.0748,100.889,...,0,0,0,0,0,0,0,0,0,1
1,31.0,0.0127,0.622,218293.0,0.89,0.95,0.124,-7.043,0.03,115.002,...,0,0,1,0,0,0,0,0,0,1
2,28.0,0.00306,0.62,215613.0,0.755,0.0118,0.534,-4.617,0.0345,127.994,...,0,0,0,0,0,0,0,0,1,0
3,34.0,0.0254,0.774,166875.0,0.7,0.00253,0.157,-4.498,0.239,128.014,...,0,1,0,0,0,0,0,0,0,0
4,32.0,0.00465,0.638,222369.0,0.587,0.909,0.157,-6.266,0.0413,145.036,...,0,0,0,0,0,0,1,0,0,0


> l'encodage One-Hot créera une nouvelle colonne pour chaque catégorie unique des données catégorielles, avec des 1 et des 0 indiquant la présence d'une catégorie pour chaque observation 

## <span style="color:rgb(25,25,112)">Classification Hiérarchique</span>

**Création d'un sous-ensemble de données**