# Devoir 1 : Étape 1
## Distances entre les classes 

Olivier Lefebvre - 17079778

Simon Giard-Leroux - XXXXXX

### Code préliminaire d'accès aux données et d'import

In [5]:
import itertools
import numpy as np
import pandas as pd
from prettytable import PrettyTable  # Pour faire les tables d'informations
from scipy.spatial import distance

In [6]:
# Loading data from the csv file
iris_data = pd.read_csv('data/iris.csv')

variables_iris = np.array(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])

X = iris_data[variables_iris]
Y = iris_data[['species']]

X_setosa = X.loc[Y['species'] == 'setosa']
X_versicolor = X.loc[Y['species'] == 'versicolor']
X_virginica = X.loc[Y['species'] == 'virginica']


## Définition des fonctions de distance

In [93]:
# On définit la distance euclidienne entre deux points
def dist_euclid(p, q):
    return np.linalg.norm(p-q)

# On définit la distance de mahalanobis
def dist_mahalan(p, q, inv_mat):
    return distance.mahalanobis(p, q, inv_mat)

In [137]:
# Fonction de distance inter classe
def intra_distance(X_classe, dist):
    mean_classe = X_classe.mean().values
    
    max_distance = 0
    for data_point in X_classe.values:
        if dist == 'euc':
            distance_iter = dist_euclid(mean_classe, data_point)
        elif dist =='mah':
            
            cov = sum([ np.outer(data-mean_classe, data-mean_classe) for data in X_classe.values])/(len(X_classe)-1)
            inv_cov = np.linalg.inv(cov)
            
            distance_iter = dist_mahalan(mean_classe, data_point, inv_cov)
        else:
            return -1
            
        max_distance = max(max_distance, distance_iter)
    return round(max_distance, 5)

def inter_distance(X_classe1, mean_classe2, dist, X_2_classes=None):
    min_distance = np.inf
    
    for data_point in X_classe1.values:
        if dist == 'euc':
            distance_iter = dist_euclid(mean_classe2, data_point)
        elif dist =='mah':
            mean_2_classes = X_2_classes.mean().values
            
            cov = sum([ np.outer(data-mean_2_classes, data-mean_2_classes) for data in X_2_classes.values])/(len(X_2_classes)-1)
            inv_cov = np.linalg.inv(cov)
            
            distance_iter = dist_mahalan(mean_classe2, data_point, inv_cov)
        else:
            return -1
            
        min_distance = min(min_distance, distance_iter)
    return round(min_distance, 5)

## Template de base pour les tableaux de distances

In [9]:
columns = ["Distance directionnelle", "setosa", "setosa ", "versicolor", "versicolor ", "virginica", "virginica "]
  
myTable = PrettyTable(columns)
myTable.title = "Template d'informations dans le tableau"
myTable.add_row(["Distance", "euclidean", "Mahalanobis", "euclidean", "Mahalanobis", "euclidean", "Mahalanobis"]) 
myTable.add_row(["_"*25, "_"*10, "_"*10,"_"*10, "_"*10, "_"*10, "_"*10]) 
myTable.add_row(["setosa", "intra-classe", "intra-classe", "inter-classe", "inter-classe", "inter-classe", "inter-classe"]) 
myTable.add_row(["_"*25, "_"*10, "_"*10,"_"*10, "_"*10, "_"*10, "_"*10]) 
myTable.add_row(["versicolor", "inter-classe", "inter-classe", "intra-classe", "intra-classe", "inter-classe", "inter-classe"]) 
myTable.add_row(["_"*25, "_"*10, "_"*10,"_"*10, "_"*10, "_"*10, "_"*10]) 
myTable.add_row(["virginica", "inter-classe", "inter-classe", "inter-classe", "inter-classe", "intra-classe", "intra-classe"])

print(myTable)

+---------------------------------------------------------------------------------------------------------------------+
|                                       Template d'informations dans le tableau                                       |
+---------------------------+--------------+--------------+--------------+--------------+--------------+--------------+
|  Distance directionnelle  |    setosa    |   setosa     |  versicolor  | versicolor   |  virginica   |  virginica   |
+---------------------------+--------------+--------------+--------------+--------------+--------------+--------------+
|          Distance         |  euclidean   | Mahalanobis  |  euclidean   | Mahalanobis  |  euclidean   | Mahalanobis  |
| _________________________ |  __________  |  __________  |  __________  |  __________  |  __________  |  __________  |
|           setosa          | intra-classe | intra-classe | inter-classe | inter-classe | inter-classe | inter-classe |
| _________________________ |  _________

## Génération des tableaux de distance

In [139]:
# On définit l'entête des tables
columns = ["Distance directionnelle", "setosa", "setosa ", "versicolor", "versicolor ", "virginica", "virginica "]

# Pour tous les regroupements de variables
for L in range(1, len(variables_iris)+1):
    for subset in itertools.combinations(variables_iris, L):
        # On calcule les mesures intra-classes selon la distance euclidienne
        euc_intra_setosa = intra_distance(X_classe=X_setosa[list(subset)], dist='euc')
        euc_intra_versicolor = intra_distance(X_classe=X_versicolor[list(subset)], dist='euc')
        euc_intra_virginica = intra_distance(X_classe=X_virginica[list(subset)], dist='euc')

        # On calcule les mesures intra-classes selon la distance de mahalanobis
        mah_intra_setosa = intra_distance(X_classe=X_setosa[list(subset)], dist='mah')
        mah_intra_versicolor = intra_distance(X_classe=X_versicolor[list(subset)], dist='mah')
        mah_intra_virginica = intra_distance(X_classe=X_virginica[list(subset)], dist='mah')

        
        # On calcule les mesures inter-classes directionnelles selon la distance euclidienne
        euc_inter_set_vers = inter_distance(X_classe1=X_setosa[list(subset)],
                                            mean_classe2=X_versicolor[list(subset)].mean().values,
                                            dist='euc')
        euc_inter_set_virg = inter_distance(X_classe1=X_setosa[list(subset)],
                                            mean_classe2=X_virginica[list(subset)].mean().values,
                                            dist='euc')
        
        euc_inter_vers_set = inter_distance(X_classe1=X_versicolor[list(subset)],
                                            mean_classe2=X_setosa[list(subset)].mean().values,
                                            dist='euc')
        euc_inter_vers_virg = inter_distance(X_classe1=X_versicolor[list(subset)],
                                            mean_classe2=X_virginica[list(subset)].mean().values,
                                            dist='euc')
        
        euc_inter_virg_set = inter_distance(X_classe1=X_virginica[list(subset)],
                                            mean_classe2=X_setosa[list(subset)].mean().values,
                                            dist='euc')
        euc_inter_virg_vers = inter_distance(X_classe1=X_virginica[list(subset)],
                                            mean_classe2=X_versicolor[list(subset)].mean().values,
                                            dist='euc')
        
        # On calcule les mesures inter-classes selon la distance de mahalanobis
        mah_inter_set_vers = inter_distance(X_classe1=X_setosa,
                                            X_2_classes=pd.concat([X_setosa, X_versicolor]),
                                            mean_classe2=X_versicolor.mean().values,
                                            dist='mah')
        mah_inter_set_virg = inter_distance(X_classe1=X_setosa,
                                            X_2_classes=pd.concat([X_setosa, X_virginica]),
                                            mean_classe2=X_virginica.mean().values,
                                            dist='mah')
        
        mah_inter_vers_set = inter_distance(X_classe1=X_versicolor,
                                            X_2_classes=pd.concat([X_versicolor, X_setosa]),
                                            mean_classe2=X_setosa.mean().values,
                                            dist='mah')
        mah_inter_vers_virg = inter_distance(X_classe1=X_versicolor,
                                            X_2_classes=pd.concat([X_versicolor, X_virginica]),
                                            mean_classe2=X_virginica.mean().values,
                                            dist='mah')
        
        mah_inter_virg_set = inter_distance(X_classe1=X_virginica,
                                            X_2_classes=pd.concat([X_virginica, X_setosa]),
                                            mean_classe2=X_setosa.mean().values,
                                            dist='mah')
        mah_inter_virg_vers = inter_distance(X_classe1=X_virginica,
                                            X_2_classes=pd.concat([X_virginica, X_versicolor]),
                                            mean_classe2=X_versicolor.mean().values,
                                            dist='mah')
        
        myTable = PrettyTable(columns)
        myTable.title = f"Informations de distance pour les variables {subset}"
        myTable.add_row(["Distance", "euclidean", "Mahalanobis", "euclidean", "Mahalanobis", "euclidean", "Mahalanobis"]) 
        myTable.add_row(["_"*25, "_"*10, "_"*10,"_"*10, "_"*10, "_"*10, "_"*10]) 
        myTable.add_row(["setosa", euc_intra_setosa, mah_intra_setosa, euc_inter_set_vers, mah_inter_set_vers, euc_inter_set_virg, mah_inter_set_virg]) 
        myTable.add_row(["_"*25, "_"*10, "_"*10,"_"*10, "_"*10, "_"*10, "_"*10]) 
        myTable.add_row(["versicolor", euc_inter_vers_set, mah_inter_vers_set, euc_intra_versicolor, mah_intra_versicolor, euc_inter_vers_virg, mah_inter_vers_virg]) 
        myTable.add_row(["_"*25, "_"*10, "_"*10,"_"*10, "_"*10, "_"*10, "_"*10]) 
        myTable.add_row(["virginica", euc_inter_virg_set, mah_inter_virg_set, euc_inter_virg_vers, mah_inter_virg_vers, euc_intra_virginica, mah_intra_virginica])

        print(myTable)
        print("\n")


+------------------------------------------------------------------------------------------------------------+
|                       Informations de distance pour les variables ('sepal_length',)                        |
+---------------------------+------------+-------------+------------+-------------+------------+-------------+
|  Distance directionnelle  |   setosa   |   setosa    | versicolor | versicolor  | virginica  |  virginica  |
+---------------------------+------------+-------------+------------+-------------+------------+-------------+
|          Distance         | euclidean  | Mahalanobis | euclidean  | Mahalanobis | euclidean  | Mahalanobis |
| _________________________ | __________ |  __________ | __________ |  __________ | __________ |  __________ |
|           setosa          |   0.794    |   2.25255   |   0.136    |   2.00285   |   0.788    |   1.90533   |
| _________________________ | __________ |  __________ | __________ |  __________ | __________ |  __________ |
|

+------------------------------------------------------------------------------------------------------------+
|                Informations de distance pour les variables ('sepal_length', 'petal_width')                 |
+---------------------------+------------+-------------+------------+-------------+------------+-------------+
|  Distance directionnelle  |   setosa   |   setosa    | versicolor | versicolor  | virginica  |  virginica  |
+---------------------------+------------+-------------+------------+-------------+------------+-------------+
|          Distance         | euclidean  | Mahalanobis | euclidean  | Mahalanobis | euclidean  | Mahalanobis |
| _________________________ | __________ |  __________ | __________ |  __________ | __________ |  __________ |
|           setosa          |  0.79533   |   3.50201   |   0.9556   |   2.00285   |  1.85268   |   1.90533   |
| _________________________ | __________ |  __________ | __________ |  __________ | __________ |  __________ |
|

+------------------------------------------------------------------------------------------------------------+
|        Informations de distance pour les variables ('sepal_length', 'petal_length', 'petal_width')         |
+---------------------------+------------+-------------+------------+-------------+------------+-------------+
|  Distance directionnelle  |   setosa   |   setosa    | versicolor | versicolor  | virginica  |  virginica  |
+---------------------------+------------+-------------+------------+-------------+------------+-------------+
|          Distance         | euclidean  | Mahalanobis | euclidean  | Mahalanobis | euclidean  | Mahalanobis |
| _________________________ | __________ |  __________ | __________ |  __________ | __________ |  __________ |
|           setosa          |  0.83737   |   3.50523   |  2.66945   |   2.00285   |  4.26557   |   1.90533   |
| _________________________ | __________ |  __________ | __________ |  __________ | __________ |  __________ |
|