## Mise en place de la base de données

In [None]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Load the dataset from the semicolon-separated CSV file.
champi = pd.read_csv('agaricus-lepiota V2.csv', sep=';')

In [None]:
# Univariate analysis: for every column, draw a count plot and print the
# share of each distinct value as a percentage of all rows.
C_cols = list(champi.columns)
n_rows = champi.shape[0]
for attribut in C_cols:
    plt.figure()
    sns.countplot(data=champi, x=attribut)
    plt.show()
    print("% des differentes valeurs:")
    # Fraction of rows per value, rounded to 4 decimals, then scaled to percent.
    proportions = champi[attribut].value_counts() / n_rows
    print(round(proportions, 4) * 100)

In [None]:
# Bivariate analysis: distribution of each attribute split by class.
# The first entry of C_cols is skipped (presumably the class column itself).
for attribut in C_cols[1:]:
    plt.figure(figsize=(30, 20))
    # The plot is drawn in slot 3 of a 3x3 grid, i.e. the top-right cell only.
    plt.subplot(3, 3, 3)
    sns.countplot(data=champi, x=attribut, hue='class')
    plt.xlabel(attribut, fontsize=30)
    plt.legend(loc='upper right')
    plt.show()
    # Tabulate the (attribute value, class) pairs below the figure.
    class_counts = pd.pivot_table(
        champi,
        index=[attribut, "class"],
        aggfunc={attribut: np.count_nonzero},
    )
    print(class_counts)

## Mise en place en Python

In [None]:
# Features: every column of the dataset except 'class'.
X = champi.drop(columns='class')

# Remember the attribute names as they are before encoding.
column_names = X.columns.tolist()

# One-hot encode the categorical attributes.
X = pd.get_dummies(X)

# Target: one-hot encode 'class', drop the first level and flatten the result
# (for a two-level class this leaves a single 1-D indicator vector).
y = pd.get_dummies(champi['class'], drop_first=True).to_numpy().squeeze()

In [None]:
# Hold out 20% of the rows as a test set; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## ACM & analyse des composantes

In [None]:
import prince
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.feature_selection import SelectKBest, chi2

# Dataset without the class column, used as input of the MCA.
amcchampi = champi.drop('class', axis=1)
amcchampi.columns = column_names

# Fit a multiple correspondence analysis (MCA) with 15 components.
# NOTE(review): `explained_inertia_` and `plot_coordinates` are used as in the
# original cell; confirm they exist in the installed prince version.
mca = prince.MCA(n_components=15, n_iter=3, copy=True, check_input=True, engine='auto', random_state=42)
mca = mca.fit(amcchampi)

# Inertia explained by each component. A bare expression in the middle of a
# cell is not displayed, so it is printed explicitly.
print(mca.explained_inertia_)

# Plot the column (category) coordinates on the first two components and
# save the figure as an SVG file.
ax = mca.plot_coordinates(X=amcchampi, ax=None, figsize=(50, 50), x_component=0, y_component=1, show_row_points=False, row_points_size=10,
                          show_row_labels=False, show_column_points=True, column_points_size=30, show_column_labels=True, legend_n_cols=1)
ax.get_figure().savefig('mca.svg')

In [None]:
# Feature-relevance analysis: chi-squared score of every attribute against
# the class.
X1 = db = champi.drop('class', axis=1)  # `db` aliases the same frame; unused in this cell
y1 = champi['class']

# Encode the categorical attributes as ordinal codes (chi2 requires
# non-negative numeric input).
oe = OrdinalEncoder()
X_enc = oe.fit_transform(X1)

# Encode the class labels as integers.
le = LabelEncoder()
y_enc = le.fit_transform(y1)

# Score every attribute; k='all' keeps all of them instead of selecting a subset.
sf = SelectKBest(chi2, k='all')
sf_fit1 = sf.fit(X_enc, y_enc)
for feature, score in zip(X1.columns, sf_fit1.scores_):
    print(' %s: %f' % (feature, score))

# Horizontal bar chart of the attributes ordered by score.
datset1 = pd.DataFrame({'feature': X1.columns, 'scores': sf_fit1.scores_})
datset1 = datset1.sort_values(by='scores', ascending=True)
# The style has to be set before drawing for it to apply to this figure.
sns.set_style('whitegrid')
# x/y are passed by keyword: recent seaborn releases no longer accept the
# data vectors positionally.
sns.barplot(x=datset1['scores'], y=datset1['feature'], color='green')
plt.ylabel('Categorical feature', fontsize=18)
plt.xlabel('Score', fontsize=18)
plt.show()

### Partie Arbre de décision

In [None]:
#Partie arbre de décision
from sklearn import tree
from math import *
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid-search the tree depth and the minimum split size with 10-fold
# cross-validation on the training set.
pgrid = {
    "max_depth": list(range(1, 10)),
    "min_samples_split": [2, 3, 5, 10, 15, 20],
}
grid_search = GridSearchCV(
    estimator=tree.DecisionTreeClassifier(),
    param_grid=pgrid,
    cv=10,
)
grid_search.fit(X_train, y_train)
# Last expression of the cell: shows the selected parameter combination.
grid_search.best_params_

In [None]:
# Final decision tree built from the parameters selected by the grid search.
# The tuned depth is passed as `max_depth` (the parameter that was searched),
# not as `min_samples_leaf`, which is a different constraint.
best_tree_params = grid_search.best_params_
arbre = tree.DecisionTreeClassifier(
    max_depth=best_tree_params["max_depth"],
    min_samples_split=best_tree_params["min_samples_split"],
    random_state=0,  # fixed seed so the fitted tree is reproducible
)
arbre.fit(X_train, y_train)
# Last expression of the cell: accuracy on the held-out test set.
arbre.score(X_test, y_test)

In [None]:
# Draw the fitted tree on the current matplotlib figure.
tree.plot_tree(arbre, filled=True)

# Export the tree in Graphviz format to arbre.dot.
# export_graphviz writes into the handle; its return value is not bound to
# `f`, so the name keeps referring to the file object.
with open("arbre.dot", 'w') as f:
    tree.export_graphviz(arbre, out_file=f, filled=True)

# Save the drawn tree as an image for easier viewing.
plt.savefig('arbre.png')

## Bagging, Forêts aléatoires et Boosting

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Grid-search the sampling fractions of the bagging ensemble (5-fold CV).
# All candidates are floats, i.e. fractions of the samples / features.
# The last max_features value must be 1.0: an integer 1 would mean
# "draw exactly one feature", not "use 100% of the features".
pgrid = {"max_samples": [0.1, 0.2, 0.4, 0.6, 0.8],
        "max_features": [0.2, 0.4, 0.6, 0.8, 1.0]}
grid_search = GridSearchCV(BaggingClassifier(tree.DecisionTreeClassifier()), param_grid=pgrid, cv=5)
grid_search.fit(X_train, y_train)
# Last expression of the cell: shows the selected parameter combination.
grid_search.best_params_

In [None]:
# Bagging of 200 decision trees, using the sampling fractions selected by the
# preceding grid search.
bagging = BaggingClassifier(
    tree.DecisionTreeClassifier(),
    max_samples=grid_search.best_estimator_.max_samples,
    max_features=grid_search.best_estimator_.max_features,
    n_estimators=200,
    random_state=0,  # fixed seed so the reported score is reproducible
)
bagging.fit(X_train, y_train)

In [None]:
# Random forest of 200 trees.
# NOTE(review): the variable name `random` would shadow the standard-library
# module of the same name if it were imported; it is kept because another
# cell reads it.
random = RandomForestClassifier(
    n_estimators=200,
    random_state=0,  # fixed seed so the reported score is reproducible
)
random.fit(X_train, y_train)

In [None]:
# Extremely randomized trees (Extra Trees) with 200 estimators.
extra = ExtraTreesClassifier(
    n_estimators=200,
    random_state=0,  # fixed seed so the reported score is reproducible
)
extra.fit(X_train, y_train)

In [None]:
# AdaBoost over depth-5 decision trees (200 boosting rounds).
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases, while the
# first positional slot is the same in both.
boost = AdaBoostClassifier(
    tree.DecisionTreeClassifier(max_depth=5),
    n_estimators=200,
    learning_rate=2,
    random_state=0,  # fixed seed so the reported score is reproducible
)
boost.fit(X_train, y_train)

In [None]:
# Test-set accuracy of the four ensemble models, one line per model.
print(f"Bagging (200 arbres) : {bagging.score(X_test, y_test)}")
print(f"Forêt aléatoire (200 arbres) : {random.score(X_test, y_test)}")
print(f"Extra Trees (200 arbres) : {extra.score(X_test, y_test)}")
print(f"AdaBoost (200 arbres) : {boost.score(X_test, y_test)}")

## SVM

In [None]:
from sklearn import svm, datasets

# Grid-search the regularisation strength C of the linear SVM (5-fold CV).
pgrid = {"C": [1, 8, 10, 15]}
grid_search = GridSearchCV(
    estimator=svm.LinearSVC(),
    param_grid=pgrid,
    cv=5,
)
grid_search.fit(X_train, y_train)
print(f"Meilleur C: {grid_search.best_estimator_.C}")

In [None]:
# Linear SVM refitted with the C value selected by the preceding grid search.
best_c = grid_search.best_estimator_.C
lin_svc = svm.LinearSVC(C=best_c)
lin_svc.fit(X_train, y_train)
# Last expression of the cell: accuracy on the held-out test set.
lin_svc.score(X_test, y_test)

In [None]:
# Grid-search C and gamma for the kernel SVM (5-fold CV).
pgrid = {
    "C": [1, 8, 10, 15],
    "gamma": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
}
grid_search = GridSearchCV(
    estimator=svm.SVC(),
    param_grid=pgrid,
    cv=5,
)
grid_search.fit(X_train, y_train)
print(f"Meilleur C: {grid_search.best_estimator_.C}")
print(f"Meilleur gamma: {grid_search.best_estimator_.gamma}")

In [None]:
# RBF-kernel SVM refitted with the C and gamma selected by the preceding
# grid search.
best_svc = grid_search.best_estimator_
clf = svm.SVC(
    kernel='rbf',
    C=best_svc.C,
    gamma=best_svc.gamma,
)
clf.fit(X_train, y_train)
# Last expression of the cell: accuracy on the held-out test set.
clf.score(X_test, y_test)

## Partie réseau de neurones

In [None]:
#Partie Réseau de neurones profond
import tensorflow
from tensorflow import keras
from keras import layers, Sequential
from keras.layers import Dense

In [None]:
# Turn y into a two-column one-hot target (one column per class) to match the
# 2-unit softmax output layer, then redo the same 80/20 split.
y = pd.get_dummies(data=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Cast everything to float32: depending on the pandas version, get_dummies
# yields boolean or small-integer columns, while the network works on floats.
X_train, X_test = X_train.astype("float32"), X_test.astype("float32")
y_train, y_test = y_train.astype("float32"), y_test.astype("float32")

# Initialise the neural network.
analyse_champi = Sequential()

# Layers: one small hidden layer, then a 2-unit softmax output.
# The input width is read from the data instead of being hard-coded, so the
# model still builds if the number of one-hot columns changes.
analyse_champi.add(Dense(3, activation='relu', input_dim=X_train.shape[1], name='couche_1'))
analyse_champi.add(Dense(2, activation='softmax', name='couche_final'))

# Compilation.
# NOTE(review): 'categorical_crossentropy' is the conventional loss for a
# softmax output over one-hot targets; the original loss is kept unchanged.
analyse_champi.compile(loss='binary_crossentropy', optimizer=keras.optimizers.SGD(0.5), metrics=['accuracy'])

# Training (change `verbose` to show or hide the per-epoch log).
# The test split is also used as validation data here.
historique = analyse_champi.fit(X_train, y_train, batch_size=80, epochs=25, validation_data=(X_test, y_test), verbose=1)
print('-------------------------------')

# Plot the training history and print the final evaluation on the test split.
pd.DataFrame(historique.history).plot()
scores = analyse_champi.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (analyse_champi.metrics_names[0], scores[0]*100))
print("%s: %.2f%%" % (analyse_champi.metrics_names[1], scores[1]*100))