# 4 Méthodes de classification


## 4-1) Première exploration

In [24]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.svm import SVC

In [1]:
import pandas as pd

# Read the Kickstarter export (comma-separated, Latin-1 encoded) and preview
# the first rows to check that the columns were parsed as expected.
donnees_kickstarter = pd.read_csv(
    'donnees_kickstarter/ks-projects.csv',
    delimiter=',',
    encoding='latin1',
)
donnees_kickstarter.head()

Unnamed: 0,id,name,category,subcategory,country,sex,age,start_date,end_date,currency,goal,pledged,backers,state
0,149450711,Abilities Rock,Art,Art,US,male,26,2015-11-17 19:37:50,2015-12-17 19:37:50,USD,10000.0,0.0,0,failed
1,356418410,De'VIA Elders Reunion,Art,Art,US,female,26,2016-03-21 22:18:52,2016-04-23 04:00:00,USD,7500.0,958.0,15,failed
2,412253775,Help C.B. Farr make a documentary about becomi...,Art,Art,US,female,28,2013-06-05 08:30:31,2013-07-05 08:30:31,USD,20000.0,0.0,0,failed
3,477821027,The London Aesthetifest,Art,Art,US,female,27,2014-06-24 19:49:35,2014-07-15 04:05:00,USD,7800.0,1000.0,15,failed
4,350951514,Hobo Nickels across the nation,Art,Art,US,male,22,2015-10-02 16:00:42,2015-11-01 16:00:42,USD,2621.0,392.0,6,failed


In [2]:
# Drop the columns that are not used in the rest of this section.
# The result is assigned back to the same name, so without errors='ignore'
# a second execution of this cell would raise KeyError (the columns are
# already gone). Ignoring missing labels keeps the cell safe to re-run.
donnees_kickstarter = donnees_kickstarter.drop(
    columns=['id', 'pledged', 'backers', 'name', 'start_date', 'end_date'],
    errors='ignore',
)
donnees_kickstarter.head()

Unnamed: 0,category,subcategory,country,sex,age,currency,goal,state
0,Art,Art,US,male,26,USD,10000.0,failed
1,Art,Art,US,female,26,USD,7500.0,failed
2,Art,Art,US,female,28,USD,20000.0,failed
3,Art,Art,US,female,27,USD,7800.0,failed
4,Art,Art,US,male,22,USD,2621.0,failed


### Découpage

In [3]:
# Separate the target column ('state') from the remaining columns.
y = donnees_kickstarter['state']
X = donnees_kickstarter.drop(columns=['state'])

In [4]:
# Distinct target labels; the output lists six of them, so the raw target is multi-class.
y.unique()

array(['failed', 'canceled', 'successful', 'live', 'suspended',
       'undefined'], dtype=object)

In [8]:
# Distinct values of 'sex'; the output includes NaN, so some rows have no value here.
X['sex'].unique()

array(['male', 'female', nan], dtype=object)

In [15]:
# Distinct values of 'currency'; the output shows ten codes and no NaN.
X['currency'].unique()

array(['USD', 'CAD', 'GBP', 'AUD', 'EUR', 'DKK', 'SEK', 'NOK', 'NZD',
       'CHF'], dtype=object)

In [5]:
# Distinct values of 'country'; the output includes NaN, so some rows have no value here.
X['country'].unique()

array(['US', 'CA', 'GB', 'AU', 'DE', 'NL', 'ES', 'DK', 'IE', 'IT', 'SE',
       'FR', 'NO', 'NZ', 'AT', 'CH', 'BE', nan, 'LU'], dtype=object)

In [6]:
# Distinct values of 'category'; the output shows fifteen labels and no NaN.
X['category'].unique()

array(['Art', 'Comics', 'Dance', 'Design', 'Fashion', 'Food',
       'Film & Video', 'Games', 'Journalism', 'Music', 'Photography',
       'Technology', 'Theater', 'Publishing', 'Crafts'], dtype=object)

In [14]:
# Sorted distinct ages, to see the range of values present in the column.
age = X['age'].drop_duplicates().sort_values().to_numpy()
age

array([18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70], dtype=int64)

In [7]:
# Distinct values of 'subcategory'; the output shows that several of them
# repeat top-level category names (e.g. 'Art', 'Comics') alongside finer labels.
X['subcategory'].unique()

array(['Art', 'Comics', 'Dance', 'Design', 'Fashion', 'Food',
       'Film & Video', 'Games', 'Journalism', 'Music', 'Photography',
       'Technology', 'Theater', 'Publishing', 'Conceptual Art',
       'Digital Art', 'Illustration', 'Painting', 'Performance Art',
       'Sculpture', 'Crafts', 'Graphic Design', 'Product Design',
       'Animation', 'Documentary', 'Narrative Film', 'Shorts',
       'Webseries', 'Tabletop Games', 'Video Games', 'Classical Music',
       'Country & Folk', 'Electronic Music', 'Hip-Hop', 'Indie Rock',
       'Jazz', 'Pop', 'Rock', 'World Music', 'Art Books',
       "Children's Books", 'Fiction', 'Nonfiction', 'Periodicals',
       'Poetry', 'Software', 'Hardware', 'Public Art', 'Mixed Media',
       'Radio & Podcasts', 'Metal', 'Anthologies', 'Comic Books',
       'Events', 'Graphic Novels', 'Webcomics', 'Performances',
       'Residencies', 'Spaces', 'Workshops', 'Architecture',
       'Civic Design', 'Interactive Design', 'Typography', 'Accessories',
    

## 4-2) Pipeline

In [25]:
def preprocess_data(data):
    """Turn the Kickstarter frame into a model-ready feature matrix and target.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding at least the feature columns 'age', 'goal', 'category',
        'subcategory', 'country', 'sex', 'currency' and the target column
        'state'. Every other column is ignored, so both the raw CSV and a
        frame whose extra columns were already dropped are accepted.

    Returns
    -------
    X_preprocessed
        Output of the fitted ColumnTransformer: the scaled numeric features
        followed by the one-hot encoded categorical features. With this many
        categories it is presumably a SciPy sparse matrix (depends on the
        scikit-learn defaults in use; verify before feeding dense-only models).
    y : pandas.Series
        The 'state' column, unchanged.

    Notes
    -----
    The transformers are fitted on every row of ``data``. Any train /
    validation / test split performed afterwards therefore uses statistics
    (medians, means, category lists) computed on all rows.
    """
    # Numeric and categorical feature lists
    numeric_features = ['age', 'goal']
    categorical_features = ['category', 'subcategory', 'country', 'sex', 'currency']

    # Select the features by name instead of dropping a fixed list of other
    # columns: the ColumnTransformer only reads these columns anyway, and this
    # no longer raises KeyError when the extra columns are already gone.
    X = data[numeric_features + categorical_features]
    y = data['state']

    # Numeric features: fill gaps with the median, then standardise. Without
    # the imputer, NaN would pass through the scaler and reach the classifiers.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Categorical features: 'sex' and 'country' contain NaN in this dataset,
    # so missing entries are mapped to an explicit 'missing' category before
    # one-hot encoding.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Fit and apply the preprocessing in one pass
    X_preprocessed = preprocessor.fit_transform(X)

    return X_preprocessed, y

In [26]:
def _to_dense(X):
    """Return X as a dense array, converting it when it exposes ``toarray()``."""
    return X.toarray() if hasattr(X, 'toarray') else X


def _grid_search(label, estimator, param_grid, X_train, y_train):
    """Run a 5-fold grid search for one model and return its best refitted estimator."""
    print(f"Recherche des hyperparamètres ({label})...")
    search = GridSearchCV(estimator, param_grid, cv=5)
    search.fit(X_train, y_train)
    print("Recherche terminée.")
    return search.best_estimator_


def train_models(X_train, y_train):
    """Fit the four classifiers compared in this notebook.

    SVM, k-NN and Random Forest are tuned with a 5-fold grid search; Gaussian
    Naive Bayes has no grid and is fitted directly.

    Parameters
    ----------
    X_train
        Preprocessed training features (dense array or SciPy sparse matrix).
    y_train
        Training labels aligned with ``X_train``.

    Returns
    -------
    tuple
        ``(svm, knn, naive_bayes, random_forest)``: four fitted pipelines, in
        that order. Each one exposes ``predict``.

    Notes
    -----
    NOTE(review): the SVM grid fits 6 parameter combinations x 5 folds; on a
    large training set this search can take very long. Consider sub-sampling
    before calling this function if it does not complete.
    """
    # One single-step pipeline per model, so that every grid can address its
    # hyper-parameters through the common 'classifier__' prefix.
    svm_classifier = Pipeline(steps=[
        ('classifier', SVC())
    ])

    knn_classifier = Pipeline(steps=[
        ('classifier', KNeighborsClassifier())
    ])

    # GaussianNB rejects sparse matrices, whereas the one-hot encoded features
    # are typically sparse. Densifying inside the pipeline fixes fit() here and
    # predict() later, without touching what the other models receive.
    nb_classifier = Pipeline(steps=[
        ('to_dense', FunctionTransformer(_to_dense)),
        ('classifier', GaussianNB())
    ])

    rf_classifier = Pipeline(steps=[
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    # Hyper-parameter grids
    svm_param_grid = {'classifier__C': [0.1, 1, 10],
                      'classifier__kernel': ['linear', 'rbf']}

    knn_param_grid = {'classifier__n_neighbors': [3, 5, 7]}

    rf_param_grid = {'classifier__n_estimators': [50, 100, 150],
                     'classifier__max_depth': [None, 10, 20]}

    # Same order as before: the three searches first, then Naive Bayes.
    best_svm = _grid_search("SVM", svm_classifier, svm_param_grid, X_train, y_train)
    best_knn = _grid_search("k-NN", knn_classifier, knn_param_grid, X_train, y_train)
    best_rf = _grid_search("Random Forest", rf_classifier, rf_param_grid, X_train, y_train)
    fitted_nb = nb_classifier.fit(X_train, y_train)

    return best_svm, best_knn, fitted_nb, best_rf

In [27]:
def evaluate_models(classifiers, X_valid, y_valid):
    """Print a classification report for each fitted model.

    Parameters
    ----------
    classifiers : dict
        Maps a display name to a fitted estimator exposing ``predict``.
    X_valid
        Features to predict on.
    y_valid
        True labels aligned with ``X_valid``.
    """
    for label in classifiers:
        predictions = classifiers[label].predict(X_valid)
        header = f"Rapport de classification ({label}):"
        print(header)
        print(classification_report(y_valid, predictions))

In [None]:
def main():
    """Run the whole comparison: load, preprocess, split, train, evaluate.

    The data is split 70 % / 15 % / 15 % into training, validation and test
    sets (30 % held out, then halved), both splits using ``random_state=42``.
    Reports are printed for the validation set first, then for the test set.
    """
    # Load the raw data
    raw = pd.read_csv('donnees_kickstarter/ks-projects.csv',delimiter=',',encoding='latin1')

    # Preprocess once, before splitting
    features, target = preprocess_data(raw)

    # Hold out 30 % of the rows, then cut that part in half: validation / test
    X_train, X_temp, y_train, y_temp = train_test_split(
        features, target, test_size=0.3, random_state=42
    )
    X_valid, X_test, y_valid, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42
    )

    # Train the models; train_models returns them in this fixed order
    model_names = ('SVM', 'k-NN', 'Naive Bayes', 'Random Forest')
    fitted_models = dict(zip(model_names, train_models(X_train, y_train)))

    # Reports on the validation data
    evaluate_models(fitted_models, X_valid, y_valid)

    # Reports on the test data
    print("Évaluation finale sur les données de test:")
    evaluate_models(fitted_models, X_test, y_test)


if __name__ == "__main__":
    main()

Recherche des hyperparamètres (SVM)...
