# Projet : Conversion Rate Challenge 🏆
# Approche 1 : Traitement ensemble des pays
# Modèles : Logistic Regression, SVM, Decision Tree Classifier, Random Forest Classifier


## Import des modules

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn import svm

## Chargement des données

In [None]:
data = pd.read_csv('./data/conversion_data_train.csv')
print('Set with labels (our train) :', data.shape)

Set with labels (our train) : (284580, 6)


## Suppression outlers

In [13]:
# suppression lignes avec age >= XX
data = data[data["age"] < 100]

## Entrainement des Modèles : Logistic Regression, SVM, Decision Tree Classifier, Random Forest Classifier

In [14]:
def best_prob_f1score(y_prob, y_real, seuil_step) :
    best_seuil = 0.5
    best_y_pred = np.array([0 if p[0] > best_seuil else 1 for p in y_prob])
    best_f1_score = f1_score(y_real, best_y_pred)

    
    for seuil in np.arange(0, 1, seuil_step) :
        y_pred = np.array([0 if p[0] > seuil else 1 for p in y_prob])
        seuil_f1_score = f1_score(y_real, y_pred)
        if seuil_f1_score > best_f1_score :
            best_seuil = seuil
            best_f1_score = seuil_f1_score
            best_y_pred = y_pred

    return (best_seuil, best_f1_score, best_y_pred)

In [19]:
# Les modèles testés
models = (("V01A", "Logistic Regression"),
          ("V02A", "SVM"),
          ("V03A", "Decision Tree Classifier"),
          ("V04A", "Random Forest Classifier")
        )

# initialisation dataframe pour tracer le résultat des différents modèles
df_result = pd.DataFrame(columns = ["Version", "Modèle", "f1Score_train", "f1Score_test"])


for model in models :
    print("\n*** Modèle : " + model[0] + " : " + model[1] + "\n")

    num_features = ["age", "total_pages_visited"]
    cat_features = ["source", "new_user", "country"]
    features_list = num_features + cat_features
    target_variable = 'converted'

    dataset = data
    X = dataset.loc[:, features_list]
    Y = dataset.loc[:, target_variable]

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, stratify=Y ,random_state=27)

    numeric_transformer = Pipeline(steps=[("scaler", StandardScaler()),])
    categorical_transformer = Pipeline(steps=[("encoder", OneHotEncoder(drop="first")),])
    preprocessor = ColumnTransformer(transformers=[
            ("num", numeric_transformer, num_features),
            ("cat", categorical_transformer, cat_features),
        ]
    )

    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    # Logistic Regression
    if model[0] ==  "V01A" :
        classifier = LogisticRegression(penalty='l1', solver='saga', max_iter=1000)
        classifier.fit(X_train, Y_train)

    # SVM
    if model[0] ==  "V02A" :
        classifier = svm.SVC(kernel="poly", probability=True, C=10, random_state=42)
        classifier.fit(X_train, Y_train)

    # Decision Tree Classifier
    if model[0] ==  "V03A" :
        # model = DecisionTreeClassifier()
        params = {
            'max_depth':[12, 14, 16],
            'min_samples_split':[14,18,22],
            'min_samples_leaf':[4,6,8]
        }
        gridsearch = GridSearchCV(DecisionTreeClassifier(), param_grid = params, scoring='f1', cv=5, verbose=1)
        gridsearch.fit(X_train, Y_train)
        classifier = gridsearch.best_estimator_


    # Random Forest Classifier
    if model[0] ==  "V04A" :
        # model = RandomForestClassifier()
        params={
            'max_depth':[9],
            'min_samples_split':[9],
            'n_estimators':[40]
        }
        gridsearch = GridSearchCV(RandomForestClassifier(), param_grid = params, scoring='f1', cv=5, verbose=1)
        gridsearch.fit(X_train, Y_train)
        classifier = gridsearch.best_estimator_


    # Predictions on training set
    Y_train_pred = classifier.predict(X_train)
    Y_train_prob = classifier.predict_proba(X_train)

    # Predictions on test set
    Y_test_pred = classifier.predict(X_test)
    Y_test_prob = classifier.predict_proba(X_test)

    (best_seuil, best_f1_score_train, best_y_pred) = best_prob_f1score(Y_train_prob, Y_train, .001)

    Y_test_pred = [0 if p[0] > best_seuil else 1 for p in Y_test_prob]
    f1_score_test = f1_score(np.array(Y_test), np.array(Y_test_pred))

    print()
    print("accuracy - train", accuracy_score(np.array(Y_train), np.array(best_y_pred)))
    print("f1 score - train", best_f1_score_train)
    print()
    print("accuracy - test", accuracy_score(np.array(Y_test), np.array(Y_test_pred)))
    print("f1 score - test", f1_score(np.array(Y_test), np.array(Y_test_pred)))

    print()
    print(confusion_matrix(np.array(Y_train), np.array(Y_train_pred)))
    print(confusion_matrix(np.array(Y_test), np.array(Y_test_pred)))

    df_result.loc[len(df_result)] = [model[0], model[1], best_f1_score_train, f1_score_test]



*** Modèle : V01A : Logistic Regression


accuracy - train 0.9861247766109115
f1 score - train 0.7750284877095881

accuracy - test 0.9853585400707475
f1 score - test 0.7602608362102034

[[192041    738]
 [  1969   4456]]
[[82142   479]
 [  771  1982]]

*** Modèle : V02A : SVM


accuracy - train 0.9859139374711351
f1 score - train 0.7738191197807512

accuracy - test 0.9850305713683323
f1 score - test 0.7571265678449259

[[192286    493]
 [  2298   4127]]
[[82104   517]
 [  761  1992]]

*** Modèle : V03A : Decision Tree Classifier

Fitting 5 folds for each of 27 candidates, totalling 135 fits

accuracy - train 0.9868175337844621
f1 score - train 0.7903225806451613

accuracy - test 0.9841286574366903
f1 score - test 0.7440015114301908

[[192140    639]
 [  1897   4528]]
[[82050   571]
 [  784  1969]]

*** Modèle : V04A : Random Forest Classifier

Fitting 5 folds for each of 1 candidates, totalling 5 fits

accuracy - train 0.9868878134977209
f1 score - train 0.7842748595969606

accuracy -

In [20]:
df_result

Unnamed: 0,Version,Modèle,f1Score_train,f1Score_test
0,V01A,Logistic Regression,0.775028,0.760261
1,V02A,SVM,0.773819,0.757127
2,V03A,Decision Tree Classifier,0.790323,0.744002
3,V04A,Random Forest Classifier,0.784275,0.756231
