# Projet : Conversion Rate Challenge 🏆
# Approche 2 : Traitement séparé des pays puis consolidation des prédictions
# Modèles : Logistic Regression, SVM, Decision Tree Classifier, Random Forest Classifier

## Import des modules

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn import svm

## Chargement des données

In [None]:
data = pd.read_csv('./data/conversion_data_train.csv')
print('Set with labels (our train) :', data.shape)

Set with labels (our train) : (284580, 6)


# Suppression outlers

In [35]:
# suppression lignes avec age >= XX
data = data[data["age"] < 100]

## Entrainement des Modèles : Logistic Regression, SVM, Decision Tree Classifier, Random Forest Classifier

### Fonction de recherche du seuil de probabilité optimun pour un meilleur f1_score

In [36]:
def best_prob_f1score(y_prob, y_real, seuil_step) :
    best_seuil = 0.5
    best_y_pred = np.array([0 if p[0] > best_seuil else 1 for p in y_prob])
    best_f1_score = f1_score(y_real, best_y_pred)

    
    for seuil in np.arange(0, 1, seuil_step) :
        y_pred = np.array([0 if p[0] > seuil else 1 for p in y_prob])
        seuil_f1_score = f1_score(y_real, y_pred)
        if seuil_f1_score > best_f1_score :
            best_seuil = seuil
            best_f1_score = seuil_f1_score
            best_y_pred = y_pred

    return (best_seuil, best_f1_score, best_y_pred)

### Traitement des entrainements et prédictions séparement pour chcun des pays
### Objectif : mieux tenir compte des disparités des désiquilibes du dataset par pays
### Normalement, Les modèles devraient tenir compte nativement du facteur "pays" mais celà semble être moins efficace

In [37]:
# Les modèles testés
models = (("V01B", "Logistic Regression"),
          ("V02B", "SVM"),
          ("V03B", "Decision Tree Classifier"),
          ("V04B", "Random Forest Classifier")
        )

# initialisation dataframe pour tracer le résultat des différents modèles
df_result = pd.DataFrame(columns = ["Version", "Modèle", "f1Score_train", "f1Score_test"])
df_result_country = pd.DataFrame(columns = ["Version", "Modèle", "country", "f1Score_train", "f1Score_test"])

# Initialisation de listes pour consolider les resultats des prédictions faites séparéments sur les différents pays 
Y_test_total = []
Y_test_pred_total = []
Y_test_prob_total = []
Y_train_total = []
Y_train_pred_total = []
Y_train_prob_total = []


for model in models :
    print("\n*** Modèle : " + model[0] + " : " + model[1] + "\n")
    
    for country in [["Germany"], ["US"], ["UK"], ["China"]] :

        num_features = ["age", "total_pages_visited"]
        cat_features = ["source", "new_user"]
        features_list = num_features + cat_features
        target_variable = 'converted'

        dataset = data[(data["country"].apply(lambda x : True if x in country else False))]
        X = dataset.loc[:, features_list]
        Y = dataset.loc[:, target_variable]

        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, stratify=Y ,random_state=27)

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler()),])
        categorical_transformer = Pipeline(steps=[("encoder", OneHotEncoder(drop="first")),])
        preprocessor = ColumnTransformer(transformers=[
                ("num", numeric_transformer, num_features),
                ("cat", categorical_transformer, cat_features),
            ]
        )

        X_train = preprocessor.fit_transform(X_train)
        X_test = preprocessor.transform(X_test)

        # Logistic Regression
        if model[0] ==  "V01B" :
            classifier = LogisticRegression(penalty='l1', solver='saga', max_iter=1000)
            classifier.fit(X_train, Y_train)
      
        # SVM
        if model[0] ==  "V02B" :
            classifier = svm.SVC(kernel="poly", probability=True, C=10, random_state=42)
            classifier.fit(X_train, Y_train)

        # Decision Tree Classifier
        if model[0] ==  "V03B" :
            # model = DecisionTreeClassifier()
            params = {
                'max_depth':[12, 14, 16],
                'min_samples_split':[14,18,22],
                'min_samples_leaf':[4,6,8]
            }
            gridsearch = GridSearchCV(DecisionTreeClassifier(), param_grid = params, scoring='f1', cv=5, verbose=1)
            gridsearch.fit(X_train, Y_train)
            classifier = gridsearch.best_estimator_


        # Random Forest Classifier
        if model[0] ==  "V04B" :
            # model = RandomForestClassifier()
            params={
               'max_depth':[9],
               'min_samples_split':[9],
               'n_estimators':[40]
            }
            gridsearch = GridSearchCV(RandomForestClassifier(), param_grid = params, scoring='f1', cv=5, verbose=1)
            gridsearch.fit(X_train, Y_train)
            classifier = gridsearch.best_estimator_


        # Predictions on training set
        Y_train_pred = classifier.predict(X_train)
        Y_train_prob = classifier.predict_proba(X_train)
        
        # Predictions on test set
        Y_test_pred = classifier.predict(X_test)
        Y_test_prob = classifier.predict_proba(X_test)
        
        (best_seuil, best_f1_score_train, best_y_pred) = best_prob_f1score(Y_train_prob, Y_train, .001)

        Y_test_pred = [0 if p[0] > best_seuil else 1 for p in Y_test_prob]
        f1_score_test = f1_score(np.array(Y_test), np.array(Y_test_pred))
        print(f"{country}\t\tseuil : {best_seuil}\tf1_score_train : {best_f1_score_train}\tf1_score_test : {f1_score_test}", X_train.shape)

        df_result_country.loc[len(df_result_country)] = [model[0], model[1], country, best_f1_score_train, f1_score_test]

        Y_test_total += list(Y_test)
        Y_test_pred_total += [0 if p[0] > best_seuil else 1 for p in Y_test_prob]
        # Y_test_prob_total += [p for p in Y_test_prob]
        Y_test_prob_total += list(Y_test_prob)
        Y_train_total += list(Y_train)
        Y_train_pred_total += [0 if p[0] > best_seuil else 1 for p in Y_train_prob]
        # Y_train_prob_total += [p for p in Y_train_prob]
        Y_train_prob_total += list(Y_train_prob)

    print()
    print("accuracy - train", accuracy_score(np.array(Y_train_total), np.array(Y_train_pred_total)))
    print("f1 score - train", f1_score(np.array(Y_train_total), np.array(Y_train_pred_total)))
    print()
    print("accuracy - test", accuracy_score(np.array(Y_test_total), np.array(Y_test_pred_total)))
    print("f1 score - test", f1_score(np.array(Y_test_total), np.array(Y_test_pred_total)))

    print()
    print(confusion_matrix(np.array(Y_train_total), np.array(Y_train_pred_total)))
    print(confusion_matrix(np.array(Y_test_total), np.array(Y_test_pred_total)))

    df_result.loc[len(df_result)] = [model[0], model[1], f1_score(np.array(Y_train_total), np.array(Y_train_pred_total)), f1_score(np.array(Y_test_total), np.array(Y_test_pred_total))]



*** Modèle : V01B : Logistic Regression

['Germany']		seuil : 0.617	f1_score_train : 0.8088531187122736	f1_score_test : 0.8158508158508159 (8184, 5)
['US']		seuil : 0.549	f1_score_train : 0.7666116122474907	f1_score_test : 0.7613670133729569 (112086, 5)
['UK']		seuil : 0.657	f1_score_train : 0.7866500311915159	f1_score_test : 0.7753934191702432 (30548, 5)
['China']		seuil : 0.833	f1_score_train : 0.42016806722689076	f1_score_test : 0.4262295081967213 (48385, 5)

accuracy - train 0.9860443868817237
f1 score - train 0.7719442165709598

accuracy - test 0.9855812591508053
f1 score - test 0.765657719398439

[[191718   1061]
 [  1719   4705]]
[[82133   488]
 [  743  2011]]

*** Modèle : V02B : SVM

['Germany']		seuil : 0.922	f1_score_train : 0.810379241516966	f1_score_test : 0.8211009174311926 (8184, 5)
['US']		seuil : 0.891	f1_score_train : 0.7661829652996845	f1_score_test : 0.7641176470588236 (112086, 5)
['UK']		seuil : 0.924	f1_score_train : 0.7876301672451878	f1_score_test : 0.774474256

In [38]:
display(df_result)
display(df_result_country)

Unnamed: 0,Version,Modèle,f1Score_train,f1Score_test
0,V01B,Logistic Regression,0.771944,0.765658
1,V02B,SVM,0.771915,0.76661
2,V03B,Decision Tree Classifier,0.776831,0.76096
3,V04B,Random Forest Classifier,0.780732,0.760085


Unnamed: 0,Version,Modèle,country,f1Score_train,f1Score_test
0,V01B,Logistic Regression,[Germany],0.808853,0.815851
1,V01B,Logistic Regression,[US],0.766612,0.761367
2,V01B,Logistic Regression,[UK],0.78665,0.775393
3,V01B,Logistic Regression,[China],0.420168,0.42623
4,V02B,SVM,[Germany],0.810379,0.821101
5,V02B,SVM,[US],0.766183,0.764118
6,V02B,SVM,[UK],0.78763,0.774474
7,V02B,SVM,[China],0.144928,0.210526
8,V03B,Decision Tree Classifier,[Germany],0.825996,0.783019
9,V03B,Decision Tree Classifier,[US],0.77786,0.752651
