In [52]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [53]:
# Load the raw log extract into a DataFrame.
CSV_PATH = "logs_extract_test_technique_pubmedia.csv"
data = pd.read_csv(CSV_PATH)

In [54]:
# Add a boolean 'converted' column: True on rows whose 'Date' value is
# identical to their 'TIMESTAMP' value, False otherwise.
# NOTE(review): presumably 'Date' is the order time and 'TIMESTAMP' the
# interaction time — confirm against the data dictionary.
data['converted'] = data['Date'].eq(data['TIMESTAMP'])

In [55]:
# Preview the first five rows to check the new 'converted' column.
data.head(n=5)

Unnamed: 0,Visitor_ID,Order_ID,Session_ID,TIMESTAMP,channel,Date,device,interaction,converted
0,62b8a6f7a78a95099e96208e5a5fddbf,VGNDTH,2021081109450110503075681,2021-08-11 10:50:38,SEO,2021-08-11 10:50:38,web,CLICK,True
1,70919f0bc2ffbfb042f9e744865967c8,Q9MDCD,2021081221100511925403162,2021-08-12 21:10:05,SEO,2021-08-12 21:10:05,mobile,CLICK,True
2,b0b049969e05a45cd73be135cf28983d,8ZWECQ,202107240904151573704685,2021-07-24 09:04:15,SEO,2021-07-24 09:04:15,desktop,CLICK,True
3,a87e36dbfc7b606b70103ca2a631bf4d,V45QTS,202107201329103861926169,2021-07-20 13:38:43,SEO,2021-07-20 14:17:31,desktop,CLICK,False
4,49eb15be3913a6cc5e7e98ad7c8e1fef,6T7G4U,202107090923521930686502,2021-07-09 09:23:52,SEO,2021-08-01 16:25:38,desktop,CLICK,False


In [56]:
# One-hot encode the 'channel' field: one indicator column per distinct
# channel value, named 'channel_<value>'.
encoder = OneHotEncoder()
channel_encoded = encoder.fit_transform(data[['channel']])
channel_df = pd.DataFrame(
    channel_encoded.toarray(),  # the encoder's output is densified before building the frame
    columns=encoder.get_feature_names_out(['channel']),
    # Reuse data's index so a later column-wise concat lines rows up by
    # label even if `data` no longer has a default 0..n-1 index.
    index=data.index,
)

In [57]:
# Concatenate the encoded channel data with the original data.
# Encoded columns that are already present (i.e. this cell was run before)
# are dropped first, so re-running the cell does not duplicate them.
data = pd.concat(
    [data.drop(columns=channel_df.columns, errors='ignore'), channel_df],
    axis=1,
)

In [58]:
# Preview the first five rows, now including the one-hot channel columns.
data.head(n=5)

Unnamed: 0,Visitor_ID,Order_ID,Session_ID,TIMESTAMP,channel,Date,device,interaction,converted,channel_AD_EXCHANGE_appnexus,...,channel_DISPLAY,channel_REFERRAL,channel_SEA_Bing_HM,channel_SEA_Bing_M,channel_SEA_Google_HM,channel_SEA_Google_M,channel_SEO,channel_SOCIAL,channel_email_auto,channel_email_tactique
0,62b8a6f7a78a95099e96208e5a5fddbf,VGNDTH,2021081109450110503075681,2021-08-11 10:50:38,SEO,2021-08-11 10:50:38,web,CLICK,True,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,70919f0bc2ffbfb042f9e744865967c8,Q9MDCD,2021081221100511925403162,2021-08-12 21:10:05,SEO,2021-08-12 21:10:05,mobile,CLICK,True,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,b0b049969e05a45cd73be135cf28983d,8ZWECQ,202107240904151573704685,2021-07-24 09:04:15,SEO,2021-07-24 09:04:15,desktop,CLICK,True,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,a87e36dbfc7b606b70103ca2a631bf4d,V45QTS,202107201329103861926169,2021-07-20 13:38:43,SEO,2021-07-20 14:17:31,desktop,CLICK,False,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,49eb15be3913a6cc5e7e98ad7c8e1fef,6T7G4U,202107090923521930686502,2021-07-09 09:23:52,SEO,2021-08-01 16:25:38,desktop,CLICK,False,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [59]:
# Split the data into training (80%) and test (20%) sets.
# Features are selected explicitly as the one-hot channel columns rather than
# by dropping a hard-coded list: any column missing from such a list (e.g. the
# 'Unnamed: 0' row counter shown by data.head()) would otherwise leak into the
# model as a feature.
X_train, X_test, y_train, y_test = train_test_split(
    data[channel_df.columns],
    data['converted'],
    test_size=0.2,
    random_state=42,  # fixed seed so the split is the same on every run
)

In [60]:
# Train a random forest classifier on the training data.
# random_state is fixed so the fitted forest, and every score derived from
# it, is reproducible across Restart & Run All.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

In [61]:
# Predict the class probabilities for every row of the test set with the
# trained model (one column of `probs` per class).
probs = clf.predict_proba(X=X_test)

In [62]:
# Display the hard class predictions for the test set.
clf.predict(X=X_test)

array([0., 0., 0., ..., 0., 0., 0.])

In [63]:
# Assign a contribution score to each channel.
# Guard first: if the model only saw one class, `probs` has a single column
# and indexing column 1 would fail with an opaque IndexError.
if probs.shape[1] < 2:
    raise ValueError("The model is not returning the probability of conversion for each class.")
# Column 1 is taken as the probability of conversion
# (assumes clf.classes_ is ordered [False, True] — TODO confirm).
# Because the channel columns are one-hot, the dot product gives, for each
# channel, the sum of predicted conversion probabilities over the test rows
# that belong to that channel.
contributions = probs[:, 1].dot(X_test[channel_df.columns])


In [64]:
# Sanity check: fail loudly if the model returned probabilities for only
# one class.
# NOTE(review): the previous cell already indexes probs[:, 1], so with a
# single-class model that cell fails before this check is reached.
n_prob_columns = probs.shape[1]
if n_prob_columns == 1:
    raise ValueError("The model is not returning the probability of conversion for each class.")

In [65]:
# Print the contribution score of each channel, pairing every encoded
# channel column name with its score by position.
for channel, score in zip(channel_df.columns, contributions):
    print(f"{channel} contribution:", score.sum())


channel_AD_EXCHANGE_appnexus contribution: 170.49641851446958
channel_AFFILIATION contribution: 55.752636447629946
channel_DIRECT_ACCESS contribution: 933.668861178262
channel_DISPLAY contribution: 1.0635246428315632
channel_REFERRAL contribution: 177.73601530681515
channel_SEA_Bing_HM contribution: 0.0
channel_SEA_Bing_M contribution: 21.090915240215654
channel_SEA_Google_HM contribution: 12.86723849222682
channel_SEA_Google_M contribution: 263.2783812212656
channel_SEO contribution: 1077.7567865486055
channel_SOCIAL contribution: 2.758235358624486
channel_email_auto contribution: 88.55735184085573
channel_email_tactique contribution: 11.853143159314513
