Deep Learning MLP – Predicting User Behavior on Websites
The goal of this lesson is to create a multilayer perceptron deep learning model with Keras to predict user behavior on an online shopping website.

The `Revenue` column is the target variable: it indicates whether the user made a purchase.

In [9]:
import pandas as pd
from sklearn.utils import shuffle

# Load the raw dataset; a comma is already read_csv's default delimiter.
df = pd.read_csv('ds/online_shoppers_intention.csv')
df.head(5)

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [10]:
# Count the missing values in each column (isna is the canonical name of isnull).
# Rows with missing values could simply be excluded.
df.isna().sum()

Administrative             0
Administrative_Duration    0
Informational              0
Informational_Duration     0
ProductRelated             0
ProductRelated_Duration    0
BounceRates                0
ExitRates                  0
PageValues                 0
SpecialDay                 0
Month                      0
OperatingSystems           0
Browser                    0
Region                     0
TrafficType                0
VisitorType                0
Weekend                    0
Revenue                    0
dtype: int64

In [11]:
# Sanity check: list the distinct values of the target to confirm they fit the expected domain.
set(df['Revenue'].tolist())

{False, True}

In [15]:
# The file may be stored in a meaningful order (the first rows shown above are all
# 'Feb' sessions), so shuffle the rows to remove any ordering bias.
# A fixed random_state keeps the shuffled order identical on every run; without it
# the later, seeded train/test split would still pick different rows each time.
df = shuffle(df, random_state=42)
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
10653,0,0.0,0,0.0,3,25.5,0.066667,0.133333,0.0,0.0,Nov,1,1,3,2,Returning_Visitor,True,False
222,0,0.0,0,0.0,4,14.0,0.1,0.15,0.0,0.0,Mar,3,2,1,1,Returning_Visitor,False,False
791,0,0.0,0,0.0,13,129.5,0.061538,0.092308,0.0,0.0,Mar,1,8,1,1,Returning_Visitor,False,False
5053,0,0.0,0,0.0,19,801.583333,0.010526,0.035088,0.0,1.0,May,1,1,1,2,Returning_Visitor,True,False
6539,1,23.2,0,0.0,0,0.0,0.0,0.033333,0.0,0.0,Oct,2,2,6,3,Returning_Visitor,False,True


In [16]:
# Distinct month labels present in the data.
set(df['Month'].tolist())

{'Aug', 'Dec', 'Feb', 'Jul', 'June', 'Mar', 'May', 'Nov', 'Oct', 'Sep'}

In [17]:
df.shape  # (number of rows, number of columns)

(12330, 18)

In [19]:
# Relative frequency of each target class (normalize=True returns proportions, not counts).
# Notice the target is imbalanced: one class is far more frequent than the other.
df['Revenue'].value_counts(normalize=True)

Revenue
False    0.845255
True     0.154745
Name: proportion, dtype: float64

In [20]:
# Month, VisitorType, Weekend, Revenue
from sklearn.preprocessing import LabelEncoder

# Replace each categorical / boolean column with integer ids.
# NOTE: LabelEncoder numbers the classes in sorted order, so e.g. the Month ids are
# alphabetical ('Aug' -> 0, 'Dec' -> 1, ...) and do not reflect calendar order.
le = LabelEncoder()
for column in ('Month', 'VisitorType', 'Weekend', 'Revenue'):
    # fit_transform re-fits the encoder on every column, exactly as four separate calls would
    df[column] = le.fit_transform(df[column])

df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
10653,0,0.0,0,0.0,3,25.5,0.066667,0.133333,0.0,0.0,7,1,1,3,2,2,1,0
222,0,0.0,0,0.0,4,14.0,0.1,0.15,0.0,0.0,5,3,2,1,1,2,0,0
791,0,0.0,0,0.0,13,129.5,0.061538,0.092308,0.0,0.0,5,1,8,1,1,2,0,0
5053,0,0.0,0,0.0,19,801.583333,0.010526,0.035088,0.0,1.0,6,1,1,1,2,2,1,0
6539,1,23.2,0,0.0,0,0.0,0.0,0.033333,0.0,0.0,8,2,2,6,3,2,0,1


In [21]:
# Hold-out split: 80% of the rows for training, 20% kept aside for testing.
from sklearn.model_selection import train_test_split

y = df['Revenue']               # target
X = df.drop(columns='Revenue')  # every remaining column is a feature

# stratify=y keeps the ratio of target classes the same in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

for subset in (X_train, X_test):
    print(subset.shape)

(9864, 17)
(2466, 17)


MinMaxScaler

$$
X' = \frac{X - X_{\min}}{X_{\max} - X_{\min}}
$$

Data Leakage (vazamento de dados): Occurs when information from the test set (or from the future, in time series problems) ends up being used, directly or indirectly, to train the model. This makes the model appear much better in testing than it will be in practice, because it has already seen hints of the answer during training.



In [22]:
# Normalize data
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from the training rows only, so that no statistic
# from the test set leaks into the preprocessing step.
scaler = MinMaxScaler().fit(X_train)

# Apply the same train-derived scaling to both subsets.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

* sigmoid retorna valores entre 0 e 1. Associado à binary_crossentropy, é a escolha clássica para classificação binária
* relu é usado nas camadas ocultas por ser simples e eficiente, e também atenua o problema do vanishing gradient, comum quando se usa sigmoid e tanh
* binary_crossentropy:
$$\text{Binary Crossentropy} = - \frac{1}{N} \sum_{i=1}^{N} \left[ y_i \log(\hat{y}_i) + (1 - y_i) \log(1 - \hat{y}_i) \right]$$
* $N$: número de amostras
* $y_i$ = rótulo real (0 ou 1)
* $\hat{y}_i$ = probabilidade prevista pelo modelo (entre 0 e 1)
* Intuição:
  * Se y = 1, o termo que importa é $-\log(\hat{y})$ → quanto mais próximo de 1 for $\hat{y}$, menor a perda.
  * Se y = 0, o termo que importa é $-\log(1 - \hat{y})$ → quanto mais próximo de 0 for $\hat{y}$, menor a perda.
  * Isso força o modelo a dar alta probabilidade para a classe correta.
  * O termo $-\log(p)$ (logaritmo natural) cresce sem limite quando $p$ se aproxima de 0 e diminui quando $p$ se aproxima de 1, de forma que $-\log(1) = 0$
* Exemplo:
  * Se o rótulo real é 1 e o modelo previu 0.9:
$$-\log(0.9) \approx 0.105$$
(erro baixo, pois o acerto foi bom)
  * Se o rótulo real é 1 e o modelo previu 0.1:
$$-\log(0.1) \approx 2.303$$
(erro alto, porque o modelo estava confiante, mas errado)


In [23]:
import tensorflow as tf
import keras
from keras import models
from keras import layers
from keras.optimizers import Adam

# Seed Python's `random`, NumPy and the backend framework in a single call.
# tf.random.set_seed alone only seeds TensorFlow, which is not enough to make
# weight initialisation and dropout repeatable across runs.
keras.utils.set_random_seed(9)

input_shape = X_train.shape[1]  # number of input features
output_shape = 1  # one sigmoid unit for the binary target

# MLP: three ReLU hidden layers (32 -> 28 -> 18) with dropout after the second one.
# The input size is declared with an explicit Input layer instead of passing
# `input_shape=` to the first Dense layer, which recent Keras versions warn about.
model = models.Sequential([
    layers.Input(shape=(input_shape,)),
    layers.Dense(32, activation='relu'),
    layers.Dense(28, activation='relu'),
    layers.Dropout(0.2),  # randomly zeroes 20% of the activations during training
    layers.Dense(18, activation='relu'),
    layers.Dense(output_shape, activation='sigmoid'),  # outputs a probability in [0, 1]
])

optimizer = Adam(learning_rate=0.001)
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy']
)

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-08-13 18:28:39.813262: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2025-08-13 18:28:39.813305: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-08-13 18:28:39.813311: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
I0000 00:00:1755120519.813345 7769486 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1755120519.813394 7769486 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [None]:
from tqdm.keras import TqdmCallback


# Train for a fixed number of epochs so the cost function can converge
# (the weights are updated through backpropagation).
# verbose=0 silences Keras' built-in per-epoch logging: progress is already
# reported by TqdmCallback, and leaving both on duplicates the output for all epochs.
# NOTE(review): the test split doubles as validation data here. That is fine for
# plotting learning curves, but it should not be used to tune epochs or hyper-parameters.
hist = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=200,
    shuffle=True,
    validation_data=(X_test, y_test),
    verbose=0,
    callbacks=[TqdmCallback(verbose=0)],
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Visualise the training history recorded by model.fit.

# Accuracy per epoch
acc = hist.history['accuracy']
val_acc = hist.history['val_accuracy']

# Loss per epoch
loss = hist.history['loss']
val_loss = hist.history['val_loss']

# Take the number of epochs from the history itself rather than hard-coding it,
# so the x-axis always matches the curves even if the `epochs` passed to fit changes.
epoch = len(loss)
epochs_range = range(epoch)

fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(20, 8))

# Left panel: accuracy
ax_acc.plot(epochs_range, acc, label='Acurácia de Treinamento')
ax_acc.plot(epochs_range, val_acc, label='Acurácia de Validação')
ax_acc.legend(loc='lower right')
ax_acc.set_title('Acurácia de treino e teste')

# Right panel: loss
ax_loss.plot(epochs_range, loss, label='Erro de treinamento')
ax_loss.plot(epochs_range, val_loss, label='Erro de Validação')
ax_loss.legend(loc='upper right')
ax_loss.set_title('Erro de treinamento vs validação')

plt.show()

In [None]:
from sklearn.metrics import classification_report

# Predicted probability of the positive class for every test row
# (one column per row, since the network ends in a single sigmoid unit).
y_pred = model.predict(X_test)

# Turn probabilities into 0/1 labels with an explicit 0.5 cut-off, vectorised.
# This yields the same labels as the earlier per-row round(), which also maps
# exactly 0.5 to class 0 because Python rounds halves to the nearest even integer.
y_pred_class = (y_pred.ravel() > 0.5).astype(int)
y_test_class = y_test

print(classification_report(y_test_class, y_pred_class, target_names=['False', 'True']))

# TODO: repeat training and testing