<hr>

# ðŸ”§ DATA PREPROCESSING ðŸ”§

<style>
h1 {
    text-align: center;
    color: yellow;
    font-weight: bold;
}
</style>

<style>
h2 {
    text-align: center;
    color: black;
    font-weight: bold;
}
</style>

<style>
h3 {
    text-align: center;
    color: black;
    font-weight: bold;
}
</style>

<style>
h4 {
    text-align: center;
    color: black;
    font-weight: bold;
}
</style>

<hr>

This dataset is made for Machine Learning and needs to be feature engineered for suitable Classifier models

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
df = pd.read_csv("../data/processed/online_shoppers_intention_01_standard.csv")

In [3]:
df.dtypes

admin                      int64
admin_duration           float64
info                       int64
info_duration            float64
prod_related               int64
prod_related_duration    float64
bounce_rate              float64
exit_rate                float64
page_value               float64
special_day              float64
month                        str
os                         int64
browser                    int64
region                     int64
traffic_type               int64
visitor_type                 str
weekend                     bool
revenue                     bool
dtype: object

## Splitting the data

In [4]:
X = df.drop("revenue", axis=1)
y = df["revenue"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2, #for a mid-sized dataset, 20% is ideal
    random_state=42, #for reproducibility
    )

## Encoding the nominal columns in the dataset

In [5]:
#Nominal columns
nominal_cols = ["month", "visitor_type", "os", "browser", "region", "traffic_type"]

X_train_nom = X_train[nominal_cols]
X_test_nom  = X_test[nominal_cols]

#Numerical columns
X_train_num = X_train.drop(columns=nominal_cols)
X_test_num  = X_test.drop(columns=nominal_cols)

#OneHotEncoder
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False) #for safe handling unseen categories in test set, and to get an array output
ohe.fit(X_train_nom)

X_train_nom_enc = ohe.transform(X_train_nom)
X_test_nom_enc  = ohe.transform(X_test_nom)

#Back to DataFrame
encoded_cols = ohe.get_feature_names_out(nominal_cols) #to create new column names for the encoded features

X_train_nom_df = pd.DataFrame(X_train_nom_enc,
                              columns=encoded_cols,
                              index=X_train.index)

X_test_nom_df = pd.DataFrame(X_test_nom_enc,
                             columns=encoded_cols,
                             index=X_test.index)

#Concat
X_train_final = pd.concat([X_train_num, X_train_nom_df], axis=1)
X_test_final  = pd.concat([X_test_num, X_test_nom_df], axis=1)

In [6]:
X_train_nom_df.head()


Unnamed: 0,month_Aug,month_Dec,month_Feb,month_Jul,month_June,month_Mar,month_May,month_Nov,month_Oct,month_Sep,...,traffic_type_11,traffic_type_12,traffic_type_13,traffic_type_14,traffic_type_15,traffic_type_16,traffic_type_17,traffic_type_18,traffic_type_19,traffic_type_20
1785,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
286,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6520,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
X_test_nom_df.head()

Unnamed: 0,month_Aug,month_Dec,month_Feb,month_Jul,month_June,month_Mar,month_May,month_Nov,month_Oct,month_Sep,...,traffic_type_11,traffic_type_12,traffic_type_13,traffic_type_14,traffic_type_15,traffic_type_16,traffic_type_17,traffic_type_18,traffic_type_19,traffic_type_20
8916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
772,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7793,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6601,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Saving the encoded, split datasets as CSV files for later use

In [8]:
X_train_final.to_csv("../data/processed/X_train.csv", index=False)
X_test_final.to_csv("../data/processed/X_test.csv", index=False)

y_train.to_csv("../data/processed/y_train.csv", index=False)
y_test.to_csv("../data/processed/y_test.csv", index=False)

In [9]:
X_train = pd.read_csv("../data/processed/X_train.csv")
X_test = pd.read_csv("../data/processed/X_test.csv")   
y_train = pd.read_csv("../data/processed/y_train.csv")
y_test = pd.read_csv("../data/processed/y_test.csv")

In [10]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(9864, 74)
(2466, 74)
(9864, 1)
(2466, 1)


In [11]:
X_train.head()


Unnamed: 0,admin,admin_duration,info,info_duration,prod_related,prod_related_duration,bounce_rate,exit_rate,page_value,special_day,...,traffic_type_11,traffic_type_12,traffic_type_13,traffic_type_14,traffic_type_15,traffic_type_16,traffic_type_17,traffic_type_18,traffic_type_19,traffic_type_20
0,0,0.0,0,0.0,7,95.0,0.014286,0.061905,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,14.0,0,0.0,81,1441.910588,0.002469,0.013933,2.769599,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,49.2,4,379.0,5,74.6,0.0,0.018182,8.326728,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0.0,1,5.0,9,279.0,0.04,0.041667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
