<hr>

# ðŸ”§ DATA PREPROCESSING ðŸ”§

<style>
h1 {
    text-align: center;
    color: yellow;
    font-weight: bold;
}
</style>

<style>
h2 {
    text-align: center;
    color: black;
    font-weight: bold;
}
</style>

<style>
h3 {
    text-align: center;
    color: black;
    font-weight: bold;
}
</style>

<style>
h4 {
    text-align: center;
    color: black;
    font-weight: bold;
}
</style>

<hr>

This dataset is made for Machine Learning and needs to be feature engineered for suitable Classifier models

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [12]:
df = pd.read_csv("../data/processed/online_shoppers_intention_01_standard.csv")

In [13]:
df.dtypes

admin                      int64
admin_duration           float64
info                       int64
info_duration            float64
prod_related               int64
prod_related_duration    float64
bounce_rate              float64
exit_rate                float64
page_value               float64
special_day              float64
month                     object
os                         int64
browser                    int64
region                     int64
traffic_type               int64
visitor_type              object
weekend                     bool
revenue                     bool
dtype: object

## Splitting the data

In [14]:
X = df.drop("revenue", axis=1)
y = df["revenue"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2, #for a mid-sized dataset, 20% is ideal
    random_state=42, #for reproducibility
    )

## Encoding the nominal columns in the dataset

In [15]:
#Nominal columns
nominal_cols = ["month", "visitor_type"]

X_train_nom = X_train[nominal_cols]
X_test_nom  = X_test[nominal_cols]

#Numerical columns
X_train_num = X_train.drop(columns=nominal_cols)
X_test_num  = X_test.drop(columns=nominal_cols)

#OneHotEncoder
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False) #for safe handling unseen categories in test set, and to get an array output
ohe.fit(X_train_nom)

X_train_nom_enc = ohe.transform(X_train_nom)
X_test_nom_enc  = ohe.transform(X_test_nom)

#Back to DataFrame
encoded_cols = ohe.get_feature_names_out(nominal_cols) #to create new column names for the encoded features

X_train_nom_df = pd.DataFrame(X_train_nom_enc,
                              columns=encoded_cols,
                              index=X_train.index)

X_test_nom_df = pd.DataFrame(X_test_nom_enc,
                             columns=encoded_cols,
                             index=X_test.index)

#Concat
X_train_final = pd.concat([X_train_num, X_train_nom_df], axis=1)
X_test_final  = pd.concat([X_test_num, X_test_nom_df], axis=1)

In [16]:
X_train_nom_df.head()


Unnamed: 0,month_Aug,month_Dec,month_Feb,month_Jul,month_June,month_Mar,month_May,month_Nov,month_Oct,month_Sep,visitor_type_New_Visitor,visitor_type_Other,visitor_type_Returning_Visitor
1785,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
286,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6520,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
12251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [17]:
X_test_nom_df.head()

Unnamed: 0,month_Aug,month_Dec,month_Feb,month_Jul,month_June,month_Mar,month_May,month_Nov,month_Oct,month_Sep,visitor_type_New_Visitor,visitor_type_Other,visitor_type_Returning_Visitor
8916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
772,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
7793,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6601,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Saving the encoded, split datasets as CSV files for later use

In [18]:
X_train_final.to_csv("../data/processed/X_train.csv", index=False)
X_test_final.to_csv("../data/processed/X_test.csv", index=False)

y_train.to_csv("../data/processed/y_train.csv", index=False)
y_test.to_csv("../data/processed/y_test.csv", index=False)

### How to load it in the ML model notebooks

In [None]:
import pandas as pd

X_train = pd.read_csv("X_train.csv")
X_test  = pd.read_csv("X_test.csv")

y_train = pd.read_csv("y_train.csv").values.ravel()
y_test  = pd.read_csv("y_test.csv").values.ravel()