# Data Preprocessing

## Objective
This notebook handles all data preprocessing steps required before model training:
- Handle missing values and data quality issues
- Encode categorical variables for machine learning
- Scale numerical features appropriately
- Save preprocessed datasets for modeling

## Preprocessing Strategy
Based on our EDA findings:
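- **No missing values** — no imputation is required.
- **Imbalanced target** — the train/test split is stratified on `Revenue`.
- **Mixed data types** — categorical columns are label-encoded and numerical columns are standardized.
- **Duplicate rows** — exact duplicate rows are dropped before encoding.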

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
import warnings
import os

## Data Loading 

In [21]:
# Load the raw online-shoppers dataset, then preview five randomly chosen rows.
raw_csv_path = '../data/raw/online_shoppers_intention.csv'
df = pd.read_csv(raw_csv_path)
df.sample(n=5)

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
2487,0,0.0,0,0.0,69,653.277778,0.014706,0.053431,0.0,0.6,May,2,4,7,4,Returning_Visitor,False,False
2333,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,1.0,May,1,1,4,1,Returning_Visitor,True,False
1113,1,464.0,0,0.0,19,1834.083333,0.02,0.03125,0.0,0.0,Mar,2,2,2,10,Returning_Visitor,False,False
2069,7,120.5,0,0.0,19,1011.5,0.0,0.009524,0.0,0.0,Mar,2,2,1,1,Returning_Visitor,False,False
7772,0,0.0,0,0.0,42,602.883333,0.0,0.004365,21.266325,0.0,Sep,6,2,3,1,Returning_Visitor,False,False


## Convert integer-coded ID columns to `object` dtype so they are treated as categorical

In [22]:
# These four columns hold integer codes that identify categories rather than
# quantities, so cast each one to `object` dtype.
for id_col in ("OperatingSystems", "Browser", "Region", "TrafficType"):
    df[id_col] = df[id_col].astype('object')

In [23]:
# Columns treated as numerical; this list is what the scaling step standardizes.
numerical_features = [
    "Administrative", "Administrative_Duration",
    "Informational", "Informational_Duration",
    "ProductRelated", "ProductRelated_Duration",
    "BounceRates", "ExitRates",
    "PageValues", "SpecialDay",
]

# Columns treated as categorical (text labels, integer-coded IDs, and the
# boolean Weekend flag).
categorical_features = [
    "Month",
    "OperatingSystems", "Browser", "Region", "TrafficType",
    "VisitorType",
    "Weekend",
]

# Name of the label column.
target = "Revenue"


## Duplicates and Missing values

In [24]:
# Data-quality summary: total number of missing cells, number of fully
# duplicated rows, then one line per column with its dtype and cardinality.
missing_count = df.isna().sum().sum()
print(f"Missing values: {missing_count}")

duplicate_count = df.duplicated().sum()
print(f"Duplicate rows: {duplicate_count}")

# Data types
print("\nData types check:")
for column_name, column_dtype in df.dtypes.items():
    n_unique = df[column_name].nunique()
    print(f"  {column_name:<25}: {str(column_dtype):<10} ({n_unique} unique values)")

Missing values: 0
Duplicate rows: 125

Data types check:
  Administrative           : int64      (27 unique values)
  Administrative_Duration  : float64    (3335 unique values)
  Informational            : int64      (17 unique values)
  Informational_Duration   : float64    (1258 unique values)
  ProductRelated           : int64      (311 unique values)
  ProductRelated_Duration  : float64    (9551 unique values)
  BounceRates              : float64    (1872 unique values)
  ExitRates                : float64    (4777 unique values)
  PageValues               : float64    (2704 unique values)
  SpecialDay               : float64    (6 unique values)
  Month                    : object     (10 unique values)
  OperatingSystems         : object     (8 unique values)
  Browser                  : object     (13 unique values)
  Region                   : object     (9 unique values)
  TrafficType              : object     (20 unique values)
  VisitorType              : object     (3 uniqu

## Drop duplicates

In [25]:
# Remove exact duplicate rows (the first occurrence is kept), then renumber
# the index from 0 so it has no gaps.
deduplicated = df.drop_duplicates()
df = deduplicated.reset_index(drop=True)

In [26]:
# Re-count duplicated rows to confirm the drop above left none behind.
remaining_duplicates = df.duplicated()
duplicate_count = remaining_duplicates.sum()
print(f"Duplicate rows: {duplicate_count}")

Duplicate rows: 0


## Categorical Variable Encoding

In [27]:
# Work on a copy so `df` itself is not modified by this cell.
df_processed = df.copy()

# Weekend and the Revenue target are true/false flags; encode both as 1/0.
# Values are normalized to lower-case text first so that real booleans and any
# spelling of "true"/"false" (TRUE, True, false, ...) map consistently.
# Already-numeric 1/0 values are accepted as well.
flag_mapping = {'true': 1, 'false': 0, '1': 1, '0': 0}

for flag_col in ('Weekend', 'Revenue'):
    normalized = df_processed[flag_col].astype(str).str.strip().str.lower()
    encoded = normalized.map(flag_mapping)

    # Series.map leaves unmapped values as NaN; fail loudly instead of letting
    # NaNs leak into the features or the target.
    if encoded.isna().any():
        unexpected = sorted(normalized[encoded.isna()].unique())
        raise ValueError(f"Unexpected values in '{flag_col}': {unexpected}")

    df_processed[flag_col] = encoded.astype('int64')
    print(f"{flag_col} converted to numerical: {df_processed[flag_col].unique()}")


Weekend converted to numerical: [0 1]
Revenue converted to numerical: [0 1]


In [28]:
# Spot-check two randomly chosen rows of the copy with 1/0-encoded flags.
df_processed.sample(n=2)

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
11059,7,1463.333333,0,0.0,30,1410.238889,0.005263,0.010557,0.0,0.0,Nov,1,1,4,2,New_Visitor,0,0
297,5,227.333333,0,0.0,17,528.9,0.0,0.002273,0.0,0.0,Mar,1,1,1,9,Returning_Visitor,1,0


In [29]:
import os

from sklearn.preprocessing import LabelEncoder
import joblib

# Columns to label-encode, the boolean flag column, and the target column.
cat_cols = [
    "Month",
    "OperatingSystems",
    "Browser",
    "Region",
    "TrafficType",
    "VisitorType"
]
bool_col = "Weekend"
target_col = "Revenue"

# Encode into a copy so `df` keeps its de-duplicated, un-encoded values.
# Writing the codes back into `df` made this cell non-idempotent: a second run
# would re-fit every encoder on the already-encoded integers and overwrite the
# saved encoders with the wrong classes.
df_encoded = df.copy()

# Maps column name -> the LabelEncoder fitted on that column.
encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    # Cast to str so integer-coded columns and text columns are handled alike.
    df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))
    encoders[col] = le

# Flag and target become integers.
df_encoded[bool_col] = df_encoded[bool_col].astype(int)
df_encoded[target_col] = df_encoded[target_col].astype(int)

# Persist the fitted encoders. joblib.dump does not create missing parent
# directories, so make sure the folder exists first.
encoders_path = "../models/label_encoders.pkl"
os.makedirs(os.path.dirname(encoders_path), exist_ok=True)
joblib.dump(encoders, encoders_path)

# Features are every column except the target; the target is kept separately.
X = df_encoded.drop(columns=[target_col])
y = df_encoded[target_col]


In [30]:
# Display the feature matrix: every column of the encoded frame except the target.
X

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,2,0,0,0,0,2,0
1,0,0.0,0,0.0,2,64.000000,0.000000,0.100000,0.000000,0.0,2,1,5,0,11,2,0
2,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,2,3,0,8,13,2,0
3,0,0.0,0,0.0,2,2.666667,0.050000,0.140000,0.000000,0.0,2,2,5,1,14,2,0
4,0,0.0,0,0.0,10,627.500000,0.020000,0.050000,0.000000,0.0,2,2,6,0,14,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12200,3,145.0,0,0.0,53,1783.791667,0.007143,0.029031,12.241717,0.0,1,3,9,0,0,2,1
12201,0,0.0,0,0.0,5,465.750000,0.000000,0.021333,0.000000,0.0,7,2,5,0,18,2,1
12202,0,0.0,0,0.0,6,184.250000,0.083333,0.086667,0.000000,0.0,7,2,5,0,4,2,1
12203,4,75.0,0,0.0,15,346.000000,0.000000,0.021053,0.000000,0.0,7,1,5,2,2,2,0


In [31]:
# Display the target series (the Revenue column after its cast to int).
y

0        0
1        0
2        0
3        0
4        0
        ..
12200    0
12201    0
12202    0
12203    0
12204    0
Name: Revenue, Length: 12205, dtype: int32

In [32]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the Revenue
# class ratio the same in both parts, and the fixed seed makes the split
# repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the shape of each part.
shape_report = (
    ("X_train shape", X_train),
    ("X_test shape ", X_test),
    ("y_train shape", y_train),
    ("y_test shape ", y_test),
)
for label, part in shape_report:
    print(f"{label}: {part.shape}")

# Report the class balance of each part as proportions.
train_distribution = y_train.value_counts(normalize=True)
test_distribution = y_test.value_counts(normalize=True)
print(f"\nTraining set 'Revenue' distribution:\n{train_distribution}")
print(f"\nTest set 'Revenue' distribution:\n{test_distribution}")

X_train shape: (9764, 17)
X_test shape : (2441, 17)
y_train shape: (9764,)
y_test shape : (2441,)

Training set 'Revenue' distribution:
Revenue
0    0.843712
1    0.156288
Name: proportion, dtype: float64

Test set 'Revenue' distribution:
Revenue
0    0.843507
1    0.156493
Name: proportion, dtype: float64


## Feature Scaling

In [33]:
import os

from sklearn.preprocessing import StandardScaler


print("Numerical features to be scaled:")
print(numerical_features)

# Take explicit copies before assigning columns. The frames returned by
# train_test_split are selections of X, and writing into them directly can
# raise pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()

# Fit on the training rows only, then apply the same fitted transform to the
# test rows so no test-set statistics influence the scaling.
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# NOTE: this cell overwrites X_train / X_test with scaled values. Re-run the
# split cell before re-running this one; otherwise the scaler would be re-fit
# on already-scaled data and the saved scaler would be wrong.

print("\nScaled X_train sample:")
display(X_train.head())

# Persist the fitted scaler. joblib.dump does not create missing parent
# directories, so make sure the folder exists first.
scaler_path = "../models/scaler.pkl"
os.makedirs(os.path.dirname(scaler_path), exist_ok=True)
joblib.dump(scaler, scaler_path)
print(f"\nScaler saved to {scaler_path}")

Numerical features to be scaled:
['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay']

Scaled X_train sample:


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
7469,1.689206,0.676238,0.38175,-0.243015,0.220868,-0.080395,-0.448703,-0.800192,1.665596,-0.311837,7,1,5,4,11,2,0
11804,-0.699498,-0.458067,-0.397879,-0.243015,-0.0469,-0.158842,-0.448703,-0.739774,1.495448,-0.311837,1,1,5,6,11,0,0
3250,-0.699498,-0.458067,-0.397879,-0.243015,-0.426237,-0.408089,-0.448703,-0.287446,-0.315264,3.689185,6,0,0,0,11,2,0
204,-0.102322,-0.235218,-0.397879,-0.243015,-0.604748,-0.336795,-0.448703,-0.279507,-0.315264,-0.311837,5,1,5,6,11,0,1
4392,-0.699498,-0.458067,-0.397879,-0.243015,-0.359295,0.203501,-0.448703,-0.609743,-0.315264,-0.311837,6,3,5,1,4,2,0



Scaler saved to ../models/scaler.pkl


## Save Preprocessed Data

In [34]:
processed_data_path = '../data/processed/'

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op if it is already there).
os.makedirs(processed_data_path, exist_ok=True)

# File name -> object to write. The index is omitted from every file.
outputs = {
    'X_train.csv': X_train,
    'X_test.csv': X_test,
    'y_train.csv': y_train,
    'y_test.csv': y_test,
}
for file_name, data in outputs.items():
    data.to_csv(os.path.join(processed_data_path, file_name), index=False)

print(f"Processed data saved to: {processed_data_path}")
print("\n--- Data Preprocessing Complete ---")
print("The following files are now ready for the modeling phase:")
print(f" - X_train.csv (Shape: {X_train.shape})")
print(f" - X_test.csv  (Shape: {X_test.shape})")
print(f" - y_train.csv (Shape: {y_train.shape})")
print(f" - y_test.csv  (Shape: {y_test.shape})")

Processed data saved to: ../data/processed/

--- Data Preprocessing Complete ---
The following files are now ready for the modeling phase:
 - X_train.csv (Shape: (9764, 17))
 - X_test.csv  (Shape: (2441, 17))
 - y_train.csv (Shape: (9764,))
 - y_test.csv  (Shape: (2441,))
