## Data Pre-processing

In [1]:
# !pip install pyarrow

In [4]:
# !pip install sidetable

Collecting sidetable
  Using cached sidetable-0.9.1-py3-none-any.whl.metadata (45 kB)
Using cached sidetable-0.9.1-py3-none-any.whl (19 kB)
Installing collected packages: sidetable
Successfully installed sidetable-0.9.1


DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063

[notice] A new release of pip is available: 23.3.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


### 1. Importing libraries

In [80]:
import pandas as pd

import seaborn as sns

import os
import matplotlib.pyplot as plt

import numpy as np
import sidetable
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder


In [6]:
# Show the current working directory; the relative 'raw_data.parquet' path used
# when loading the data resolves against it.
# NOTE(review): the rendered output exposes an absolute local path — consider
# clearing it before sharing the notebook.
os.getcwd()

'C:\\Users\\40101410\\OneDrive - Anheuser-Busch InBev\\Upskill2024'

In [7]:
# List the working directory contents (e.g. to confirm raw_data.parquet is present)
os.listdir()

['.git',
 '.ipynb_checkpoints',
 '.python-version',
 '.venv',
 'Data',
 'direct-marketing-eda.ipynb',
 'direct-marketing-pre-processing.ipynb',
 'hello.py',
 'main.ipynb',
 'pyproject.toml',
 'raw_data.parquet',
 'README.md',
 'uv.lock']

In [66]:
# Load the raw data set from parquet for pre-processing and preview the first rows
df = pd.read_parquet('raw_data.parquet', engine='pyarrow')
df.head()

Unnamed: 0,Age,Gender,OwnHome,Married,Location,Salary,Children,History,Catalogs,AmountSpent
0,Old,Female,Own,Single,Far,47500,0,High,6,755
1,Middle,Male,Rent,Single,Close,63600,0,High,6,1318
2,Young,Female,Rent,Single,Close,13500,0,Low,18,296
3,Middle,Male,Own,Married,Close,85600,1,High,18,2436
4,Middle,Female,Own,Single,Close,68400,0,High,12,1304


In [67]:
# Inspect column dtypes and non-null counts of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Age          1000 non-null   object
 1   Gender       1000 non-null   object
 2   OwnHome      1000 non-null   object
 3   Married      1000 non-null   object
 4   Location     1000 non-null   object
 5   Salary       1000 non-null   int64 
 6   Children     1000 non-null   int64 
 7   History      1000 non-null   object
 8   Catalogs     1000 non-null   int64 
 9   AmountSpent  1000 non-null   int64 
dtypes: int64(4), object(6)
memory usage: 78.2+ KB


In [68]:
# Cast the discrete count columns Children and Catalogs to object dtype so the
# dtype-based column selection picks them up as categorical, not numerical
df = df.astype({'Children': 'object', 'Catalogs': 'object'})

In [69]:
# Names of all columns stored with object dtype (treated as categorical)
categorical_variables = df.select_dtypes(include='object').columns
categorical_variables

Index(['Age', 'Gender', 'OwnHome', 'Married', 'Location', 'Children',
       'History', 'Catalogs'],
      dtype='object')

In [70]:
# Re-check dtypes after the cast: Children and Catalogs should now be object
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Age          1000 non-null   object
 1   Gender       1000 non-null   object
 2   OwnHome      1000 non-null   object
 3   Married      1000 non-null   object
 4   Location     1000 non-null   object
 5   Salary       1000 non-null   int64 
 6   Children     1000 non-null   object
 7   History      1000 non-null   object
 8   Catalogs     1000 non-null   object
 9   AmountSpent  1000 non-null   int64 
dtypes: int64(2), object(8)
memory usage: 78.2+ KB


In [71]:
# Turn the pandas Index of categorical column names into a plain Python list
categorical_variables = list(categorical_variables)
categorical_variables

['Age',
 'Gender',
 'OwnHome',
 'Married',
 'Location',
 'Children',
 'History',
 'Catalogs']

In [72]:
# Names of all numeric columns. Selecting on the generic 'number' dtype instead
# of only 'int64' keeps float or narrower-integer columns from being silently
# dropped; columns already cast to object are still excluded.
numerical_variables = df.select_dtypes(include='number').columns
numerical_variables

Index(['Salary', 'AmountSpent'], dtype='object')

In [73]:
# Plain Python list of the numerical column names, as passed to the scaler step
numerical_columns = list(numerical_variables)
numerical_columns

['Salary', 'AmountSpent']

In [103]:
# Unordered categoricals: these are one-hot encoded by the ColumnTransformer
nominal_columns = ['Gender', 'OwnHome', 'Married', 'Location']
# Discrete counts treated as ordered categories; their category order is not
# listed here and is left to the encoder to infer at fit time
ordinal_columns = ['Children', 'Catalogs']

# Explicit low-to-high category orders, passed as `categories=` to the Age and
# History encoders. Each is a nested list: one inner list per encoded column.
# 'None' in History_order is the literal string category, not Python's None.
Age_order = [["Young", "Middle", "Old"]]
History_order = [['None', 'Low', 'Medium', 'High']]


### Define Processors

In [104]:
# Stand-alone reference instances of each processor type. The ColumnTransformer
# cell constructs its own encoder/scaler instances inline.

# Numerical columns: StandardScaler (zero mean, unit variance)
numerical_transformer = StandardScaler()

# Nominal categorical variables: one indicator column per category
nominal_encoder = OneHotEncoder()

# Ordinal categorical variables: OrdinalEncoder is the encoder for feature
# columns (2-D input). LabelEncoder is intended for the 1-D target and was never
# imported in this notebook, so referencing it here raised a NameError.
ordinal_encoder = OrdinalEncoder()

### Combine using ColumnTransformer

In [105]:
# Define a single ColumnTransformer that routes each column group to its encoder:
#   onehot          -> one indicator column per category of the nominal columns
#   ordinal_age     -> Age mapped to integers following Age_order
#   ordinal_history -> History mapped to integers following History_order
#   ordinal         -> Children / Catalogs mapped to integers, order inferred at fit time
#   num             -> numerical columns standardised with StandardScaler
# NOTE(review): numerical_columns includes AmountSpent — confirm that scaling it
# together with the features is intended if it is the prediction target.
preprocessor = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(), nominal_columns),
        ("ordinal_age", OrdinalEncoder(categories = Age_order), ['Age']),
        ("ordinal_history", OrdinalEncoder(categories = History_order), ['History']),
        ("ordinal", OrdinalEncoder(), ordinal_columns),
        ("num", StandardScaler(), numerical_columns)],
    remainder="passthrough"  # Pass through any column not listed above unchanged (Age and History are encoded, not passed through)
)

# Fit the transformer on the full frame and transform it in one step
transformed_data = preprocessor.fit_transform(df)

In [106]:
# Convert to data frame
# Wrap the transformed matrix in a DataFrame labelled with the
# "<transformer>__<column>" names generated by the fitted ColumnTransformer.
# Reusing df.index keeps every transformed row aligned with its source row even
# when df does not carry a default RangeIndex (e.g. after filtering rows).
transformed_df = pd.DataFrame(
    transformed_data,
    columns=preprocessor.get_feature_names_out(),
    index=df.index,
)

In [107]:
# Preview the encoded and scaled feature table
transformed_df.head()

Unnamed: 0,onehot__Gender_Female,onehot__Gender_Male,onehot__OwnHome_Own,onehot__OwnHome_Rent,onehot__Married_Married,onehot__Married_Single,onehot__Location_Close,onehot__Location_Far,ordinal_age__Age,ordinal_history__History,ordinal__Children,ordinal__Catalogs,num__Salary,num__AmountSpent
0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,2.0,3.0,0.0,0.0,-0.281164,-0.480716
1,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,3.0,0.0,0.0,0.244963,0.105383
2,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,2.0,-1.392239,-0.958548
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,3.0,1.0,2.0,0.963893,1.269254
4,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,3.0,0.0,1.0,0.40182,0.090809


In [108]:
# Get mappings
# Collect the categories held by each fitted encoder, keyed by source column.
# The category arrays are positionally aligned with the column lists that were
# handed to the corresponding transformer.
onehot_mapping = dict(
    zip(
        nominal_columns,
        map(list, preprocessor.named_transformers_["onehot"].categories_),
    )
)
ordinal_mapping = dict(
    zip(
        ordinal_columns,
        map(list, preprocessor.named_transformers_["ordinal"].categories_),
    )
)


In [109]:
# Retrieve mappings: print the category lists held by each fitted encoder


print("OneHot Mapping:", onehot_mapping)
print("Ordinal Mapping:", ordinal_mapping)
print("Age Mapping:", preprocessor.named_transformers_["ordinal_age"].categories_)
print("History Mapping:", preprocessor.named_transformers_["ordinal_history"].categories_)

OneHot Mapping: {'Gender': ['Female', 'Male'], 'OwnHome': ['Own', 'Rent'], 'Married': ['Married', 'Single'], 'Location': ['Close', 'Far']}
Ordinal Mapping: {'Children': [0, 1, 2, 3], 'Catalogs': [6, 12, 18, 24]}
Age Mapping: [array(['Young', 'Middle', 'Old'], dtype=object)]
History Mapping: [array(['None', 'Low', 'Medium', 'High'], dtype=object)]
