In [9]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [11]:
# Read the raw customer dataset from disk and preview it
# ---------------------------
df = pd.read_csv(filepath_or_buffer='../Data/dataset.csv')

# Report (rows, columns) so later cleaning steps can be compared against it
print('Original shape:', df.shape)

# Last expression of the cell: rich display of the first five rows
df.head(n=5)

Original shape: (3000, 7)


Unnamed: 0,Income,Kidhome,Teenhome,Recency,MntWines,NumWebPurchases,Response
0,88055,2,1,16,588,12,1
1,40843,0,2,39,0,2,0
2,65338,1,0,18,431,13,1
3,47342,2,2,98,116,2,0
4,76079,1,2,5,503,14,1


In [12]:
# 2️⃣ Data Cleaning
# ---------------------------
# Order matters here: missing values are reported and removed first, then
# duplicates, then invalid rows, and only then is the outlier cap computed,
# so the cap is derived from rows that actually survive cleaning.

## (i) Null Values — report, then drop rows with missing values.
## This runs before the Income filter because `NaN >= 0` evaluates to False:
## filtering first would silently discard NaN incomes and make the
## "before" report under-count the missing values.
print('Null values before:', df.isnull().sum())
df = df.dropna()
print('Null values after:', df.isnull().sum())

## (ii) Duplicates — drop exact duplicate rows so they do not skew the
## percentile used for outlier capping below.
df = df.drop_duplicates()

## (iii) Inconsistent Data — a negative Income is invalid; keep rows with Income >= 0.
## `.copy()` makes the filtered result an independent frame, so the column
## assignment in the next step cannot raise SettingWithCopyWarning on pandas
## versions without copy-on-write.
df = df[df['Income'] >= 0].copy()

## (iv) Noisy Data / Outliers — cap MntWines to the range [0, 99th percentile].
wines_cap = df['MntWines'].quantile(0.99)
df['MntWines'] = df['MntWines'].clip(lower=0, upper=wines_cap)

Null values before: Income             0
Kidhome            0
Teenhome           0
Recency            0
MntWines           0
NumWebPurchases    0
Response           0
dtype: int64
Null values after: Income             0
Kidhome            0
Teenhome           0
Recency            0
MntWines           0
NumWebPurchases    0
Response           0
dtype: int64


In [13]:
# 3️⃣ Data Transformation — Scaling
# ---------------------------
# Separate the features from the target column.
X = df.drop('Response', axis=1)
y = df['Response']

# Feature engineering: total number of children in the household.
X['TotalChildren'] = X['Kidhome'] + X['Teenhome']

# Data reduction: Teenhome is now recoverable as TotalChildren - Kidhome,
# so dropping it loses no information.
X = X.drop(['Teenhome'], axis=1)

# Scale numeric features WITHOUT leaking test-set statistics.
# Fitting the scaler on every row would let the held-out rows influence the
# mean/std applied to the training data (and the scaler shipped to the app).
# Instead, locate the training rows first and fit on those only.
#
# NOTE: test_size / random_state / stratify deliberately mirror the values
# used in the Train/Test Split cell below. With identical settings the split
# selects the same rows, so keep the two cells in sync when changing either.
row_positions = np.arange(len(X))
train_pos, _ = train_test_split(
    row_positions, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
scaler.fit(X.iloc[train_pos])   # statistics come from training rows only

# Apply the train-fitted transform to every row; X_scaled keeps the same
# row order and shape as X so downstream cells can use it unchanged.
X_scaled = scaler.transform(X)

In [14]:
# 5️⃣ Train/Test Split
# ---------------------------
# Hold out 20% of the rows for testing. The split is stratified on the target
# so both partitions keep the same Response class balance, and the fixed seed
# makes the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Sanity check: row/column counts of each partition
print('Train:', X_train.shape, 'Test:', X_test.shape)

Train: (2400, 6) Test: (600, 6)


In [15]:
# 6️⃣ Save Preprocessed Data to CSV
# ---------------------------
# Rebuild a labelled frame from the scaled array and append the target.
# The target is attached by position (as a plain array): after the row
# filtering done during cleaning, y's index may no longer be 0..n-1, while
# the freshly built frame always has a default 0..n-1 index.
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns).assign(
    Response=y.to_numpy()
)

# index=False: the positional index carries no information worth persisting
X_scaled_df.to_csv('../Data/preprocessed_customer_personality.csv', index=False)
print('✅ Preprocessed dataset saved as preprocessed_customer_personality.csv')

✅ Preprocessed dataset saved as preprocessed_customer_personality.csv


In [16]:
# Optional: Save Scaler for Flask App
# ---------------------------
# Persist the fitted scaler so the app can apply the same transformation
# to incoming feature values at prediction time.
scaler_path = '../App/model/scaler.pkl'

# joblib.dump does not create missing folders; make sure the target
# directory exists so a fresh checkout does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(scaler_path), exist_ok=True)

joblib.dump(scaler, scaler_path)
print('✅ Scaler saved for use in your Flask app.')

✅ Scaler saved for use in your Flask app.
