# Clothes Size Predictor 🧥

## Feature Engineering

In [1]:
# Import necessary libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
# Configure logging once for the whole notebook: "[LEVEL] message" lines at INFO and above
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
logger = logging.getLogger(__name__)

# Get the current working directory
current_dir = os.getcwd()

# The project root is taken to be the parent of the current working directory
project_root = os.path.abspath(os.path.join(current_dir, '..'))

# Make the project root importable so that `src.*` modules resolve.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)
logger.info("✅ Libraries Uploaded")

[INFO] ✅ Libraries Uploaded


In [2]:
# --- Project-local import: FeatureEngineer comes from src/pipeline/feature_engineering
from src.pipeline.feature_engineering import FeatureEngineer

logger.info("✅ Libraries Uploaded")

[INFO] ✅ Libraries Uploaded


## ☛ Import Processed Dataset

In [3]:
# Absolute path to the cleaned dataset under <project_root>/data/processed
file_path = os.path.abspath(os.path.join(project_root, 'data', 'processed', 'clothes_processed.csv'))

# Load the CSV into a DataFrame.
# A failed load is logged and then re-raised: every later cell needs `clothes_df`,
# so swallowing the error here would only resurface as a confusing NameError downstream.
try:
    clothes_df = pd.read_csv(file_path)
except Exception as e:
    logger.error(f"❌ Error loading data: {e}")
    raise
else:
    logger.info(f"✅ Data successfully loaded: {clothes_df.shape[0]} rows, {clothes_df.shape[1]} columns.")

[INFO] ✅ Data successfully loaded: 26351 rows, 4 columns.


In [4]:
# Preview the first rows to confirm the expected columns were loaded
clothes_df.head()

Unnamed: 0,weight,age,height,size
0,62,28.0,172.72,XL
1,59,36.0,167.64,L
2,61,34.0,165.1,M
3,65,27.0,175.26,L
4,62,45.0,172.72,M


## Initialize the Feature Engineering Pipeline

In [8]:
# Initialize the feature-engineering helper on the loaded frame, with `size` as the target column
fe = FeatureEngineer(clothes_df, target_col="size")

In [9]:
# Run the pre-processing pipeline. Per its log output it encodes the target column
# and scales the numeric features (weight, height, age); the result is kept as `df_processed`.
# NOTE(review): the scaler appears to be fit here on the full dataset, before the
# train/test split further down — possible test-set leakage; confirm inside FeatureEngineer.
df_processed = fe.run_all_preprocessing()

🚀 Running essential Pre-processing pipeline (Scaling & Encoding)...
🔢 Encoded target column 'size'.
💾 LabelEncoder saved.
📏 Scaled numeric features: ['weight', 'height', 'age']
💾 StandardScaler saved.
✅ Pre-processing pipeline completed.


---
# TRAINING THE MODEL

In [10]:
# --- Just a quick check: preview the processed frame (scaled features plus the encoded target)
df_processed.head()

Unnamed: 0,weight,age,height,size,size_encoded
0,-0.084191,-0.78447,0.860731,XL,3
1,-0.355933,-0.013125,0.229692,L,0
2,-0.174771,-0.205961,-0.085828,M,1
3,0.187552,-0.880888,1.176251,L,0
4,-0.084191,0.854639,0.860731,M,1


In [11]:
# --- Our Variables
# Features: the three scaled numeric columns. Target: the integer-encoded size label.
X = df_processed.loc[:, ['weight', 'height', 'age']]
y = df_processed.loc[:, 'size_encoded']

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the target keeps the size
# imbalance the same in both splits; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [13]:
# Report how many rows ended up in each split
n_train_rows = len(X_train)
n_test_rows = len(X_test)
print(f"Training Data: {n_train_rows} rows")
print(f"Testing Data: {n_test_rows} rows")

Training Data: 21080 rows
Testing Data: 5271 rows


### Optimization and Training

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import joblib

# Define the base model.
# `multi_class` is intentionally not passed: 'auto' was already the default, and the
# parameter is deprecated in recent scikit-learn releases (passing it only emits a FutureWarning).
# NOTE(review): recent scikit-learn releases also deprecate 'liblinear' for problems with
# more than two classes — confirm against the installed version before upgrading.
log_reg = LogisticRegression(solver='liblinear', random_state=42)

# Parameters to tune:
# 'C': Inverse of regularization strength (smaller C = stronger regularization)
# 'class_weight': Addresses size imbalance
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'class_weight': [None, 'balanced']  # Try without weighting and with weighting
}

In [15]:
# Exhaustive search over `param_grid` with 5-fold cross-validation.
# Candidates are ranked by weighted F1-score (chosen because of the class imbalance);
# n_jobs=-1 spreads the fits across all processor cores.
grid_search = GridSearchCV(
    log_reg,
    param_grid,
    cv=5,
    n_jobs=-1,
    scoring='f1_weighted',
)

In [16]:
# Fit every parameter combination on the training split. The fitted search object
# is the cell's last expression, so it is displayed as the cell output.
print("Initiating hyperparameter search (GridSearch)...")
grid_search.fit(X_train, y_train)

Iniciating hyperparameter search (GridSearch)...




0,1,2
,estimator,LogisticRegre...r='liblinear')
,param_grid,"{'C': [0.01, 0.1, ...], 'class_weight': [None, 'balanced']}"
,scoring,'f1_weighted'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'liblinear'
,max_iter,100


In [17]:
# Save the best model under <project_root>/models.
# The path is built from `project_root` (the parent of the working directory, i.e. the
# same location as '../models') so it matches how the dataset path is built, and the
# folder is created when missing so the dump does not fail on a missing directory.
best_model = grid_search.best_estimator_
models_dir = os.path.join(project_root, 'models')
os.makedirs(models_dir, exist_ok=True)
joblib.dump(best_model, os.path.join(models_dir, 'best_model.pkl'))
print(f"✅ Best model saved. Best parameters: {grid_search.best_params_}")

✅ Best model saved. Best parameters: {'C': 10, 'class_weight': 'balanced'}


### Final Evaluation

In [18]:
# Predict encoded size labels for the held-out test split with the tuned model
y_pred = best_model.predict(X_test)

In [None]:
# Per-class precision, recall and F1-score on the test split.
# `target_names` takes the encoder's classes (presumably the fitted LabelEncoder)
# so each row is labelled with a size name instead of its encoded integer.
print("\n--- Classification Report ---")
report_text = classification_report(y_test, y_pred, target_names=fe.encoder.classes_)
print(report_text)