In [11]:
# eda_preprocessing.ipynb

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib




In [12]:
# 1. Load the dataset
# Reads the raw manufacturing CSV (path is relative to the notebook's working
# directory) into `df`, which the following cells use as their input frame.
print("Loading data...")
try:
    # low_memory=False makes pandas parse the file in one pass instead of in
    # chunks, which avoids mixed-type inference within a single column.
    df = pd.read_csv('../data/manufacturing_dataset_1000_samples.csv', low_memory=False)
except FileNotFoundError:
    print("Error: Dataset file not found. Please check the file path.")
    # Re-raise rather than calling exit(): inside a Jupyter kernel exit() shuts
    # the kernel down (losing all state and the traceback), whereas re-raising
    # stops "Run All" at this cell with a clear, inspectable error.
    raise
else:
    print(f"Data loaded successfully. Shape: {df.shape}")

Loading data...
Data loaded successfully. Shape: (1000, 19)


In [13]:
# 2. Preprocessing Steps
# Builds, fits and saves a ColumnTransformer that imputes + scales the numeric
# columns and one-hot encodes the categorical columns, then transforms the
# feature frame with it.

# Define the target and columns to drop
target_col = 'Parts_Per_Hour'
columns_to_drop = ['Timestamp']

# Drop unnecessary columns. Only columns that are actually present are dropped,
# so re-running this cell is safe. The frame is rebound instead of mutated in
# place so the object loaded by the previous cell is never modified behind the
# reader's back.
columns_to_drop = [col for col in columns_to_drop if col in df.columns]
if columns_to_drop:
    df = df.drop(columns=columns_to_drop)
    print(f"\nDropped columns: {columns_to_drop}")

# Fail fast with a clear message if the target is missing, instead of silently
# ignoring it while building `features` and then hitting a bare KeyError.
if target_col not in df.columns:
    raise KeyError(f"Target column '{target_col}' not found in the dataset.")

# Define features and target using the CLEANED DataFrame
features = df.drop(columns=target_col)
target = df[target_col]

# Identify columns by dtype. Columns that are neither numeric nor
# object/category (e.g. datetime) are not listed in either group and are
# therefore dropped by the ColumnTransformer below (remainder='drop').
numeric_features = features.select_dtypes(include=np.number).columns.tolist()
categorical_features = features.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"\nNumeric Features ({len(numeric_features)}): {numeric_features}")
print(f"Categorical Features ({len(categorical_features)}): {categorical_features}")

# Preprocessing Pipelines
# Numerical pipeline: impute missing values with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline: one-hot encode. handle_unknown='ignore' encodes
# categories unseen during fit as all zeros instead of raising at transform
# time. sparse_output=True keeps this block sparse to limit memory use (the
# `sparse_output` keyword needs scikit-learn >= 1.2).
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    # Only the columns listed above are processed; everything else is dropped.
    remainder='drop'
)

# Fit and Save the Preprocessor
# NOTE(review): the preprocessor is fit on the full dataset. If a train/test
# split is made later, the imputation/scaling statistics will have seen the
# test rows -- confirm this is intended.
print("\nFitting preprocessor (handles imputation, scaling, and OHE)...")
preprocessor.fit(features)
print("Preprocessor fit successfully.")

# Saved next to the notebook (current working directory).
joblib.dump(preprocessor, 'preprocessor.pkl')
print("Preprocessor saved as 'preprocessor.pkl'.")

# Apply the preprocessor to transform the features
print("\nTransforming features...")
# ColumnTransformer stacks its outputs as a sparse matrix only when the overall
# density is below its sparse_threshold (default 0.3); otherwise X_processed is
# a dense NumPy array, even though the one-hot block itself is sparse.
X_processed = preprocessor.transform(features)

print("Preprocessing complete.")
print(f"\nShape of processed feature data: {X_processed.shape}")
# Note: X_processed is deliberately not wrapped in a DataFrame here.

print("\nEDA and preprocessing complete. Data is ready for model training.")


Dropped columns: ['Timestamp']

Numeric Features (13): ['Injection_Temperature', 'Injection_Pressure', 'Cycle_Time', 'Cooling_Time', 'Material_Viscosity', 'Ambient_Temperature', 'Machine_Age', 'Operator_Experience', 'Maintenance_Hours', 'Temperature_Pressure_Ratio', 'Total_Cycle_Time', 'Efficiency_Score', 'Machine_Utilization']
Categorical Features (4): ['Shift', 'Machine_Type', 'Material_Grade', 'Day_of_Week']

Fitting preprocessor (handles imputation, scaling, and OHE)...
Preprocessor fit successfully.
Preprocessor saved as 'preprocessor.pkl'.

Transforming features...
Preprocessing complete.

Shape of processed feature data: (1000, 29)

EDA and preprocessing complete. Data is ready for model training.


In [14]:
# Final cell in eda_preprocessing.ipynb - ROBUST FILE MOVEMENT
# Moves the fitted preprocessor from the notebook's working directory into
# <project root>/models/.

import os
import shutil


def find_project_root(start_dir, marker='requirements.txt'):
    """Return the project root directory for a notebook running in ``start_dir``.

    Resolution order:
      1. If ``start_dir`` is itself a folder named ``notebooks``, the root is
         its parent (the standard "notebooks in a subfolder" layout).
      2. Otherwise walk upward and return the first directory that contains
         ``marker``.
      3. If no directory up to the filesystem root contains ``marker``, fall
         back to the parent of ``start_dir``.

    Args:
        start_dir: Directory to start from (normally ``os.getcwd()``).
        marker: File name whose presence identifies the project root.

    Returns:
        Absolute path of the resolved project root.
    """
    start_dir = os.path.abspath(start_dir)

    # Compare the final path component only: a substring test on the whole path
    # would also fire for unrelated folders such as 'my_notebooks_archive'.
    if os.path.basename(start_dir) == 'notebooks':
        return os.path.dirname(start_dir)

    candidate = start_dir
    while True:
        if os.path.exists(os.path.join(candidate, marker)):
            return candidate
        parent = os.path.dirname(candidate)
        if parent == candidate:
            # Reached the filesystem root without finding the marker. Never
            # treat the filesystem root as the project root (that would create
            # 'models/' at '/' or 'C:\\'); default to one level above the start.
            return os.path.dirname(start_dir)
        candidate = parent


# --- 1. Find the true project root directory ---
current_dir = os.getcwd()
PROJECT_ROOT = find_project_root(current_dir)

NOTEBOOKS_DIR = current_dir

# --- 2. Define Paths based on the reliable root ---
# The source is where the notebook saved the file (its own directory)
SOURCE_PATH = os.path.join(NOTEBOOKS_DIR, 'preprocessor.pkl')

# The destination is calculated from the resolved PROJECT_ROOT
DESTINATION_PATH = os.path.join(PROJECT_ROOT, 'models', 'preprocessor.pkl')

print(f"\nDetermined Project Root: {PROJECT_ROOT}")
print(f"Attempting to move file from: {SOURCE_PATH}")
print(f"Attempting to move file to: {DESTINATION_PATH}")

# --- 3. Execute Move ---
try:
    # Ensure the destination folder exists before moving
    os.makedirs(os.path.dirname(DESTINATION_PATH), exist_ok=True)

    # Move the file
    shutil.move(SOURCE_PATH, DESTINATION_PATH)

    # Report the real destination rather than a hardcoded 'backend/...' path,
    # which is wrong whenever the resolved root is not a 'backend' folder.
    print(f"\nSuccessfully moved preprocessor.pkl to {DESTINATION_PATH}.")

except FileNotFoundError:
    print(f"\nError: preprocessor.pkl not found at {SOURCE_PATH}.")
    print("Please ensure the file was created in the same directory as the notebook.")
except OSError as e:
    # Filesystem failures (permissions, locked file, ...) are reported;
    # anything else is a programming error and is allowed to surface.
    print(f"\nAn unexpected error occurred during move: {e}")


Determined Project Root: c:\Users\LENOVO\OneDrive\Desktop\injection_molding_predictor\backend
Attempting to move file from: c:\Users\LENOVO\OneDrive\Desktop\injection_molding_predictor\backend\notebooks\preprocessor.pkl
Attempting to move file to: c:\Users\LENOVO\OneDrive\Desktop\injection_molding_predictor\backend\models\preprocessor.pkl

Successfully moved preprocessor.pkl to backend\models\preprocessor.pkl.
