In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
import seaborn as sns

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import pickle

# -------------------------------
# 1. Load dataset
# -------------------------------
df = pd.read_csv("manufacturing_dataset_1000_samples.csv")

# -------------------------------
# 2. Select target & drop irrelevant columns
# -------------------------------
target = "Day_of_Week"  

# Columns to drop (not useful for predicting shift)
drop_cols = ["Timestamp","Shift"]

X = df.drop(columns=drop_cols + [target])
y = df[target]

# -------------------------------
# 3. Identify numeric & categorical columns
# -------------------------------
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

print("Numeric Columns:", num_cols)
print("Categorical Columns:", cat_cols)
corr = df.corr()

# correlation of each feature with target column
corr_target = corr["Shift"]
print(corr_target.sort_values(ascending=False))
# -------------------------------
# 4. Build Preprocessing Pipeline
# -------------------------------
# KNN imputer for numeric columns
numeric_transformer = Pipeline(steps=[
    ("imputer", KNNImputer(n_neighbors=5))
])

# One-hot encoding for categorical columns
categorical_transformer = Pipeline(steps=[
    ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

# Combine both
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ]
)

# -------------------------------
# 5. Final Pipeline with Random Forest
# -------------------------------
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=200, random_state=42))
])

# -------------------------------
# 6. Train/Test Split & Model Training
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

# -------------------------------
# 7. Evaluate
# -------------------------------
acc = model.score(X_test, y_test)
print(f"✅ Random Forest Classifier Accuracy for Shift Prediction: {acc:.2f}")

# -------------------------------
# 8. Save model as pickle
# -------------------------------
with open("shift_model.pkl", "wb") as f:
    pickle.dump(model, f)

print("✅ Model saved as shift_model.pkl")


Numeric Columns: ['Injection_Temperature', 'Injection_Pressure', 'Cycle_Time', 'Cooling_Time', 'Material_Viscosity', 'Ambient_Temperature', 'Machine_Age', 'Operator_Experience', 'Maintenance_Hours', 'Temperature_Pressure_Ratio', 'Total_Cycle_Time', 'Efficiency_Score', 'Machine_Utilization', 'Parts_Per_Hour']
Categorical Columns: ['Machine_Type', 'Material_Grade']
✅ Random Forest Classifier Accuracy for Shift Prediction: 0.14
✅ Model saved as shift_model.pkl


In [5]:
df.head()

Unnamed: 0,Timestamp,Injection_Temperature,Injection_Pressure,Cycle_Time,Cooling_Time,Material_Viscosity,Ambient_Temperature,Machine_Age,Operator_Experience,Maintenance_Hours,Shift,Machine_Type,Material_Grade,Day_of_Week,Temperature_Pressure_Ratio,Total_Cycle_Time,Efficiency_Score,Machine_Utilization,Parts_Per_Hour
0,2023-01-01 00:00:00,221.0,136.0,28.7,13.6,375.5,28.0,3.8,11.2,64,Evening,Type_B,Economy,Thursday,1.625,42.3,0.063,0.51,36.5
1,2023-01-01 01:00:00,213.3,128.9,34.5,14.0,215.8,22.6,6.8,6.3,58,Night,Type_A,Standard,Wednesday,1.655,48.5,0.037,0.389,29.9
2,2023-01-01 02:00:00,222.8,115.9,19.9,9.5,307.0,25.3,4.2,9.6,47,Day,Type_A,Standard,Monday,1.922,29.4,0.061,0.551,56.9
3,2023-01-01 03:00:00,233.3,105.3,39.2,13.1,137.8,26.0,9.2,8.6,49,Evening,Type_A,Premium,Saturday,2.215,52.3,0.054,0.293,31.0
4,2023-01-01 04:00:00,212.2,125.5,45.0,9.9,298.2,23.6,6.2,23.0,49,Night,Type_B,Premium,Monday,1.691,54.9,0.145,0.443,15.0


In [6]:
import pandas as pd

# correlation matrix
corr = df.corr()

# correlation of each feature with target column
corr_target = corr["Shift"]
print(corr_target.sort_values(ascending=False))


ValueError: could not convert string to float: '2023-01-01 00:00:00'