# Data Science Capstone Project: Manufacturing Equipment Output Prediction with Linear Regression

## Problem Statement
You are working as a data analyst for a manufacturing company that operates injection molding machines to produce plastic components. The company wants to optimize production efficiency by predicting the hourly output (number of parts produced per hour) based on various machine operating parameters.

## Steps Covered:
1. Data Loading and Exploration
2. Exploratory Data Analysis (EDA)
3. Data Preprocessing
4. Model Building and Training
5. Model Evaluation
6. Insights and Recommendations

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

In [None]:
# Read the manufacturing samples from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='manufacturing_dataset_1000_samples.csv')
print(f"Dataset loaded! Shape: {df.shape}")

# Trailing expression: as the last statement of a notebook cell, the first
# five rows are rendered as the cell output
df.head(n=5)

In [None]:
# Data exploration: column dtypes / non-null counts, then summary statistics
print("Dataset Info:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() would append a stray "None".
df.info()
print("\nSummary Statistics:")
print(df.describe())

In [None]:
# Count the missing entries in every column
print("Missing Values:")
print(df.isnull().sum())

# Histogram of the target variable, drawn through the Axes object
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['Parts_Per_Hour'], bins=30, edgecolor='black')
ax.set_title('Distribution of Parts Per Hour')
ax.set_xlabel('Parts Per Hour')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Columns used by the model: nine machine/operator inputs and one target
target = 'Parts_Per_Hour'
features = [
    'Injection_Temperature',
    'Injection_Pressure',
    'Cycle_Time',
    'Cooling_Time',
    'Material_Viscosity',
    'Ambient_Temperature',
    'Machine_Age',
    'Operator_Experience',
    'Maintenance_Hours',
]

# Heatmap of the pairwise correlations between all features and the target
plt.figure(figsize=(12, 8))
correlation_matrix = df[[*features, target]].corr()
sns.heatmap(correlation_matrix, fmt='.2f', cmap='coolwarm', annot=True)
plt.title('Correlation Matrix')
plt.show()

# Target column of the matrix, sorted from most positive to most negative
# (the target's own correlation of 1.0 is part of the listing)
print("Correlations with target:")
print(correlation_matrix.loc[:, target].sort_values(ascending=False))

In [None]:
# Scatter plots for key features, each with a least-squares trend line
key_features = ['Cycle_Time', 'Injection_Temperature', 'Injection_Pressure', 'Cooling_Time']

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

for i, feature in enumerate(key_features):
    axes[i].scatter(df[feature], df[target], alpha=0.6)
    axes[i].set_xlabel(feature)
    axes[i].set_ylabel(target)
    axes[i].set_title(f'{feature} vs {target}')

    # Fit the trend line only on rows where both values are present: a NaN in
    # either column makes np.polyfit fail or return NaN coefficients.
    paired = df[[feature, target]].dropna()

    # A straight-line fit needs at least two distinct x values.
    if paired[feature].nunique() < 2:
        print(f"Trend line skipped for {feature}: fewer than two distinct values")
        continue

    # Add trend line
    try:
        z = np.polyfit(paired[feature], paired[target], 1)
    except (np.linalg.LinAlgError, TypeError, ValueError) as exc:
        # Keep the scatter plot, but report why no trend line was drawn
        # instead of hiding every possible error.
        print(f"Trend line skipped for {feature}: {exc}")
        continue

    p = np.poly1d(z)
    x_range = np.linspace(paired[feature].min(), paired[feature].max(), 100)
    axes[i].plot(x_range, p(x_range), "r--", alpha=0.8)

plt.tight_layout()
plt.show()

In [None]:
# Keep only the modelling columns and discard rows with any missing value
df_model = df.loc[:, [*features, target]].dropna()

# Separate the predictors from the target
y = df_model[target]
X = df_model[features]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

print(f"Training set: {X_train.shape}")
print(f"Testing set: {X_test.shape}")

In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and the same fitted scaler is then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create the linear regression and fit it on the scaled training data
# (fit() returns the estimator itself, so the two steps can be chained)
model = LinearRegression().fit(X_train_scaled, y_train)

print("Model trained successfully!")
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")

In [None]:
# Predict on both splits with the fitted model
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)


def evaluate_model(y_true, y_pred, dataset_name):
    """Print MSE, RMSE, MAE and R² for one dataset and return (r2, rmse).

    Args:
        y_true: Observed target values.
        y_pred: Model predictions for the same rows.
        dataset_name: Label used in the printed report header.

    Returns:
        A tuple ``(r2, rmse)``.
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)

    # Assemble the whole report first, then emit it with a single print call
    report_lines = [
        f"\n{dataset_name} Performance:",
        f"MSE: {mse:.2f}",
        f"RMSE: {rmse:.2f}",
        f"MAE: {mae:.2f}",
        f"R²: {r2:.4f}",
    ]
    print("\n".join(report_lines))

    return r2, rmse


train_r2, train_rmse = evaluate_model(y_train, y_train_pred, "Training")
test_r2, test_rmse = evaluate_model(y_test, y_test_pred, "Testing")

In [None]:
# Actual vs predicted output, one panel per data split
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# (axes, actual values, predictions, panel title) for the training and testing sets
panels = (
    (ax1, y_train, y_train_pred, f'Training Set: Actual vs Predicted\nR² = {train_r2:.4f}'),
    (ax2, y_test, y_test_pred, f'Testing Set: Actual vs Predicted\nR² = {test_r2:.4f}'),
)

for axis, actual, predicted, panel_title in panels:
    axis.scatter(actual, predicted, alpha=0.6)

    # Dashed diagonal spanning the range of the actual values: points on it
    # are predictions equal to the actual value
    lowest, highest = actual.min(), actual.max()
    axis.plot([lowest, highest], [lowest, highest], 'r--')

    axis.set_xlabel('Actual Parts Per Hour')
    axis.set_ylabel('Predicted Parts Per Hour')
    axis.set_title(panel_title)
    axis.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Rank the features by the magnitude of their fitted coefficients
# (the model was fitted on the scaled feature matrix)
feature_importance = pd.DataFrame({
    'Feature': features,
    'Coefficient': model.coef_,
    'Absolute_Coefficient': np.abs(model.coef_),
}).sort_values(by='Absolute_Coefficient', ascending=False)

print("Feature Importance (by absolute coefficient):")
print(feature_importance)

# Horizontal bar chart; the y-axis is inverted so the largest coefficient is on top
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance['Feature'], feature_importance['Absolute_Coefficient'])
ax.set_xlabel('Absolute Coefficient Value')
ax.set_ylabel('Feature')
ax.set_title('Feature Importance in Linear Regression Model')
ax.invert_yaxis()
plt.show()

In [None]:
# Test-set residuals: actual value minus predicted value
residuals = y_test - y_test_pred

fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15, 6))

# Left panel: residuals against predictions, with a dashed zero reference line
ax1.scatter(y_test_pred, residuals, alpha=0.6)
ax1.axhline(0, linestyle='--', color='r')
ax1.set_title('Residuals vs Predicted Values')
ax1.set_xlabel('Predicted Parts Per Hour')
ax1.set_ylabel('Residuals')

# Right panel: histogram of the residuals
ax2.hist(residuals, bins=30, alpha=0.7, edgecolor='black')
ax2.set_title('Distribution of Residuals')
ax2.set_xlabel('Residuals')
ax2.set_ylabel('Frequency')

# Same light grid on both panels
for panel in (ax1, ax2):
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Residuals mean: {residuals.mean():.4f}")
print(f"Residuals std: {residuals.std():.4f}")

## Manufacturing Insights and Recommendations

### Key Findings:
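1. **Model Performance**: The linear regression model's R² on the test set is the `test_r2` value printed by the model evaluation cell above (markdown cells do not interpolate `{test_r2:.4f}`; record the printed value here after running the notebook)
2. **Most Important Features**: Ranked by the absolute value of the regression coefficients fitted on the standardized features (see the feature importance table and bar chart above)
3. **Business Implications**: The features with the largest absolute coefficients are the operating parameters most strongly associated with hourly output in this model, so they are the first candidates for process tuning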

### Recommendations:
1. **Optimize Cycle Time**: Shorter cycle times lead to higher output
2. **Temperature Control**: Maintain optimal injection temperatures
3. **Pressure Management**: Balance injection pressure for efficiency
4. **Maintenance Schedule**: Regular maintenance prevents performance degradation
5. **Operator Training**: Experienced operators improve productivity

In [None]:
# Persist the fitted artefacts for deployment
import pickle

# One dictionary holding the estimator, the fitted scaler, the feature order
# and the test-set R² that was measured for this model
deployment_bundle = {
    'model': model,
    'scaler': scaler,
    'features': features,
    'r2_score': test_r2
}

with open('manufacturing_model.pkl', mode='wb') as model_file:
    pickle.dump(deployment_bundle, model_file)

print("Model saved as 'manufacturing_model.pkl'")
print("Ready for deployment!")