# Production Planning - Material Yield Prediction System

## SAP Manufacturing Logic
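- **101** = INPUT material consumption (raw materials, `BFIN`)
- **261** = OUTPUT material production (finished goods, `BFOUT` derived from dimensions)
- *Note:* this mapping follows the dataset's own naming (`101.csv` carries `BFIN`, `261.csv` carries `BFOUT`). In standard SAP, movement type 101 is a goods receipt and 261 is a goods issue to an order, so confirm the mapping with the data owner before reusing it elsewhere.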
- Input and Output materials are **DIFFERENT**
- Join **ONLY** on `MANUFACTURINGORDER`
- `Yield = Total_Output_BF / Total_Input_BF` (the code below reports it as a percentage, i.e. multiplied by 100)

## Real-World Use Cases
1. **Forward Planning**: "If I consume X BF of raw material, how much output will I get?"
2. **Reverse Planning**: "If I need Y BF of finished goods, how much raw material do I need?"
3. **Material Selection**: "Which raw material gives the best yield for my needs?"
4. **Anomaly Detection**: "Is this manufacturing order producing abnormal loss?"

## 1. Setup and Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML

# Notebook-wide display defaults: plot theme first, then pandas rendering.
plt.style.use('seaborn-v0_8-whitegrid')

# Show every column and render floats with two decimals.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(option_name, option_value)

print("Imports successful!")

ModuleNotFoundError: No module named 'pandas'

## 2. Load and Clean Data

In [None]:
# Read the two raw CSV extracts from the parent directory.
df_101 = pd.read_csv('../101.csv')
df_261 = pd.read_csv('../261.csv')

# Report the size of each extract.
for extract_label, extract in (("101.csv (Inputs)", df_101), ("261.csv (Outputs)", df_261)):
    row_count, column_count = extract.shape
    print(f"{extract_label}: {row_count:,} rows, {column_count} columns")

In [None]:
# Print column dtypes and per-column null counts for each raw extract.
raw_extracts = (
    ("=== 101.csv (Input Materials) ===", df_101),
    ("\n=== 261.csv (Output Materials) ===", df_261),
)
for banner, raw_extract in raw_extracts:
    print(banner)
    print(raw_extract.dtypes)
    null_counts = raw_extract.isnull().sum()
    print(f"\nMissing values:\n{null_counts}")

### 2.1 Data Cleaning Steps
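1. Remove deleted records (`is_deleted = TRUE`)
2. Convert data types (`POSTINGDATE` to datetime; thickness, tally and BF columns to numeric)
3. Drop rows with missing key columns (`MANUFACTURINGORDER`, `PLANT`, `MATERIAL`)
4. Remove duplicate rows
5. Remove rows with negative `BFIN` / `BFOUT` values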

In [None]:
def clean_dataframe(df, name):
    """Clean one movement extract and return the cleaned copy.

    The steps run in this order; each prints a one-line progress report:

    1. Drop rows whose ``is_deleted`` value, upper-cased as a string, equals
       ``'TRUE'``. The column is left in that upper-cased string form.
    2. Parse ``POSTINGDATE`` to datetime; unparseable values become ``NaT``.
    3. Coerce the thickness, tally and BF columns to numeric; unparseable
       values become ``NaN``.
    4. Drop rows missing any of ``MANUFACTURINGORDER``, ``PLANT``,
       ``MATERIAL``.
    5. Drop fully duplicated rows.
    6. Drop rows with a strictly negative ``BFIN`` / ``BFOUT``. Rows whose BF
       value is missing are kept.

    Every column-specific step is skipped when its column is absent.

    Args:
        df: Raw dataframe to clean. It is never modified; all work happens on
            a copy.
        name: Label used in the printed report.

    Returns:
        A new dataframe holding the rows that survived every step.
    """
    print(f"\n{'='*50}")
    print(f"Cleaning {name}")
    print(f"{'='*50}")

    # Work on a private copy so the column conversions below never leak into
    # the caller's frame (re-running the cell must see the same raw data).
    df = df.copy()
    initial_rows = len(df)

    # 1. Remove deleted records.
    # Comparing the upper-cased string form handles both boolean and string
    # representations of the flag.
    if 'is_deleted' in df.columns:
        df['is_deleted'] = df['is_deleted'].astype(str).str.upper()
        deleted_mask = df['is_deleted'] == 'TRUE'
        df = df[~deleted_mask].copy()
        print(f"1. Removed {deleted_mask.sum():,} deleted records")

    # 2. Convert POSTINGDATE to datetime. The reported count covers every
    # null date after conversion, including values that were already missing.
    if 'POSTINGDATE' in df.columns:
        df['POSTINGDATE'] = pd.to_datetime(df['POSTINGDATE'], errors='coerce')
        invalid_dates = df['POSTINGDATE'].isnull().sum()
        print(f"2. Converted POSTINGDATE to datetime ({invalid_dates} invalid dates)")

    # 3. Convert numeric columns (only those present in this extract).
    numeric_cols = [
        col
        for col in ('MATERIALTHICKNESS', 'TALLYLENGTH', 'TALLYWIDTH', 'BFIN', 'BFOUT')
        if col in df.columns
    ]
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    print(f"3. Converted numeric columns: {numeric_cols}")

    # 4. Drop rows with a missing key column. Only keys that exist are
    # checked, matching the presence guards used by every other step.
    key_cols = [
        col for col in ('MANUFACTURINGORDER', 'PLANT', 'MATERIAL') if col in df.columns
    ]
    if key_cols:
        missing_key = df[key_cols].isnull().any(axis=1).sum()
        df = df.dropna(subset=key_cols)
    else:
        missing_key = 0
    print(f"4. Removed {missing_key:,} rows with missing key columns")

    # 5. Remove duplicates.
    dup_count = df.duplicated().sum()
    df = df.drop_duplicates()
    print(f"5. Removed {dup_count:,} duplicate rows")

    # 6. Remove rows with negative BF values.
    for col in ('BFIN', 'BFOUT'):
        if col in df.columns:
            negative_mask = df[col] < 0
            negative_count = negative_mask.sum()
            # ~(x < 0) rather than (x >= 0): NaN compares False both ways, so
            # the latter would also discard rows with a missing BF value
            # without reporting them.
            df = df[~negative_mask]
            if negative_count > 0:
                print(f"6. Removed {negative_count:,} rows with negative {col}")

    final_rows = len(df)
    # An empty input has nothing to lose; report 100% instead of dividing by 0.
    retained_pct = (final_rows / initial_rows) * 100 if initial_rows else 100.0
    print(f"\nResult: {initial_rows:,} -> {final_rows:,} rows ({initial_rows - final_rows:,} removed, {retained_pct:.1f}% retained)")

    return df

# Run the shared cleaning routine on the input extract, then on the output extract.
df_101_clean = clean_dataframe(df=df_101, name="101.csv (Inputs)")
df_261_clean = clean_dataframe(df=df_261, name="261.csv (Outputs)")

In [None]:
# Post-cleaning overview: row counts, key cardinalities and posting-date span.
print("=== Data Quality Summary After Cleaning ===\n")

summary_targets = (
    ("101.csv (Inputs):", df_101_clean),
    ("\n261.csv (Outputs):", df_261_clean),
)
cardinality_fields = (
    ("Unique Manufacturing Orders", 'MANUFACTURINGORDER'),
    ("Unique Materials", 'MATERIAL'),
    ("Unique Plants", 'PLANT'),
)

for heading, cleaned in summary_targets:
    print(heading)
    print(f"  Rows: {len(cleaned):,}")
    for caption, column in cardinality_fields:
        print(f"  {caption}: {cleaned[column].nunique():,}")
    posting_dates = cleaned['POSTINGDATE']
    print(f"  Date Range: {posting_dates.min()} to {posting_dates.max()}")

In [None]:
# Show the first ten rows of each cleaned extract.
for title, cleaned in (
    ("=== Sample of Cleaned 101 Data (Inputs) ===", df_101_clean),
    ("\n=== Sample of Cleaned 261 Data (Outputs) ===", df_261_clean),
):
    print(title)
    display(cleaned.head(10))

## 3. Random Forest Yield Prediction

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Build one row per manufacturing order: summed input BF plus descriptive
# attributes from the input extract, joined to the summed output BF.

# Aggregate input data by manufacturing order. Named aggregation ties every
# output column name to its source column, so the names cannot drift out of
# sync with the aggregation spec the way a positional rename could.
# 'first' takes one representative value per order (pandas skips nulls by
# default); orders that mix plants/materials are reduced to a single value.
input_agg = (
    df_101_clean
    .groupby('MANUFACTURINGORDER')
    .agg(
        Total_Input_BF=('BFIN', 'sum'),
        Plant=('PLANT', 'first'),
        Input_Material=('MATERIAL', 'first'),
        Thickness=('MATERIALTHICKNESS', 'first'),
    )
    .reset_index()
)

# Aggregate output data by manufacturing order
output_agg = (
    df_261_clean
    .groupby('MANUFACTURINGORDER')
    .agg(Total_Output_BF=('BFOUT', 'sum'))
    .reset_index()
)

# Merge input and output; the inner join keeps only orders present in both extracts
df = input_agg.merge(output_agg, on='MANUFACTURINGORDER', how='inner')

# Calculate yield percentage
df['Yield_Percentage'] = (df['Total_Output_BF'] / df['Total_Input_BF']) * 100

# Keep only yields in (0, 100]. Orders with zero input BF yield inf/NaN, which
# fail both comparisons and are dropped here as well.
# .copy() makes the result an independent frame, so the columns added to `df`
# in the next cell do not trigger SettingWithCopyWarning.
df = df[(df['Yield_Percentage'] > 0) & (df['Yield_Percentage'] <= 100)].copy()

print(f"Merged dataset: {len(df):,} manufacturing orders")
print(f"Yield range: {df['Yield_Percentage'].min():.1f}% - {df['Yield_Percentage'].max():.1f}%")
print(f"Mean yield: {df['Yield_Percentage'].mean():.1f}%")
display(df.head())

In [None]:
# Encode categorical features as integer codes. The fitted encoders are kept
# because predict_yield() reuses them to encode new inputs.
le_plant = LabelEncoder()
le_material = LabelEncoder()

df['Plant_Encoded'] = le_plant.fit_transform(df['Plant'].astype(str))
df['Material_Encoded'] = le_material.fit_transform(df['Input_Material'].astype(str))

# Prepare features and target
feature_cols = ['Total_Input_BF', 'Plant_Encoded', 'Material_Encoded', 'Thickness']
X = df[feature_cols].copy()
y = df['Yield_Percentage'].copy()

# Split data BEFORE imputing, so the fill values are learned from training
# rows only and no test-set statistic leaks into the training data. The split
# itself depends only on the row count and random_state, not on the values.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values with the training-set medians
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)
# X is passed to cross_val_score in the training cell, so keep it NaN-free too.
X = X.fillna(train_medians)

print(f"Features: {feature_cols}")
print(f"Training set: {len(X_train):,} samples")
print(f"Test set: {len(X_test):,} samples")

In [None]:
# Fit the Random Forest regressor on the training split.
rf_params = dict(
    n_estimators=100,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)
rf_model = RandomForestRegressor(**rf_params)

print("Training Random Forest model...")
rf_model.fit(X_train, y_train)
print("Training complete!")

# Predictions for both splits, consumed by the evaluation cells.
y_pred_train = rf_model.predict(X_train)
y_pred_test = rf_model.predict(X_test)

# 5-fold cross-validated R² over the full feature matrix.
cv_scores = cross_val_score(rf_model, X, y, scoring='r2', cv=5)
cv_mean = cv_scores.mean()
cv_spread = cv_scores.std()
print(f"\nCross-validation R² scores: {cv_scores.round(4)}")
print(f"Mean CV R²: {cv_mean:.4f} ± {cv_spread:.4f}")

In [None]:
# Report R², MAE and RMSE for the training and test splits.
rule = "=" * 50
print(rule)
print("RANDOM FOREST MODEL EVALUATION")
print(rule)

# Training metrics
train_r2 = r2_score(y_train, y_pred_train)
train_mae = mean_absolute_error(y_train, y_pred_train)
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))

# Test metrics
test_r2 = r2_score(y_test, y_pred_test)
test_mae = mean_absolute_error(y_test, y_pred_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

# One identically formatted section per split.
split_reports = (
    ("Training", train_r2, train_mae, train_rmse),
    ("Test", test_r2, test_mae, test_rmse),
)
for split_name, split_r2, split_mae, split_rmse in split_reports:
    print(f"\n{split_name} Set:")
    print(f"  R² Score: {split_r2:.4f}")
    print(f"  MAE: {split_mae:.2f}%")
    print(f"  RMSE: {split_rmse:.2f}%")

In [None]:
# Rank the model's features by importance, most important first.
importance_df = pd.DataFrame(
    {'Feature': feature_cols, 'Importance': rf_model.feature_importances_}
).sort_values(by='Importance', ascending=False)

print("Feature Importance:")
display(importance_df)

# Horizontal bar chart; the y-axis is inverted so the top-ranked feature is drawn at the top.
fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(importance_df['Feature'], importance_df['Importance'], color='forestgreen')
ax.set_xlabel('Importance')
ax.set_title('Random Forest Feature Importance')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

In [None]:
# Two diagnostics side by side: actual-vs-predicted scatter and the error histogram.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
scatter_ax, hist_ax = axes

# Left panel: test-set scatter with the y = x reference line.
yield_span = [y_test.min(), y_test.max()]
scatter_ax.scatter(y_test, y_pred_test, alpha=0.5, color='forestgreen')
scatter_ax.plot(yield_span, yield_span, 'r--', label='Perfect')
scatter_ax.set_xlabel('Actual Yield %')
scatter_ax.set_ylabel('Predicted Yield %')
scatter_ax.set_title('Actual vs Predicted Yield')
scatter_ax.legend()

# Right panel: distribution of (actual - predicted) with a zero marker.
errors = y_test - y_pred_test
mean_error = errors.mean()
hist_ax.hist(errors, bins=30, color='forestgreen', edgecolor='black', alpha=0.7)
hist_ax.axvline(0, color='red', linestyle='--')
hist_ax.set_xlabel('Prediction Error (%)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title(f'Error Distribution (Mean: {mean_error:.2f}%)')

plt.tight_layout()
plt.show()

In [None]:
# Function to predict yield for new inputs
def predict_yield(input_bf, plant, material, thickness):
    """Predict the yield percentage and expected output BF for one planned run.

    Uses objects created by earlier notebook cells: the fitted encoders
    ``le_plant`` / ``le_material``, the trained ``rf_model``, the column list
    ``feature_cols`` and the training matrix ``X_train``.

    Args:
        input_bf: Planned raw-material consumption in BF.
        plant: Plant identifier; converted with ``str()`` before encoding.
        material: Input material identifier; converted with ``str()`` before
            encoding.
        thickness: Material thickness. ``None``/NaN is replaced with the
            median thickness of the training rows.

    Returns:
        dict with keys ``input_bf``, ``predicted_yield_pct`` and
        ``expected_output_bf`` (``input_bf * predicted_yield_pct / 100``).

    Warns:
        UserWarning: when ``plant`` or ``material`` was not seen by its
            encoder, or when ``thickness`` is missing. A prediction is still
            returned in each case.
    """
    import warnings

    def _encode(encoder, value, kind):
        """Return the encoder's integer code for value; warn and use 0 if unseen."""
        try:
            return encoder.transform([str(value)])[0]
        except ValueError:
            # Best-effort fallback kept from the original behaviour, but no
            # longer silent: code 0 also belongs to a real, known class.
            warnings.warn(
                f"Unknown {kind} {value!r}: falling back to encoded value 0, "
                f"which is also the code of a known {kind}; treat this "
                "prediction with caution.",
                stacklevel=3,
            )
            return 0

    plant_enc = _encode(le_plant, plant, "plant")
    material_enc = _encode(le_material, material, "material")

    # Training rows had missing feature values median-filled before fitting,
    # so a missing thickness gets the same kind of treatment here instead of
    # being handed to the model as NaN.
    if pd.isna(thickness):
        thickness = X_train['Thickness'].median()
        warnings.warn(
            "Missing thickness: using the training-set median "
            f"({thickness}) instead.",
            stacklevel=2,
        )

    # Single-row frame with the training column names and order
    X_new = pd.DataFrame(
        [[input_bf, plant_enc, material_enc, thickness]],
        columns=feature_cols,
    )

    # Predict
    predicted_yield = rf_model.predict(X_new)[0]
    expected_output = input_bf * predicted_yield / 100

    return {
        'input_bf': input_bf,
        'predicted_yield_pct': predicted_yield,
        'expected_output_bf': expected_output
    }

# Demo: predict for 10,000 BF using the plant, material and thickness of the
# first order in the modelling dataset.
first_order = {
    field: df[field].iat[0] for field in ('Plant', 'Input_Material', 'Thickness')
}
example = predict_yield(
    input_bf=10000,
    plant=first_order['Plant'],
    material=first_order['Input_Material'],
    thickness=first_order['Thickness'],
)

print(
    "Example Prediction:",
    f"  Input: {example['input_bf']:,} BF",
    f"  Predicted Yield: {example['predicted_yield_pct']:.1f}%",
    f"  Expected Output: {example['expected_output_bf']:,.0f} BF",
    sep="\n",
)