In [1]:
import joblib
import yaml
import os

In [2]:
# Data wrangling
import pandas as pd
import numpy as np

In [3]:
# Load the configuration file
with open('../config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Access the settings
project_folder = config['projectFolder']
df_path = os.path.join(project_folder, config['transformedDataFile'])
df = pd.read_csv(df_path)

# Display the first few rows
df.head()

Unnamed: 0,% Iron Feed,Starch Flow,Amina Flow,Ore Pulp Flow,Ore Pulp pH,Ore Pulp Density,Flotation Column 01 Air Flow,Flotation Column 02 Air Flow,Flotation Column 04 Air Flow,Flotation Column 05 Air Flow,...,Flotation Column 07 Air Flow,Flotation Column 01 Level,Flotation Column 02 Level,Flotation Column 03 Level,Flotation Column 04 Level,Flotation Column 05 Level,Flotation Column 06 Level,Flotation Column 07 Level,% Iron Concentrate,% Silica Concentrate
0,55.2,3170.41,539.673,399.697,10.1589,1.6691,249.291,248.269,295.096,306.4,...,249.774,462.601,488.724,441.674,433.629,448.477,480.866,489.382,67.06,1.11
1,55.2,3365.65,573.517,399.023,10.086,1.70565,249.379,253.312,295.096,306.4,...,249.06,456.445,440.432,456.625,432.736,464.334,445.95,432.906,66.97,1.27
2,55.2,2693.75,592.133,409.204,9.9488,1.72472,248.302,251.906,295.096,306.4,...,249.236,459.248,480.114,453.814,433.885,438.642,421.974,408.193,66.75,1.36
3,55.2,2352.216,601.807,398.145,9.84375,1.75928,248.95,246.313,295.096,306.4,...,252.686,558.545,548.28,551.96,561.72,543.07,529.26,514.24,66.63,1.34
4,55.2,3313.96,626.099,399.785,9.7471,1.77,248.379,250.532,295.096,306.4,...,251.323,559.346,534.12,539.332,538.596,545.27,575.404,595.68,66.85,1.15


In [4]:
df.describe()

Unnamed: 0,% Iron Feed,Starch Flow,Amina Flow,Ore Pulp Flow,Ore Pulp pH,Ore Pulp Density,Flotation Column 01 Air Flow,Flotation Column 02 Air Flow,Flotation Column 04 Air Flow,Flotation Column 05 Air Flow,...,Flotation Column 07 Air Flow,Flotation Column 01 Level,Flotation Column 02 Level,Flotation Column 03 Level,Flotation Column 04 Level,Flotation Column 05 Level,Flotation Column 06 Level,Flotation Column 07 Level,% Iron Concentrate,% Silica Concentrate
count,3925.0,3925.0,3925.0,3925.0,3925.0,3925.0,3925.0,3925.0,3925.0,3925.0,...,3925.0,3925.0,3925.0,3925.0,3925.0,3925.0,3925.0,3925.0,3925.0,3925.0
mean,56.293024,2833.364762,488.085359,397.500566,9.766777,1.68085,280.064866,277.062984,299.442823,299.926067,...,290.766086,520.452849,523.286713,531.594141,421.575527,426.241166,430.972674,421.619552,65.043068,2.333597
std,5.131583,1221.352011,90.702941,9.644884,0.38455,0.068926,29.598607,29.819333,2.56533,3.580891,...,28.826321,131.107835,128.913286,150.202214,90.477129,83.881945,90.196521,84.578366,1.120171,1.129771
min,45.38,145.77308,247.3343,376.92164,8.754963,1.520103,177.484459,187.280618,294.361643,289.923504,...,195.026,152.346,211.336333,127.120281,162.717796,167.22502,159.87,177.474268,62.05,0.6
25%,52.67,2054.14,432.908,394.186,9.52619,1.64715,250.291,250.541,298.225,298.115828,...,255.86,417.458,441.764,411.138,357.408,358.597,359.397,357.235,64.35,1.44
50%,56.08,2989.89,504.648,399.24,9.80022,1.69824,299.36,296.199,299.809,299.825,...,299.048,491.978,496.497,494.079,412.561,409.507194,426.205,411.116,65.21,2.0
75%,59.72,3724.61,555.143,402.833,10.0368,1.72908,300.146,300.68,300.608,301.764,...,301.891,594.013,595.65,601.435,486.687,483.063,494.31,478.637,65.85,3.01
max,64.03,5504.236,668.04644,417.735479,10.583488,1.789418,303.00076,310.05328,305.379955,306.4,...,371.248,861.600571,828.599547,886.773,679.49,674.076707,698.570061,656.7475,68.01,5.53


In [5]:
# Define features (X) and target variable (y)
X = df.drop('% Silica Concentrate', axis=1)  # Features (all columns except 'Job Offer')
y = df['% Silica Concentrate']  # Target variable

In [6]:
# Load saved model
model_path = '../model/final_model.pkl'  # Update with your actual path
final_model = joblib.load(model_path)
final_model.set_params(verbosity=-1)

0,1,2
,boosting_type,'gbdt'
,num_leaves,141
,max_depth,11
,learning_rate,0.00637425781363651
,n_estimators,1663
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.04105164630493047
,min_child_weight,0.001


In [7]:
# Create a simple prediction function
def predict_silica_concentration(data, model):
    """
    Make predictions using the saved model.

    Parameters:
    data (DataFrame): Input features in the same format as the training data
    model (object): Trained model loaded from disk

    Returns:
    float: Predicted silica concentration percentage
    """
    
    # Ensure data has the same columns as the training data
    required_columns = X.columns
    missing_columns = set(required_columns) - set(data.columns)
    if missing_columns:
        raise ValueError(f"Missing columns: {missing_columns}")

    # Select and order columns
    data = data[required_columns]

    # Make prediction
    prediction = model.predict(data)

    return prediction[0] if len(prediction) == 1 else prediction

In [8]:
print("\n18. PREDICTION EXAMPLE")
print("======================")

# Method 1: Use a random sample from the existing dataset
sample_idx = np.random.randint(0, len(df))
sample_data = X.iloc[[sample_idx]]  # Keep as DataFrame with double brackets

print(f"Sample input data (row {sample_idx}):")
print("-" * 40)
for col in sample_data.columns[:5]:  # Show first 5 features
    print(f"{col}: {sample_data[col].iloc[0]:.2f}")
print("... (and more features)")

# Make prediction
predicted_silica = predict_silica_concentration(sample_data, final_model)
actual_silica = y.iloc[sample_idx]

print(f"\nPrediction Results:")
print("-" * 20)
print(f"Predicted % Silica Concentrate: {predicted_silica:.3f}%")
print(f"Actual % Silica Concentrate: {actual_silica:.3f}%")
print(f"Prediction Error: {abs(predicted_silica - actual_silica):.3f}%")


18. PREDICTION EXAMPLE
Sample input data (row 111):
----------------------------------------
% Iron Feed: 58.19
Starch Flow: 3239.45
Amina Flow: 533.66
Ore Pulp Flow: 396.67
Ore Pulp pH: 9.50
... (and more features)

Prediction Results:
--------------------
Predicted % Silica Concentrate: 2.478%
Actual % Silica Concentrate: 3.480%
Prediction Error: 1.002%


In [9]:
# Method 2: Create a custom example with typical values
print("\n" + "="*50)
print("Custom Example with Typical Operating Conditions:")
print("="*50)

# Create sample data with typical values from dataset statistics
custom_sample = pd.DataFrame({
    '% Iron Feed': [56.0],  # Close to median
    'Starch Flow': [3000.0],  # Close to median  
    'Amina Flow': [500.0],  # Close to median
    'Ore Pulp Flow': [399.0],  # Close to median
    'Ore Pulp pH': [9.8],  # Close to median
    'Ore Pulp Density': [1.7],  # Close to median
})

# Add remaining columns with median values
for col in X.columns:
    if col not in custom_sample.columns:
        custom_sample[col] = [X[col].median()]

print("Custom input conditions:")
print("-" * 25)
key_features = ['% Iron Feed', 'Starch Flow', 'Amina Flow', 'Ore Pulp Flow', 'Ore Pulp pH']
for feature in key_features:
    print(f"{feature}: {custom_sample[feature].iloc[0]:.2f}")

# Make prediction on custom sample
custom_prediction = predict_silica_concentration(custom_sample, final_model)

print(f"\nPredicted % Silica Concentrate: {custom_prediction:.3f}%")


Custom Example with Typical Operating Conditions:
Custom input conditions:
-------------------------
% Iron Feed: 56.00
Starch Flow: 3000.00
Amina Flow: 500.00
Ore Pulp Flow: 399.00
Ore Pulp pH: 9.80

Predicted % Silica Concentrate: 1.893%


In [10]:
# Method 3: Show how to use the function for new plant data
print("\n" + "="*50)
print("How to Use for New Plant Data:")
print("="*50)

# Create a complete example with all features
all_features_example = pd.DataFrame({
    # Primary process parameters
    '% Iron Feed': [58.5],
    'Starch Flow': [2800.0],
    'Amina Flow': [450.0],
    'Ore Pulp Flow': [401.2],
    'Ore Pulp pH': [9.9],
    'Ore Pulp Density': [1.68],
    
    # Flotation Column Air Flows
    'Flotation Column 01 Air Flow': [285.0],
    'Flotation Column 02 Air Flow': [280.0],
    'Flotation Column 03 Air Flow': [295.0],
    'Flotation Column 04 Air Flow': [300.0],
    'Flotation Column 05 Air Flow': [302.0],
    'Flotation Column 06 Air Flow': [298.0],
    'Flotation Column 07 Air Flow': [292.0],
    
    # Flotation Column Levels
    'Flotation Column 01 Level': [520.0],
    'Flotation Column 02 Level': [525.0],
    'Flotation Column 03 Level': [535.0],
    'Flotation Column 04 Level': [420.0],
    'Flotation Column 05 Level': [425.0],
    'Flotation Column 06 Level': [430.0],
    'Flotation Column 07 Level': [422.0],
    
    # Iron Concentrate
    '% Iron Concentrate': [65.2],
})

# Ensure all required features are present
missing_features = set(X.columns) - set(all_features_example.columns)
if missing_features:
    print(f"Adding {len(missing_features)} missing features with median values...")
    for feature in missing_features:
        all_features_example[feature] = [X[feature].median()]

# Reorder columns to match training data
all_features_example = all_features_example[X.columns]

print("Complete example with ALL features:")
print("-" * 35)
print("Main Process Parameters:")
main_features = ['% Iron Feed', 'Starch Flow', 'Amina Flow', 'Ore Pulp Flow', 'Ore Pulp pH', 'Ore Pulp Density']
for feature in main_features:
    print(f"  {feature}: {all_features_example[feature].iloc[0]:.2f}")

print("\nFlotation Column Air Flows:")
air_flow_features = [col for col in X.columns if 'Air Flow' in col]
for feature in air_flow_features:
    print(f"  {feature}: {all_features_example[feature].iloc[0]:.2f}")

print("\nFlotation Column Levels:")
level_features = [col for col in X.columns if 'Level' in col]
for feature in level_features:
    print(f"  {feature}: {all_features_example[feature].iloc[0]:.2f}")

print("\nConcentrate Parameters:")
concentrate_features = [col for col in X.columns if 'Concentrate' in col]
for feature in concentrate_features:
    print(f"  {feature}: {all_features_example[feature].iloc[0]:.2f}")

# Make prediction on complete example
complete_prediction = predict_silica_concentration(all_features_example, final_model)

print(f"\nPredicted % Silica Concentrate: {complete_prediction:.3f}%")


How to Use for New Plant Data:
Complete example with ALL features:
-----------------------------------
Main Process Parameters:
  % Iron Feed: 58.50
  Starch Flow: 2800.00
  Amina Flow: 450.00
  Ore Pulp Flow: 401.20
  Ore Pulp pH: 9.90
  Ore Pulp Density: 1.68

Flotation Column Air Flows:
  Flotation Column 01 Air Flow: 285.00
  Flotation Column 02 Air Flow: 280.00
  Flotation Column 04 Air Flow: 300.00
  Flotation Column 05 Air Flow: 302.00
  Flotation Column 06 Air Flow: 298.00
  Flotation Column 07 Air Flow: 292.00

Flotation Column Levels:
  Flotation Column 01 Level: 520.00
  Flotation Column 02 Level: 525.00
  Flotation Column 03 Level: 535.00
  Flotation Column 04 Level: 420.00
  Flotation Column 05 Level: 425.00
  Flotation Column 06 Level: 430.00
  Flotation Column 07 Level: 422.00

Concentrate Parameters:
  % Iron Concentrate: 65.20

Predicted % Silica Concentrate: 1.931%
