# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter


# sklearn imports
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
import xgboost as xgb

from sklearn.metrics import r2_score,mean_absolute_error,root_mean_squared_error, make_scorer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

import sys
from pathlib import Path
root = Path().resolve().parent
sys.path.append(str(root))

# plotting style
sns.set_theme(style="whitegrid")
%matplotlib inline

# Load the data

In [2]:
data=pd.read_csv('../data/Low-alloy-steel-data-cleaned.csv')
data.rename(columns={'Unnamed: 0':'id'},inplace=True)
data.set_index('id',inplace=True)
data

Unnamed: 0_level_0,alloy_id,C,Si,Mn,P,S,Ni,Cr,Mo,Cu,V,Al,N,Ceq,Nb + Ta,Temperature_C,0.2%_Proof_Stress_MPa,Tensile_Strength_MPa,Elongation_%,Reduction_in_Area_%
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,MBB,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.610,0.04,0.000,0.003,0.0066,0.0,0.0000,27,342,490,30,71
1,MBB,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.610,0.04,0.000,0.003,0.0066,0.0,0.0000,100,338,454,27,72
2,MBB,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.610,0.04,0.000,0.003,0.0066,0.0,0.0000,200,337,465,23,69
3,MBB,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.610,0.04,0.000,0.003,0.0066,0.0,0.0000,300,346,495,21,70
4,MBB,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.610,0.04,0.000,0.003,0.0066,0.0,0.0000,400,316,489,26,79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
910,CCB,0.22,0.22,1.24,0.021,0.008,0.030,0.05,0.017,0.01,0.005,0.005,0.0116,0.0,0.0017,350,268,632,28,65
911,CCB,0.22,0.22,1.24,0.021,0.008,0.030,0.05,0.017,0.01,0.005,0.005,0.0116,0.0,0.0017,400,244,575,28,68
912,CCB,0.22,0.22,1.24,0.021,0.008,0.030,0.05,0.017,0.01,0.005,0.005,0.0116,0.0,0.0017,450,224,500,29,72
913,CCB,0.22,0.22,1.24,0.021,0.008,0.030,0.05,0.017,0.01,0.005,0.005,0.0116,0.0,0.0017,500,209,428,30,78


In [3]:
data.info()
data.columns

<class 'pandas.core.frame.DataFrame'>
Index: 915 entries, 0 to 914
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   alloy_id               915 non-null    object 
 1   C                      915 non-null    float64
 2   Si                     915 non-null    float64
 3   Mn                     915 non-null    float64
 4   P                      915 non-null    float64
 5   S                      915 non-null    float64
 6   Ni                     915 non-null    float64
 7   Cr                     915 non-null    float64
 8   Mo                     915 non-null    float64
 9   Cu                     915 non-null    float64
 10  V                      915 non-null    float64
 11  Al                     915 non-null    float64
 12  N                      915 non-null    float64
 13  Ceq                    915 non-null    float64
 14  Nb + Ta                915 non-null    float64
 15  Temperature

Index(['alloy_id', 'C', 'Si', 'Mn', 'P', 'S', 'Ni', 'Cr', 'Mo', 'Cu', 'V',
       'Al', 'N', 'Ceq', 'Nb + Ta', 'Temperature_C', '0.2%_Proof_Stress_MPa',
       'Tensile_Strength_MPa', 'Elongation_%', 'Reduction_in_Area_%'],
      dtype='object')

# Define features and targets

In [4]:
features = ['C','Mn','Cr','Ni','Al','Temperature_C']
targets = ['Tensile_Strength_MPa','Elongation_%']

# Train/Test Split

In [5]:
train_df, test_df = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data['alloy_id']
)

X_train = train_df[features]
y_train = train_df[targets]

X_test = test_df[features]
y_test = test_df[targets]

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train shape: (732, 6), Test shape: (183, 6)


# Feature Engineering

In [6]:
from utils.feature_engineering import enhanced_features
# def enhanced_features(X):
#     X_new = X.copy()
#     X_new['Temperature_C_squared'] = X_new[features[-1]] ** 2
#     for elem in features[0:-1]:
#         X_new[f'Temperature_C_x_{elem}'] = X_new[features[-1]] * X_new[elem]
#     return X_new

# Preprocessing

In [7]:
numeric_features = features
preprocessor = Pipeline([
    ('enhanced', FunctionTransformer(enhanced_features)),
    ('scaler', StandardScaler())
])

X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=train_df['alloy_id']
)

# Checking the preprocessing transformer
try:
    preprocessor.fit_transform(X_train)
    print("Transformer working correctly")
except Exception as e:
    print('Error!!',{e})

Transformer working correctly


# Model Pipelines (Baseline)

In [8]:
pipelines={}

for target in targets:
    for name in ['Ridge','RandomForest','XGBoost']:
        # Create fresh model for each target
        if name=='Ridge':
            base_model = Ridge(random_state=42)
        elif name=='RandomForest':
            base_model = RandomForestRegressor(random_state=42, n_jobs=-1)
        else:
            base_model = xgb.XGBRegressor(random_state=42, n_jobs=-1, verbosity=0)
        model_pipe = Pipeline([
            ('preprocessing', preprocessor),
            ('regressor', base_model)
        ])
        
        pipelines[f'{name}_{target}_pipeline']=model_pipe

pipelines

{'Ridge_Tensile_Strength_MPa_pipeline': Pipeline(steps=[('preprocessing',
                  Pipeline(steps=[('enhanced',
                                   FunctionTransformer(func=<function enhanced_features at 0x000001BE9CBD5120>)),
                                  ('scaler', StandardScaler())])),
                 ('regressor', Ridge(random_state=42))]),
 'RandomForest_Tensile_Strength_MPa_pipeline': Pipeline(steps=[('preprocessing',
                  Pipeline(steps=[('enhanced',
                                   FunctionTransformer(func=<function enhanced_features at 0x000001BE9CBD5120>)),
                                  ('scaler', StandardScaler())])),
                 ('regressor',
                  RandomForestRegressor(n_jobs=-1, random_state=42))]),
 'XGBoost_Tensile_Strength_MPa_pipeline': Pipeline(steps=[('preprocessing',
                  Pipeline(steps=[('enhanced',
                                   FunctionTransformer(func=<function enhanced_features at 0x000001BE9CBD

Quick cross-validation (5-fold) to get baseline scores

In [9]:
# 5-fold cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)

rmse_scorer = make_scorer(root_mean_squared_error, greater_is_better=False)

cv_results={}

for name,pipeline in pipelines.items():
    target='_'.join(name.split('_')[1:-1])

    r2_scores = cross_val_score(pipeline, X_train, y_train[target], cv=kf, scoring='r2', n_jobs=-1)
    rmse_scores = cross_val_score(pipeline, X_train, y_train[target], cv=kf, scoring=rmse_scorer, n_jobs=-1)

    cv_results[name] = {
        'mean_r2': np.mean(r2_scores),
        'std_r2': np.std(r2_scores),
        'mean_rmse': -np.mean(rmse_scores),
        'std_rmse': np.std(rmse_scores)
    }

In [11]:
print('Summary:\n')
for name,result in cv_results.items():
    print(name,':')
    for n,m in result.items():
        print(n,':',m)
    print('\n')

Summary:

Ridge_Tensile_Strength_MPa_pipeline :
mean_r2 : 0.5522216500564747
std_r2 : 0.2481273323068255
mean_rmse : 168.88032003281003
std_rmse : 195.96296749615053


RandomForest_Tensile_Strength_MPa_pipeline :
mean_r2 : 0.607781042195522
std_r2 : 0.3741783272255972
mean_rmse : 155.71790801918087
std_rmse : 203.1511068219531


XGBoost_Tensile_Strength_MPa_pipeline :
mean_r2 : -2.4639719247817995
std_r2 : 6.372272330638711
mean_rmse : 230.71904983520508
std_rmse : 242.53823730269136


Ridge_Elongation_%_pipeline :
mean_r2 : 0.6040748630776619
std_r2 : 0.0189178545053521
mean_rmse : 5.466361236626415
std_rmse : 0.2730916933902691


RandomForest_Elongation_%_pipeline :
mean_r2 : 0.8226119865436026
std_r2 : 0.010552279341031869
mean_rmse : 3.667626538403378
std_rmse : 0.3280997119064592


XGBoost_Elongation_%_pipeline :
mean_r2 : 0.8099064469337464
std_r2 : 0.036269556752082516
mean_rmse : 3.7653337955474853
std_rmse : 0.311045058721928




# Train on training set and evaluate on validation set

In [12]:
val_results = {}

for pipe_name, pipeline in pipelines.items():
    target = '_'.join(pipe_name.split('_')[1:-1])
    y_tr_target = y_train[target]
    y_val_target = y_val[target]
    
    # Fit the pipeline on training data
    pipeline.fit(X_train, y_tr_target)
    
    # Predict on validation set
    y_pred = pipeline.predict(X_val)
    
    # Evaluate
    r2 = r2_score(y_val_target, y_pred)
    rmse = root_mean_squared_error(y_val_target, y_pred)
    mae = mean_absolute_error(y_val_target, y_pred)
    
    val_results[pipe_name] = {
        'R2': r2,
        'RMSE': rmse,
        'MAE': mae
    }

In [13]:
print('Summary:\n')
for name,result in val_results.items():
    print(name,':')
    for n,m in result.items():
        print(n,':',m)
    print('\n')

Summary:

Ridge_Tensile_Strength_MPa_pipeline :
R2 : 0.7427332522010399
RMSE : 64.43301951537855
MAE : 50.458723026743655


RandomForest_Tensile_Strength_MPa_pipeline :
R2 : -0.0710770448787621
RMSE : 131.47010558261815
MAE : 36.55952380952381


XGBoost_Tensile_Strength_MPa_pipeline :
R2 : -9.453086853027344
RMSE : 410.7134704589844
MAE : 58.559391021728516


Ridge_Elongation_%_pipeline :
R2 : 0.5145328021799332
RMSE : 6.232994975276668
MAE : 4.267983107720646


RandomForest_Elongation_%_pipeline :
R2 : 0.7439243223238181
RMSE : 4.526900998813783
MAE : 2.6197278911564625


XGBoost_Elongation_%_pipeline :
R2 : 0.8149605989456177
RMSE : 3.848123073577881
MAE : 2.323936939239502




# Hyperparameter Tuning for all 6 pipelines

In [14]:
ridge_params = {'regressor__alpha': [0.1, 1.0, 10.0, 50.0, 100.0]}
rf_params = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 5, 10, 20],
    'regressor__min_samples_split': [2, 5, 10]
}
xgb_params = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [3, 5],
    'regressor__learning_rate': [0.05, 0.1],
    'regressor__subsample': [0.8, 1.0]
}

# Store tuned pipelines and results
tuned_pipelines = {}
tuned_results = {}

for pipe_name, pipeline in pipelines.items():
    target = '_'.join(pipe_name.split('_')[1:-1])
    y_tr_target = y_train[target]
    y_val_target = y_val[target]

    print(f"\n=== Hyperparameter tuning for {pipe_name} ===")
    
    if 'Ridge' in pipe_name:
        grid = ridge_params
    elif 'RandomForest' in pipe_name:
        grid = rf_params
    else:
        grid = xgb_params
    
    search = GridSearchCV(pipeline, grid, cv=5, scoring='r2', n_jobs=-1)
    
    # Fit search
    search.fit(X_train, y_tr_target)
    
    # Best model
    best_pipeline = search.best_estimator_
    tuned_pipelines[pipe_name] = best_pipeline
    
    # Predict on validation
    y_pred = best_pipeline.predict(X_val)
    r2 = r2_score(y_val_target, y_pred)
    rmse = root_mean_squared_error(y_val_target, y_pred)
    mae = mean_absolute_error(y_val_target, y_pred)
    
    tuned_results[pipe_name] = {
        'R2': r2,
        'RMSE': rmse,
        'MAE': mae,
        'best_params': search.best_params_
    }
    
    print(f"{pipe_name} -> R2: {r2:.3f}, RMSE: {rmse:.3f}, MAE: {mae:.3f}")
    print(f"Best params: {search.best_params_}")


=== Hyperparameter tuning for Ridge_Tensile_Strength_MPa_pipeline ===
Ridge_Tensile_Strength_MPa_pipeline -> R2: 0.734, RMSE: 65.544, MAE: 50.628
Best params: {'regressor__alpha': 10.0}

=== Hyperparameter tuning for RandomForest_Tensile_Strength_MPa_pipeline ===
RandomForest_Tensile_Strength_MPa_pipeline -> R2: 0.035, RMSE: 124.759, MAE: 38.990
Best params: {'regressor__max_depth': 10, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 300}

=== Hyperparameter tuning for XGBoost_Tensile_Strength_MPa_pipeline ===
XGBoost_Tensile_Strength_MPa_pipeline -> R2: -3.067, RMSE: 256.194, MAE: 63.025
Best params: {'regressor__learning_rate': 0.05, 'regressor__max_depth': 3, 'regressor__n_estimators': 100, 'regressor__subsample': 1.0}

=== Hyperparameter tuning for Ridge_Elongation_%_pipeline ===
Ridge_Elongation_%_pipeline -> R2: 0.515, RMSE: 6.233, MAE: 4.268
Best params: {'regressor__alpha': 1.0}

=== Hyperparameter tuning for RandomForest_Elongation_%_pipeline ===
RandomForest_E

In [15]:
print("\nTuned validation results summary:")
for name,result in val_results.items():
    print(name,':')
    for n,m in result.items():
        print(n,':',m)
    print('\n')


Tuned validation results summary:
Ridge_Tensile_Strength_MPa_pipeline :
R2 : 0.7427332522010399
RMSE : 64.43301951537855
MAE : 50.458723026743655


RandomForest_Tensile_Strength_MPa_pipeline :
R2 : -0.0710770448787621
RMSE : 131.47010558261815
MAE : 36.55952380952381


XGBoost_Tensile_Strength_MPa_pipeline :
R2 : -9.453086853027344
RMSE : 410.7134704589844
MAE : 58.559391021728516


Ridge_Elongation_%_pipeline :
R2 : 0.5145328021799332
RMSE : 6.232994975276668
MAE : 4.267983107720646


RandomForest_Elongation_%_pipeline :
R2 : 0.7439243223238181
RMSE : 4.526900998813783
MAE : 2.6197278911564625


XGBoost_Elongation_%_pipeline :
R2 : 0.8149605989456177
RMSE : 3.848123073577881
MAE : 2.323936939239502




Tensile Strength → Ridge

Elongation → XGBoost

# Further Hyperparameter Tuning

### Ridge + PolynomialFeatures for Tensile Strength: New Pipeline
### XGBoost for Elongation: reuse the previously tuned pipeline from tuned_pipelines

In [16]:
from sklearn.preprocessing import PolynomialFeatures

# New Pipeline: Tensile Strength: Ridge + Polynomial Features
ts_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('poly', PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)),
    ('regressor', Ridge(random_state=42))
])

ts_param_grid = {
    'regressor__alpha': [0.01, 0.1, 1, 10, 50]
}

ts_search = GridSearchCV(ts_pipeline, ts_param_grid, cv=5, scoring='r2', n_jobs=-1)
ts_search.fit(X_train, y_train['Tensile_Strength_MPa'])

ts_pipeline=ts_search.best_estimator_

# Evaluate
y_pred_ts = ts_pipeline.predict(X_val)
print("Tensile Strength - Best alpha:", ts_search.best_params_)
print("Validation R2:", r2_score(y_val['Tensile_Strength_MPa'], y_pred_ts))
print("Validation RMSE:", root_mean_squared_error(y_val['Tensile_Strength_MPa'], y_pred_ts))
print("Validation MAE:", mean_absolute_error(y_val['Tensile_Strength_MPa'], y_pred_ts))



# Further tuning of Elongation: XGBoost
el_pipeline_prev = tuned_pipelines['XGBoost_Elongation_%_pipeline']

el_param_grid = {
    'regressor__n_estimators': [200, 300],
    'regressor__max_depth': [5, 6],
    'regressor__learning_rate': [0.05, 0.1],
    'regressor__subsample': [0.8, 1.0],
    'regressor__colsample_bytree': [0.8, 1.0],
    'regressor__min_child_weight': [1, 3]
}

el_search = GridSearchCV(el_pipeline_prev, el_param_grid, cv=5, scoring='r2', n_jobs=-1)
el_search.fit(X_train, y_train['Elongation_%'])
el_pipeline=el_search.best_estimator_

# Evaluate
y_pred_el = el_pipeline.predict(X_val)
print("Elongation - Best params:", el_search.best_params_)
print("Validation R2:", r2_score(y_val['Elongation_%'], y_pred_el))
print("Validation RMSE:", root_mean_squared_error(y_val['Elongation_%'], y_pred_el))
print("Validation MAE:", mean_absolute_error(y_val['Elongation_%'], y_pred_el))

Tensile Strength - Best alpha: {'regressor__alpha': 50}
Validation R2: 0.7845636634501386
Validation RMSE: 58.96253316783718
Validation MAE: 42.90532009356876
Elongation - Best params: {'regressor__colsample_bytree': 0.8, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 5, 'regressor__min_child_weight': 3, 'regressor__n_estimators': 300, 'regressor__subsample': 0.8}
Validation R2: 0.8121652007102966
Validation RMSE: 3.8770809173583984
Validation MAE: 2.2123355865478516


Ridge_Tensile_Strength_MPa_pipeline :
- R2 : 0.7427332522010399
- RMSE : 64.43301951537855
- MAE : 50.458723026743655

New Pipeline:
- R2: 0.7845636634501386
- RMSE: 58.96253316783718
- MAE: 42.90532009356876


XGBoost_Elongation_%_pipeline :
- R2 : 0.8149605989456177
- RMSE : 3.848123073577881
- MAE : 2.323936939239502

Further tuned:
- R2: 0.8121652007102966
- RMSE: 3.8770809173583984
- MAE: 2.2123355865478516

So we are choosing these 2 models as final

# Save models(pipelines)

In [17]:
# Save Ridge pipeline for Tensile Strength
joblib.dump(ts_pipeline, "../models/Ridge_TS_poly_final.joblib")
print("Ridge_TS_poly_final.joblib saved successfully!")

# Save XGBoost pipeline for Elongation
joblib.dump(el_pipeline, "../models/XGB_Elongation_final.joblib")
print("XGB_Elongation_final.joblib saved successfully!")

Ridge_TS_poly_final.joblib saved successfully!
XGB_Elongation_final.joblib saved successfully!


# Create predictions for `X_test` and export test dataframe with predicted values as csv file

In [18]:
from utils.model_eval import evaluate_model
# def evaluate_model(y_true, y_pred, name):
#     r2 = r2_score(y_true, y_pred)
#     rmse = root_mean_squared_error(y_true, y_pred)
#     mae = mean_absolute_error(y_true, y_pred)
#     print(f"\n{name}:")
#     print(f"R2: {r2:.4f}, RMSE: {rmse:.4f}, MAE: {mae:.4f}")
#     return r2, rmse, mae

# Load saved models
ridge_ts_model = joblib.load("../models/Ridge_TS_poly_final.joblib")
xgb_el_model = joblib.load("../models/XGB_Elongation_final.joblib")

# Predict on test set
y_pred_ts = ridge_ts_model.predict(X_test)
y_pred_el = xgb_el_model.predict(X_test)

# Add predictions to dataframe
test_df["Predicted_Tensile_Strength_MPa"] = y_pred_ts
test_df["Predicted_Elongation_%"] = y_pred_el

# Evaluate on test data
evaluate_model(y_test["Tensile_Strength_MPa"], y_pred_ts, "Tensile Strength")
evaluate_model(y_test["Elongation_%"], y_pred_el, "Elongation")

# Save predictions
test_df.to_csv("../data/predictions_test.csv", index=False)
print("Predictions saved to predictions_test.csv")




Tensile Strength:
R2: 0.7509, RMSE: 63.9535, MAE: 45.1348

Elongation:
R2: 0.8696, RMSE: 3.1929, MAE: 2.2505
Predictions saved to predictions_test.csv
