In [None]:
!pip install catboost category_encoders

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import catboost as cb
import joblib
from google.colab import drive
import os
import matplotlib.pyplot as plt

# Make the Google Drive contents available under /content/drive
drive.mount('/content/drive')

# Input locations: one CSV per material, all stored in the same Drive folder
base_path = '/content/drive/My Drive/BTP/'
file_names = [
    'output_PA2R.csv',
    'output_PA1B.csv',
    'output_HD2R.csv',
    'output_HD1C.csv',
    'output_HD1B.csv',
]
file_paths = [os.path.join(base_path, name) for name in file_names]
column_names = [
    'C', 'LD', 'DI', 'L', 'L_CYL', 'TL', 'DO', 'SA_D', 'SA_CYL', 'SA',
    'PO', 'M_NAME', 'PER_FIT', 'PR_NCC', 'MC',
]

# Read every CSV; a file that fails to load is reported and skipped
data_frames = []
for file_path, file_name in zip(file_paths, file_names):
    try:
        # The material code is the token after the underscore, without the
        # extension (e.g., 'output_PA2R.csv' -> 'PA2R')
        material_name = file_name.split('.')[0].split('_')[1]
        # header=0 replaces the file's own header row with column_names;
        # M_NAME is then overwritten with the code taken from the file name
        df = pd.read_csv(file_path, header=0, names=column_names).assign(M_NAME=material_name)
        data_frames.append(df)
    except Exception as e:
        print(f"Error loading {file_path}: {e}")

In [None]:
# Stack the per-material frames into a single table with a fresh 0..n-1 index
data = pd.concat(data_frames, ignore_index=True)

# Model inputs and the three quantities to be predicted
important_features = ['C', 'LD', 'DI', 'L_CYL', 'TL', 'PO', 'M_NAME']
target_columns = ['PER_FIT', 'PR_NCC', 'MC']

# Feature matrix, plus one series per target (same order as target_columns)
X = data.loc[:, important_features]
y_per_fit, y_pr_ncc, y_mc = (data[name] for name in target_columns)

In [None]:
from category_encoders import LeaveOneOutEncoder

# Split BEFORE fitting any preprocessing, so that neither the target encoding
# nor the scaling statistics are computed from rows that end up in the test set.
X_train_raw, X_test_raw, y_train_all, y_test_all = train_test_split(
    X, data[target_columns], test_size=0.2, random_state=42
)

# Leave-One-Out encode the categorical feature 'M_NAME' using PER_FIT from the
# training rows only (the same encoded column is shared by all three targets).
# Training rows are transformed with y supplied; test rows are transformed
# without y, exactly as unseen rows are at prediction time.
encoder = LeaveOneOutEncoder(cols=['M_NAME'])
X_train = encoder.fit_transform(X_train_raw, y_train_all['PER_FIT'])
X_test = encoder.transform(X_test_raw)

# Apply scaling only to numerical features; the scaler is fitted on the
# training rows and then reused unchanged for the test rows
numerical_features = ['C', 'LD', 'DI', 'L_CYL', 'TL', 'PO']
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Save scaler and encoder for later use
joblib.dump(scaler, os.path.join(base_path, 'scaler_X.pkl'))
joblib.dump(encoder, os.path.join(base_path, 'encoder_X.pkl'))


In [None]:
# Train and evaluate one CatBoost regressor per target variable.
# For every target this fits the model, prints hold-out MSE/R^2 and the mean
# 5-fold cross-validated R^2, saves the fitted model under base_path, and
# draws the diagnostic plots below.
for target in target_columns:
    y_train = y_train_all[target]
    y_test = y_test_all[target]

    model = cb.CatBoostRegressor(random_state=42, verbose=0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Quantities shared by several of the plots below
    residuals = y_test - y_pred
    y_lo, y_hi = y_test.min(), y_test.max()

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')

    print(f'Target: {target}, CV R^2: {cv_scores.mean()}, MSE: {mse}, R^2: {r2}')

    # Save the model
    joblib.dump(model, os.path.join(base_path, f'model_{target}.pkl'))

    # Plot learning curves (scores averaged over the CV folds)
    plt.figure()
    train_sizes, train_scores, test_scores = learning_curve(model, X_train, y_train, cv=5, scoring='r2')
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    plt.plot(train_sizes, train_scores_mean, label='Training score')
    plt.plot(train_sizes, test_scores_mean, label='Cross-validation score')
    plt.title(f'Learning Curve for {target}')
    plt.xlabel('Training examples')
    plt.ylabel('R² score')
    plt.legend(loc='best')
    plt.grid()
    plt.show()

    # Actual vs. Predicted; the dashed red diagonal marks a perfect prediction
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.7)
    plt.plot([y_lo, y_hi], [y_lo, y_hi], 'r--')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title(f'Actual vs. Predicted for {target}')
    plt.grid()
    plt.show()

    # Plot residuals against the actual values, with a zero reference line
    plt.figure()
    plt.scatter(y_test, residuals)
    plt.hlines(y=0, xmin=y_lo, xmax=y_hi)
    plt.xlabel('Actual values')
    plt.ylabel('Residuals')
    plt.title(f'Residuals Plot for {target}')
    plt.grid()
    plt.show()

    # Distribution of Residuals. Drawn with matplotlib: seaborn is not
    # imported in this notebook, so a `sns` call here raises NameError and
    # aborts the loop before the remaining targets are trained.
    plt.figure(figsize=(10, 6))
    plt.hist(residuals, bins='auto', edgecolor='black', alpha=0.7)
    plt.xlabel('Residuals')
    plt.ylabel('Count')
    plt.title(f'Distribution of Residuals for {target}')
    plt.grid()
    plt.show()

    # Plot feature importance (one bar per training column)
    feature_importances = model.get_feature_importance()
    plt.figure()
    plt.barh(X_train.columns, feature_importances)
    plt.xlabel('Feature Importance')
    plt.title(f'Feature Importance for {target}')
    plt.grid()
    plt.show()


In [None]:
!pip install category_encoders

In [None]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler
from category_encoders import LeaveOneOutEncoder
import os

# Pick, among the candidate materials, the one with the lowest predicted MC
# whose predicted PR_NCC stays under the allowed limit, for one fixed design.

# Define the input features based on your problem
input_data = {
    'C': 1,
    'LD': 2,
    'DI': 91.3913,
    'TL': 5,
    'L_CYL': 29110.8648,
    'PO': 201
}

# Possible values for M_NAME (note: this is just an example list)
materials = ['PA1B', 'PA2R', 'HD2R', 'HD1B', 'HD1C']

# Define base path (update this with the actual path)
base_path = '/content/drive/My Drive/BTP/'

# Load the trained models (one per target).
# NOTE: joblib.load unpickles arbitrary objects; only load files you created.
models = {}
for target in ['MC', 'PR_NCC', 'PER_FIT']:
    models[target] = joblib.load(os.path.join(base_path, f'model_{target}.pkl'))

# Load the scaler and encoder
scaler = joblib.load(os.path.join(base_path, 'scaler_X.pkl'))
encoder = joblib.load(os.path.join(base_path, 'encoder_X.pkl'))

# Initialize variables to track the optimal material
optimal_material = None
min_mc = float('inf')
optimal_predictions = None

# Define numerical features based on your problem
numerical_features = ['C', 'LD', 'DI', 'L_CYL', 'TL', 'PO']

# Column order the encoder and models were fitted on. input_data lists TL
# before L_CYL, so every candidate row is reordered to this layout before it
# is transformed and passed to the models.
feature_order = numerical_features + ['M_NAME']

# A candidate is feasible only if its predicted PR_NCC is strictly below this
PR_NCC_LIMIT = 6

# Iterate over the possible values of M_NAME
for material in materials:
    input_data['M_NAME'] = material
    input_df = pd.DataFrame(input_data, index=[0])[feature_order]

    # Apply scaling to numerical features
    input_df[numerical_features] = scaler.transform(input_df[numerical_features])

    # Apply encoding (replaces M_NAME with its numeric encoding)
    input_df = encoder.transform(input_df)

    # Predict using the models
    mc_pred = models['MC'].predict(input_df)[0]
    pr_ncc_pred = models['PR_NCC'].predict(input_df)[0]
    per_fit_pred = models['PER_FIT'].predict(input_df)[0]

    # Keep the candidate only if it meets the PR_NCC constraint and has a
    # lower predicted MC than the best feasible candidate seen so far
    if pr_ncc_pred < PR_NCC_LIMIT and mc_pred < min_mc:
        min_mc = mc_pred
        optimal_material = material
        optimal_predictions = {
            'MC': mc_pred,
            'PR_NCC': pr_ncc_pred,
            'PER_FIT': per_fit_pred,
            'M_NAME': material
        }

# Print the optimal material and the corresponding predictions
if optimal_material is None:
    # Make the "nothing feasible" outcome explicit instead of a bare None
    print(f"No material satisfies PR_NCC < {PR_NCC_LIMIT}; no optimum selected.")
print(f"Optimal Material: {optimal_material}")
print(f"Predictions: {optimal_predictions}")

