In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 1. Load the dataset
df = pd.read_csv('/content/drug_loaded_nanoemulsion_1500 (1).csv')

# 2. Separate the prediction targets from the feature table
# PDI is the regression target; Stability_Target is the classification target.
y_pdi = df['PDI']
y_stability = df['Stability_Target']
# The features are every remaining column once ID and both targets are removed.
X = df.drop(['ID', 'PDI', 'Stability_Target'], axis=1)

# Split the feature columns into categorical ones (named explicitly) and
# numerical ones (everything else, in the original column order).
categorical_cols = ['Oil_Type', 'Surfactant_Type', 'Drug_Name']
numerical_cols = X.columns[~X.columns.isin(categorical_cols)].tolist()

# 3. Preprocessing Pipeline
def _build_preprocessor():
    """Return a fresh, unfitted ColumnTransformer for the feature table.

    Numerical columns are passed through unchanged. Categorical columns are
    one-hot encoded; with handle_unknown='ignore', a category not seen during
    fit is encoded as all zeros instead of raising at predict time.
    """
    return ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numerical_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ])


# `preprocessor` stays defined at module level (and is the instance fitted by
# the PDI pipeline) so any later code that refers to it keeps working.
preprocessor = _build_preprocessor()

# 4. Split the data into Training and Testing sets
# Both targets are split in the same call so the rows stay aligned with X.
# NOTE(review): the split is not stratified on y_stability. The recorded run
# shows 242 vs 58 test samples per class; consider stratify=y_stability if the
# class ratio must match between train and test (this would change the split).
X_train, X_test, y_train_pdi, y_test_pdi, y_train_stb, y_test_stb = train_test_split(
    X, y_pdi, y_stability, test_size=0.2, random_state=42
)

# 5. Model for PDI Prediction (Regression)
pdi_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# 6. Model for Stability Prediction (Classification)
# Pipeline.fit fits its steps in place, so this pipeline gets its own
# transformer instance rather than sharing fitted state with pdi_model.
stb_model = Pipeline(steps=[
    ('preprocessor', _build_preprocessor()),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# 7. Train the models
# Each pipeline fits its preprocessing step and its estimator on the training split.
print("Training models...")
pdi_model.fit(X_train, y_train_pdi)
stb_model.fit(X_train, y_train_stb)

# 8. Evaluate PDI Model
# Regression quality on the held-out split: mean squared error and R2.
y_pred_pdi = pdi_model.predict(X_test)
print("\n--- PDI Prediction Metrics ---")
pdi_mse = mean_squared_error(y_test_pdi, y_pred_pdi)
print(f"Mean Squared Error: {pdi_mse:.4f}")
pdi_r2 = r2_score(y_test_pdi, y_pred_pdi)
print(f"R2 Score: {pdi_r2:.4f}")

# 9. Evaluate Stability Model
# Classification quality on the held-out split: accuracy plus per-class report.
# NOTE(review): the output recorded with this cell shows accuracy 1.0000 here
# and R2 0.0367 for PDI. A perfect score may mean some feature directly
# encodes Stability_Target -- verify the feature set before trusting it.
y_pred_stb = stb_model.predict(X_test)
print("\n--- Stability Prediction Metrics ---")
stb_accuracy = accuracy_score(y_test_stb, y_pred_stb)
print(f"Accuracy: {stb_accuracy:.4f}")
print("Classification Report:")
stb_report = classification_report(y_test_stb, y_pred_stb)
print(stb_report)

# 10. Example: Predict for a new formulation
def predict_formulation(input_data):
    """Predict PDI and a stability label for one formulation.

    input_data: A dictionary containing the feature values; it is turned into
        a one-row DataFrame and passed to both fitted pipelines.

    Returns a tuple (pdi, label): pdi is the first element of the regression
    pipeline's prediction, and label is "Stable" when the classifier predicts
    1 and "Unstable" for any other predicted value.
    """
    # Wrapping the dict in a list yields a single-row frame keyed by feature name.
    row = pd.DataFrame([input_data])

    pdi_estimate = pdi_model.predict(row)[0]
    stability_class = stb_model.predict(row)[0]

    if stability_class == 1:
        label = "Stable"
    else:
        label = "Unstable"
    return pdi_estimate, label

# Example Usage:
# One formulation described by its feature values, keyed by feature name.
new_input = dict(
    Oil_Type='Oleic Acid',
    Surfactant_Type='Tween 80',
    Required_HLB=17.0,
    Oil_MW=282.5,
    Surf_HLB=15.0,
    Surf_MW=1310,
    Smix_Ratio=1,
    System_HLB=10.0,
    Drug_Name='Curcumin',
    Drug_Loading_mg_mL=16.5,
    EE_Percent=82.0,
)

# Run both fitted models on the example and report the results.
predicted_pdi, predicted_stability = predict_formulation(new_input)
print("\n--- Prediction for New Input ---")
print(f"Predicted PDI: {predicted_pdi:.4f}")
print(f"Predicted Stability: {predicted_stability}")

Training models...

--- PDI Prediction Metrics ---
Mean Squared Error: 0.0090
R2 Score: 0.0367

--- Stability Prediction Metrics ---
Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       242
           1       1.00      1.00      1.00        58

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300


--- Prediction for New Input ---
Predicted PDI: 0.3084
Predicted Stability: Unstable
