In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, f1_score
import xgboost as xgb
import joblib
import os
import re
from typing import List, Tuple

# Unsigned decimal number such as "24" or "0.5".
_UNSIGNED_NUMBER_RE = re.compile(r'\d+\.?\d*')

# Same number, optionally preceded by a minus sign (ASCII "-" or U+2212).
# The look-behind rejects a minus that directly follows a word character or a
# dot, so the separator in a range such as "80-120" is not read as a sign.
_SIGNED_NUMBER_RE = re.compile(r'(?:(?<![\w.])[-\u2212])?\d+\.?\d*')


def _first_number(value, pattern) -> float:
    """Return the first number ``pattern`` finds in ``value``, or NaN if none."""
    if pd.isna(value) or value == '':
        return np.nan
    match = pattern.search(str(value))
    if match is None:
        return np.nan
    # float() does not accept the Unicode minus sign, so normalise it first.
    return float(match.group().replace('\u2212', '-'))


def extract_numerical_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Extract numerical features from text columns.

    The first number found in each cell of the temperature and time columns
    is stored in two new columns, ``temp_numeric`` and ``time_numeric``.
    Missing cells, empty strings and cells without any digits become NaN.

    Temperatures keep a leading minus sign ("-20" -> -20.0); for ranges such
    as "80-120" the first value (80.0) is used. Time values are parsed as
    unsigned numbers.

    Args:
        df: Input frame containing the 'Temp (Â°C)' and 'Time (h)' columns.

    Returns:
        A copy of ``df`` with the two numeric columns appended; ``df`` itself
        is not modified.

    Raises:
        KeyError: If either source column is missing from ``df``.
    """
    df_processed = df.copy()

    # NOTE(review): the temperature header is matched exactly as written here,
    # including the "Â" character - presumably this mirrors how the CSV header
    # is decoded; confirm against the data file before changing it.
    df_processed['temp_numeric'] = [
        _first_number(cell, _SIGNED_NUMBER_RE) for cell in df['Temp (Â°C)']
    ]
    df_processed['time_numeric'] = [
        _first_number(cell, _UNSIGNED_NUMBER_RE) for cell in df['Time (h)']
    ]

    return df_processed

def prepare_features_and_targets(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series, pd.Series, pd.Series]:
    """
    Split the raw frame into a feature matrix and the three prediction targets.

    The frame is first passed through ``extract_numerical_features`` so that
    the numeric temperature and time columns exist. Rows with missing target
    values are not filtered here.

    Returns:
        A tuple ``(X, y_temp, y_time, y_method)``: a copy of the feature
        columns, followed by the 'temp_numeric', 'time_numeric' and 'Method'
        columns of the enriched frame.
    """
    enriched = extract_numerical_features(df)

    # Copy the selection so later edits to X cannot write back into `enriched`.
    features = enriched[
        [
            'Metal source (mmol)',
            'Linker(s) (mmol)',
            'Solvent(s) (mL)',
            'Modulator / Additive',
            'Pressure',
        ]
    ].copy()

    temperature_target = enriched['temp_numeric']
    time_target = enriched['time_numeric']
    method_target = enriched['Method']

    return features, temperature_target, time_target, method_target

def create_preprocessor(X: pd.DataFrame) -> ColumnTransformer:
    """
    Build the (unfitted) column transformer applied in front of every model.

    'Pressure' is one-hot encoded, with categories unseen during fitting
    ignored at transform time. The four remaining feature columns are passed
    through unchanged, and any other column is dropped.

    Args:
        X: Feature frame. It is not inspected; the column names are fixed.

    Returns:
        A new, unfitted ``ColumnTransformer``.
    """
    one_hot_columns = ['Pressure']
    # NOTE(review): this group is labelled 'text' yet is forwarded untouched -
    # presumably these columns are already numeric; confirm, since raw strings
    # would reach the downstream estimator as-is.
    untouched_columns = [
        'Metal source (mmol)',
        'Linker(s) (mmol)',
        'Solvent(s) (mL)',
        'Modulator / Additive',
    ]

    one_hot_step = ('cat', OneHotEncoder(handle_unknown='ignore'), one_hot_columns)
    passthrough_step = ('text', 'passthrough', untouched_columns)

    return ColumnTransformer(
        transformers=[one_hot_step, passthrough_step],
        remainder='drop',
    )

def _regression_errors(model, X_test: pd.DataFrame, y_test: pd.Series) -> Tuple[float, float]:
    """Return ``(MAE, RMSE)`` of ``model`` on the test rows whose target is not NaN."""
    known = y_test.notna().to_numpy()
    predictions = model.predict(X_test)
    y_true = y_test[known]
    y_pred = predictions[known]
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return mae, rmse


def train_models(X: pd.DataFrame, y_temp: pd.Series, y_time: pd.Series, y_method: pd.Series) -> dict:
    """
    Train XGBoost models for temperature, time, and method prediction.

    The data is split 80/20 (``random_state=42``). Two regressors are fitted
    on the training rows whose respective target is not NaN, and a classifier
    is fitted on all training rows. MAE/RMSE for the regressors and
    accuracy/weighted F1 for the classifier are printed for the test split.

    Args:
        X: Feature frame accepted by ``create_preprocessor``.
        y_temp: Numeric temperature target; NaN rows are skipped.
        y_time: Numeric time target; NaN rows are skipped.
        y_method: Class label target.

    Returns:
        A dict with the keys 'preprocessor', 'temp_model', 'time_model' and
        'method_model'. The three models are fitted pipelines; 'preprocessor'
        is the transformer of the method pipeline, i.e. the one fitted on the
        complete training split.
    """
    # A single call splits every array with the same shuffled indices, which
    # keeps features and all three targets row-aligned.
    (
        X_train, X_test,
        y_temp_train, y_temp_test,
        y_time_train, y_time_test,
        y_method_train, y_method_test,
    ) = train_test_split(X, y_temp, y_time, y_method, test_size=0.2, random_state=42)

    # Every pipeline gets its own preprocessor. A Pipeline fits the step
    # objects it is handed, so one shared transformer would be refitted by each
    # later fit() and could end up with a different one-hot layout than the
    # one an earlier regressor was trained on.
    temp_model = Pipeline([
        ('preprocessor', create_preprocessor(X_train)),
        ('regressor', xgb.XGBRegressor(random_state=42, n_estimators=100)),
    ])
    temp_mask = y_temp_train.notna()
    temp_model.fit(X_train[temp_mask], y_temp_train[temp_mask])

    time_model = Pipeline([
        ('preprocessor', create_preprocessor(X_train)),
        ('regressor', xgb.XGBRegressor(random_state=42, n_estimators=100)),
    ])
    time_mask = y_time_train.notna()
    time_model.fit(X_train[time_mask], y_time_train[time_mask])

    # NOTE(review): the labels are passed to XGBClassifier unencoded - recent
    # xgboost releases appear to require integer classes 0..n-1; confirm this
    # works with the installed version if 'Method' holds strings.
    method_model = Pipeline([
        ('preprocessor', create_preprocessor(X_train)),
        ('classifier', xgb.XGBClassifier(random_state=42, n_estimators=100)),
    ])
    method_model.fit(X_train, y_method_train)

    # Evaluate models
    print("Model Evaluation:")
    print("=" * 50)

    temp_mae, temp_rmse = _regression_errors(temp_model, X_test, y_temp_test)
    print(f"Temperature Model - MAE: {temp_mae:.2f}, RMSE: {temp_rmse:.2f}")

    time_mae, time_rmse = _regression_errors(time_model, X_test, y_time_test)
    print(f"Time Model - MAE: {time_mae:.2f}, RMSE: {time_rmse:.2f}")

    method_pred = method_model.predict(X_test)
    method_acc = accuracy_score(y_method_test, method_pred)
    method_f1 = f1_score(y_method_test, method_pred, average='weighted')
    print(f"Method Model - Accuracy: {method_acc:.3f}, F1: {method_f1:.3f}")

    return {
        'preprocessor': method_model.named_steps['preprocessor'],
        'temp_model': temp_model,
        'time_model': time_model,
        'method_model': method_model,
    }

# Load data
df = pd.read_csv('../data/processed/mof_runs.csv')

print(f"Loaded dataset with {len(df)} rows")
print("\nDataset info:")
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df.info()

# Prepare features and targets
X, y_temp, y_time, y_method = prepare_features_and_targets(df)

print(f"\nFeatures shape: {X.shape}")
print(f"Temperature target shape: {y_temp.shape}")
print(f"Time target shape: {y_time.shape}")
print(f"Method target shape: {y_method.shape}")

# Train models
models = train_models(X, y_temp, y_time, y_method)

# Save models
model_dir = '../models'
os.makedirs(model_dir, exist_ok=True)

# Key in `models` -> artifact file name. A single mapping keeps the files that
# are written and the files that are reported below in sync.
artifact_files = {
    'preprocessor': 'preproc.joblib',
    'temp_model': 'xgb_temp.joblib',
    'time_model': 'xgb_time.joblib',
    'method_model': 'xgb_method.joblib',
}

saved_paths = []
for model_key, file_name in artifact_files.items():
    artifact_path = f"{model_dir}/{file_name}"
    joblib.dump(models[model_key], artifact_path)
    saved_paths.append(artifact_path)

print("\nModels saved successfully!")
print("Saved files:")
for artifact_path in saved_paths:
    print(f"- {artifact_path}")
