In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder, QuantileTransformer
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import re
import os
import joblib
import pickle
import warnings
warnings.filterwarnings('ignore')

In [12]:
def clean_data(input_file='Data/ToxicityPhylogeneticChemical.csv'):
    """Load the toxicity CSV and turn it into a numeric, model-ready frame.

    Steps, in order:
      1. Split the free-text 'Endpoint Value' column into a numeric
         'Endpoint Val' and an 'Endpoint Units' label ('mg/l', 'mg/kg' or None).
      2. Keep rows with 'Endpoint Val' > 0.005 and non-null
         'Endpoint Description', 'class' and 'order'; clip the value to
         [0.01, 95th percentile].
      3. One-hot encode 'class' and 'order', then drop identifier/taxonomy
         columns that are not used as features.
      4. Fill missing 'XLogP' with the median, map single-dose style labels and
         missing 'Tox Exposure Duration' to 0.1, and bucket exposure techniques
         outside {'diet', 'waterborne', 'oral'} into 'other'.
      5. Derive 'Lipinski_Violations', 'Total_HBonds', 'MW_XLogP' and
         'TPSA_HBonds', standardise the continuous molecular descriptors, and
         one-hot encode technique and life-cycle stage.
      6. Add 'log_endpoint_val', the natural log of the clipped endpoint value.

    Parameters
    ----------
    input_file : str
        Path to the CSV file to read.

    Returns
    -------
    tuple
        ``(all_data, scaler)`` where ``all_data`` is the processed DataFrame
        and ``scaler`` is the fitted ``StandardScaler`` for the columns
        'MolecularWeight', 'XLogP', 'TPSA', 'MW_XLogP' and 'TPSA_HBonds'.

    Raises
    ------
    KeyError
        If a column this function reads is missing from the CSV.
    ValueError
        If 'Tox Exposure Duration' still holds non-numeric text after the
        single-dose labels have been replaced.
    """
    all_data = pd.read_csv(input_file)

    # First number in the string. Accepts thousands-grouped digits ("1,500"),
    # plain or trailing-dot decimals ("12", "12.", "12.5"), leading-dot decimals
    # (".5") and an optional exponent ("1.5e-3"). The (?!\d) keeps "1,5000" from
    # being read as a grouped number.
    number_pattern = re.compile(
        r'(?:\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?'
    )

    def extract_number(endpoint_str):
        """Return the first numeric value in the text as a float, else None."""
        if pd.isna(endpoint_str):
            return None
        match = number_pattern.search(str(endpoint_str))
        if match is None:
            return None
        return float(match.group(0).replace(',', ''))

    def extract_units(endpoint_str):
        """Return 'mg/l' or 'mg/kg' if the text mentions it (case-insensitive), else None."""
        if pd.isna(endpoint_str):
            return None
        endpoint_str = str(endpoint_str).lower()
        if 'mg/l' in endpoint_str:
            return 'mg/l'
        elif 'mg/kg' in endpoint_str:
            return 'mg/kg'
        else:
            return None

    # to_numeric guarantees a float column even when no row yields a number
    # (apply would otherwise leave an object column of None that cannot be
    # compared with a float below).
    all_data['Endpoint Val'] = pd.to_numeric(
        all_data['Endpoint Value'].apply(extract_number), errors='coerce'
    )
    all_data['Endpoint Units'] = all_data['Endpoint Value'].apply(extract_units)
    all_data = all_data.drop(columns=['Endpoint Value'])

    # The comparison also removes rows whose value could not be parsed (NaN).
    # .copy() makes the filtered frame independent so the column assignments
    # below never write into a view of the frame that was read from disk.
    all_data = (
        all_data[all_data['Endpoint Val'] > 0.005]
        .dropna(subset=['Endpoint Description', 'class', 'order'])
        .copy()
    )

    # Cap extreme values at the 95th percentile and floor them at 0.01.
    # NOTE(review): the percentile is computed over every remaining row, so it
    # is shared by any later train/test partition of the returned frame.
    endpoint_95th = all_data['Endpoint Val'].quantile(0.95)
    all_data['Endpoint Val'] = all_data['Endpoint Val'].clip(lower=0.01, upper=endpoint_95th)

    class_features = pd.get_dummies(all_data['class'], prefix='class')
    order_features = pd.get_dummies(all_data['order'], prefix='order')
    phylogenetic_features = pd.concat([class_features, order_features], axis=1)
    all_data = pd.concat([all_data, phylogenetic_features], axis=1)

    # Only drop the columns that are actually present in this file.
    columns_to_drop = ['Unnamed: 0', 'Common Name', 'Range', 'Sex', 'Chemical',
                      'ChemicalName', 'Sample Size', 'Tox Exposure', 'species',
                      'class', 'order', 'family', 'genus']
    columns_to_drop = [col for col in columns_to_drop if col in all_data.columns]
    all_data = all_data.drop(columns=columns_to_drop)

    # fillna is a no-op when nothing is missing, so no guard is needed.
    all_data['XLogP'] = all_data['XLogP'].fillna(all_data['XLogP'].median())

    # Single-dose style labels and missing durations are both encoded as 0.1.
    # NOTE(review): the unit of the numeric durations is not visible here.
    duration_map = ['Single dose', 'single', 'single dose', 'Single exposure']
    all_data['Tox Exposure Duration'] = all_data['Tox Exposure Duration'].replace(duration_map, 0.1)
    all_data['Tox Exposure Duration'] = all_data['Tox Exposure Duration'].fillna(0.1)
    all_data['Tox Exposure Duration'] = all_data['Tox Exposure Duration'].astype('float64')

    # Anything that is not one of the three main techniques (including missing
    # values) is bucketed as 'other'. The match is exact and case-sensitive.
    main_techniques = ['diet', 'waterborne', 'oral']
    technique = all_data['Tox Exposure Technique']
    all_data['Tox Exposure Technique'] = technique.where(technique.isin(main_techniques), 'other')

    # Number of the four thresholds below that the compound exceeds.
    all_data['Lipinski_Violations'] = (
        (all_data['MolecularWeight'] > 500).astype(int) +
        (all_data['XLogP'] > 5).astype(int) +
        (all_data['HBondDonorCount'] > 5).astype(int) +
        (all_data['HBondAcceptorCount'] > 10).astype(int)
    )
    all_data['Total_HBonds'] = all_data['HBondDonorCount'] + all_data['HBondAcceptorCount']

    # Interaction terms, computed on the unscaled descriptors.
    all_data['MW_XLogP'] = all_data['MolecularWeight'] * all_data['XLogP']
    all_data['TPSA_HBonds'] = all_data['TPSA'] * all_data['Total_HBonds']

    # NOTE(review): the scaler is fitted on every row returned by this function.
    scaler = StandardScaler()
    molecular_features = ['MolecularWeight', 'XLogP', 'TPSA', 'MW_XLogP', 'TPSA_HBonds']
    all_data[molecular_features] = scaler.fit_transform(all_data[molecular_features])

    technique_features = pd.get_dummies(all_data['Tox Exposure Technique'], prefix='technique')
    stage_features = pd.get_dummies(all_data['Life Cycle Stage'], prefix='stage')

    all_data = pd.concat([all_data, technique_features, stage_features], axis=1)
    all_data = all_data.drop(columns=['Tox Exposure Technique', 'Life Cycle Stage'])

    # Safe to take the log: values were clipped to at least 0.01 above.
    all_data['log_endpoint_val'] = np.log(all_data['Endpoint Val'])
    return all_data, scaler

In [15]:
class ThreeModelPipeline:
    """Three estimators trained on one prepared toxicity frame.

    * ``endpoint_type``  - RandomForestClassifier for the encoded
      'Endpoint Description', using the ``technique_*`` columns.
    * ``endpoint_val``   - GradientBoostingRegressor for 'Endpoint Val', using
      stage/class/order/technique dummies, molecular descriptors, exposure
      duration and the encoded endpoint type.
    * ``endpoint_units`` - RandomForestClassifier for the encoded
      'Endpoint Units', using the encoded endpoint type plus ``technique_*``
      and ``class_*`` columns.
    """

    def __init__(self):
        """Create the three unfitted estimators and empty bookkeeping slots."""
        self.endpoint_type = RandomForestClassifier(random_state=60)
        self.endpoint_val = GradientBoostingRegressor(random_state=60)
        self.endpoint_units = RandomForestClassifier(random_state=60)
        # Filled in by train_data().
        self.endpoint_encoder = None
        self.units_encoder = None
        self.scaler = None
        # Feature column names used by each model, recorded at training time.
        self.feature_columns = {
            'model1': [],
            'model2': [],
            'model3': []
        }

    def _fit_and_report_classifier(self, model, encoder, X_train, y_train, X_test, y_test):
        """Fit ``model``, print accuracy and a per-class report, return the accuracy."""
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        print(f"Accuracy: {accuracy:.3f}")
        # Passing labels explicitly keeps target_names aligned with the encoder
        # even when a class is absent from both y_test and the predictions.
        print(classification_report(
            y_test,
            predictions,
            labels=list(range(len(encoder.classes_))),
            target_names=encoder.classes_,
        ))
        return accuracy

    def train_data(self, df, scaler):
        """Split ``df`` 75/25, fit all three models and print test metrics.

        Parameters
        ----------
        df : pandas.DataFrame
            Prepared frame. Must contain 'Endpoint Description',
            'Endpoint Units', 'Endpoint Val', 'Tox Exposure Duration', the
            molecular descriptor columns listed below, and the one-hot columns
            prefixed ``technique_``, ``stage_``, ``class_`` and ``order_``.
        scaler : object
            Stored on ``self.scaler`` unchanged so save_models() can persist it.

        Returns
        -------
        dict
            ``{'model1_accuracy', 'model2_r2', 'model3_accuracy'}`` measured on
            the held-out 25% split.
        """
        self.scaler = scaler
        training_data, testing_data = train_test_split(
            df, test_size=0.25, random_state=42, stratify=df['Endpoint Description']
        )

        print(f"Training samples: {len(training_data)}")
        print(f"Testing samples: {len(testing_data)}")

        # Fit the label vocabularies on the whole frame. The split is stratified
        # on endpoint type only, so a units label could otherwise land solely in
        # the test split and make transform() raise on an unseen label.
        self.endpoint_encoder = LabelEncoder().fit(df['Endpoint Description'])
        self.units_encoder = LabelEncoder().fit(df['Endpoint Units'])

        training_data = training_data.copy()
        testing_data = testing_data.copy()
        for split in (training_data, testing_data):
            split['endpoint_type_encoded'] = self.endpoint_encoder.transform(split['Endpoint Description'])
            split['units_encoded'] = self.units_encoder.transform(split['Endpoint Units'])

        # Feature selection
        columns = list(training_data.columns)
        technique_cols = [col for col in columns if col.startswith('technique_')]
        stage_cols = [col for col in columns if col.startswith('stage_')]
        class_cols = [col for col in columns if col.startswith('class_')]
        order_cols = [col for col in columns if col.startswith('order_')]

        model1_features = list(technique_cols)
        model2_features = (stage_cols + class_cols + order_cols + technique_cols +
                          ['MolecularWeight', 'XLogP', 'TPSA', 'HBondDonorCount', 'HBondAcceptorCount',
                           'Lipinski_Violations', 'Total_HBonds', 'MW_XLogP', 'TPSA_HBonds'] +
                          ['Tox Exposure Duration', 'endpoint_type_encoded'])
        model3_features = ['endpoint_type_encoded'] + technique_cols + class_cols

        self.feature_columns['model1'] = model1_features
        self.feature_columns['model2'] = model2_features
        self.feature_columns['model3'] = model3_features

        print("\n--- MODEL 1: Endpoint Type Classification ---")
        model1_accuracy = self._fit_and_report_classifier(
            self.endpoint_type,
            self.endpoint_encoder,
            training_data[model1_features],
            training_data['endpoint_type_encoded'],
            testing_data[model1_features],
            testing_data['endpoint_type_encoded'],
        )

        # MODEL 2
        # NOTE(review): the regressor is trained on the untransformed
        # 'Endpoint Val' column; if df also carries a log-scaled target it is
        # not used here - confirm this is intended.
        print("\n--- MODEL 2: Endpoint Value Regression ---")
        self.endpoint_val.fit(training_data[model2_features], training_data['Endpoint Val'])
        y2_pred = self.endpoint_val.predict(testing_data[model2_features])
        r2_orig = r2_score(testing_data['Endpoint Val'], y2_pred)
        print(f"R2 Score: {r2_orig:.3f}")

        # MODEL 3 (the banner previously said "MODEL 2")
        print("\n--- MODEL 3: Endpoint Units Classification ---")
        model3_accuracy = self._fit_and_report_classifier(
            self.endpoint_units,
            self.units_encoder,
            training_data[model3_features],
            training_data['units_encoded'],
            testing_data[model3_features],
            testing_data['units_encoded'],
        )

        return {
            'model1_accuracy': model1_accuracy,
            'model2_r2': r2_orig,
            'model3_accuracy': model3_accuracy,
        }

    def save_models(self, model_dir='saved_models'):
        """Persist the models, encoders, scaler and metadata under ``model_dir``.

        Writes six joblib files (three models, two encoders, the scaler) and
        ``model_metadata.pkl``, a pickle holding the feature-column lists and
        the encoder class names. The directory is created if needed.

        Raises
        ------
        RuntimeError
            If train_data() has not been called yet. The check runs before
            anything is written, so no partial output directory is left behind.
        """
        if self.endpoint_encoder is None or self.units_encoder is None:
            raise RuntimeError(
                "save_models() called before train_data(): the label encoders are not fitted."
            )

        os.makedirs(model_dir, exist_ok=True)
        artifacts = {
            'endpoint_type_model.pkl': self.endpoint_type,
            'endpoint_val_model.pkl': self.endpoint_val,
            'endpoint_units_model.pkl': self.endpoint_units,
            'endpoint_encoder.pkl': self.endpoint_encoder,
            'units_encoder.pkl': self.units_encoder,
            'scaler.pkl': self.scaler,
        }
        for filename, obj in artifacts.items():
            joblib.dump(obj, os.path.join(model_dir, filename))

        metadata = {
            'feature_columns': self.feature_columns,
            'endpoint_classes': self.endpoint_encoder.classes_.tolist(),
            'units_classes': self.units_encoder.classes_.tolist(),
        }
        with open(os.path.join(model_dir, 'model_metadata.pkl'), 'wb') as f:
            pickle.dump(metadata, f)

In [16]:
def main():
    """Run the whole workflow: clean the data, train the pipeline, save it."""
    prepared_frame, fitted_scaler = clean_data()
    pipeline = ThreeModelPipeline()
    pipeline.train_data(prepared_frame, fitted_scaler)
    pipeline.save_models()

# Run the clean -> train -> save workflow when this cell/script is executed
# directly (not when the code is imported as a module).
if __name__ == "__main__":
    main()

Training samples: 96
Testing samples: 32

--- MODEL 1: Endpoint Type Classification ---
Accuracy: 0.969
              precision    recall  f1-score   support

        LC50       0.93      1.00      0.96        13
        LD50       1.00      0.95      0.97        19

    accuracy                           0.97        32
   macro avg       0.96      0.97      0.97        32
weighted avg       0.97      0.97      0.97        32


--- MODEL 2: Endpoint Value Regression ---
R2 Score: 0.766

--- MODEL 2: Endpoint Units Classification ---
Accuracy: 1.000
              precision    recall  f1-score   support

       mg/kg       1.00      1.00      1.00        24
        mg/l       1.00      1.00      1.00         8

    accuracy                           1.00        32
   macro avg       1.00      1.00      1.00        32
weighted avg       1.00      1.00      1.00        32

