In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, r2_score

In [2]:
# Load the PANCANCER ANOVA CSV export into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='PANCANCER_ANOVA_Sun Oct 13 11_35_35 2024.csv')

In [3]:
# Remove the two log_max_conc_tested* columns; the result is a new frame, df is untouched.
df1 = df.drop(['log_max_conc_tested', 'log_max_conc_tested_2'], axis='columns')

In [4]:
# Count of missing values in each column of df1 (not referenced again in the visible cells).
missing_data = df1.isna().sum()

In [5]:
# Keep only the rows in which both 'Drug target' and 'Target Pathway' are present.
df1 = df1.loc[df1[['Drug target', 'Target Pathway']].notna().all(axis='columns')]

In [6]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df2 = df1.drop_duplicates(keep='first')

In [7]:
# Step 3: Separate features and target variable
# y is the 'ic50_effect_size' column; X is everything else except the
# descriptive columns listed below.
y = df2.loc[:, 'ic50_effect_size']
X = df2.drop(
    columns=[
        'ic50_effect_size',
        'Drug name',
        'Drug ID',
        'Feature Name',
        'Tissue Type',
        'Screening Set',
    ]
)

In [8]:
# Step 4: Hold out 20% of the rows as a test set.
# The fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Show which feature columns ended up in the training split.
print(X_train.columns)

Index(['Drug target', 'Target Pathway', 'n_feature_pos', 'n_feature_neg',
       'log_ic50_mean_pos', 'log_ic50_mean_neg', 'feature_ic50_t_pval',
       'feature_delta_mean_ic50', 'feature_pos_ic50_var',
       'feature_neg_ic50_var', 'feature_pval', 'tissue_pval', 'msi_pval',
       'fdr'],
      dtype='object')


In [9]:
# Show which feature columns ended up in the test split (should match the training split).
print(X_test.columns)

Index(['Drug target', 'Target Pathway', 'n_feature_pos', 'n_feature_neg',
       'log_ic50_mean_pos', 'log_ic50_mean_neg', 'feature_ic50_t_pval',
       'feature_delta_mean_ic50', 'feature_pos_ic50_var',
       'feature_neg_ic50_var', 'feature_pval', 'tissue_pval', 'msi_pval',
       'fdr'],
      dtype='object')


In [10]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [11]:
# Column groups: each group is routed to its own transformer below.
categorical_features = ['Drug target', 'Target Pathway']
numerical_features = [
    'n_feature_pos',
    'n_feature_neg',
    'log_ic50_mean_pos',
    'log_ic50_mean_neg',
    'feature_ic50_t_pval',
    'feature_delta_mean_ic50',
    'feature_pos_ic50_var',
    'feature_neg_ic50_var',
    'feature_pval',
    'tissue_pval',
    'msi_pval',
    'fdr',
]

# Numerical columns are rescaled with MinMaxScaler; categorical columns are
# one-hot encoded.
# handle_unknown='ignore': a category that was not present when the encoder
# was fitted is encoded as all zeros instead of raising a ValueError. Without
# it, transform() fails whenever the data being transformed (a different
# random split, or new rows fed to the saved pipeline) contains an unseen
# 'Drug target' or 'Target Pathway' value.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

In [12]:
# Echo the two column groups that the preprocessor was configured with.
print(numerical_features)
print(categorical_features)

['n_feature_pos', 'n_feature_neg', 'log_ic50_mean_pos', 'log_ic50_mean_neg', 'feature_ic50_t_pval', 'feature_delta_mean_ic50', 'feature_pos_ic50_var', 'feature_neg_ic50_var', 'feature_pval', 'tissue_pval', 'msi_pval', 'fdr']
['Drug target', 'Target Pathway']


In [13]:
# Step 7: Apply scaling and encoding
# A scikit-learn Pipeline chains steps into a single object and runs them
# sequentially. Here it wraps the single preprocessing step.
pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

# fit_transform both fits the transformers (learns the required parameters
# from the training data) and transforms the training data with them.
X_train_scaled = pipeline.fit_transform(X_train)

# The test data is only transformed, using the parameters already learned
# from the training data. Fitting on the test data would leak information
# from it into the preprocessing.
X_test_scaled = pipeline.transform(X_test)

In [14]:
# Build an XGBoost regressor: a gradient-boosted ensemble of decision trees in
# which each tree is fitted to correct the errors of the trees before it.
model = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
)

# Train on the preprocessed training split.
model.fit(X_train_scaled, y_train)

# Predict on the held-out, preprocessed test split.
y_pred_xgb = model.predict(X_test_scaled)

# Score the predictions: mean absolute error and coefficient of determination.
# NOTE(review): the recorded run reports r^2 of about 0.995. The inputs include
# columns such as 'feature_delta_mean_ic50' and the per-group variances, from
# which the effect-size target may be largely derivable -- confirm this is not
# target leakage before relying on the score.
r2_xgb = r2_score(y_test, y_pred_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)

print(f"XGBoost MAE: {mae_xgb}")
print(f"XGBoost r^2: {r2_xgb}")

XGBoost MAE: 0.006167793665404644
XGBoost r^2: 0.9947604405887469


In [15]:
# Persist the trained regressor to 'Model.pkl' in the working directory.
joblib.dump(value=model, filename='Model.pkl')

['Model.pkl']

In [16]:
# Persist the fitted preprocessing pipeline to 'pipeline.pkl' so the same
# scaling and encoding can be re-applied when the saved model is used later.
joblib.dump(value=pipeline, filename='pipeline.pkl')

['pipeline.pkl']