In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for input_path in (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cleaned-genomics/cleaned_data_no_outliers.csv


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import graphviz
from IPython.display import display

In [3]:
# Load the dataset
# This is the CSV listed by the first cell's directory walk of the Kaggle input mount.
# NOTE(review): the preview in the following cells suggests one row per
# cell-line / drug measurement — confirm against the dataset description.
data = pd.read_csv('/kaggle/input/cleaned-genomics/cleaned_data_no_outliers.csv')

In [4]:
# Directory that receives the exported train/test CSVs.
# Note the trailing slash: code that appends "/<file>" to this value ends up
# with a doubled separator. The plotting helpers further down do not read
# this variable; they spell out '/kaggle/working/' themselves.
output_dir = "/kaggle/working/"

In [5]:
# Preview the first five rows, transposed so that each column is shown as a row.
data.head().T

Unnamed: 0,0,1,2,3,4
COSMIC_ID,683667,687448,687452,687455,687457
CELL_LINE_NAME,PFSK-1,COLO-829,5637,RT4,SW780
TCGA_DESC,MB,SKCM,BLCA,BLCA,BLCA
DRUG_ID,1003,1003,1003,1003,1003
DRUG_NAME,Camptothecin,Camptothecin,Camptothecin,Camptothecin,Camptothecin
LN_IC50,-1.463887,-1.235034,-2.632632,-2.963191,-1.449138
AUC,0.93022,0.867348,0.834067,0.821438,0.90505
Z_SCORE,0.433123,0.557727,-0.203221,-0.3832,0.441154
GDSC Tissue descriptor 1,nervous_system,skin,Unknown,urogenital_system,urogenital_system
GDSC Tissue descriptor 2,medulloblastoma,melanoma,Unknown,Bladder,Bladder


In [6]:
# Binary target derived from LN_IC50: rows at or below the median of the whole
# column are labelled 'Sensitive' (a lower LN_IC50 means higher sensitivity),
# every other row is labelled 'Resistant'.
ln_ic50_median = data['LN_IC50'].median()
at_or_below_median = data['LN_IC50'] <= ln_ic50_median
data['Sensitivity'] = np.where(at_or_below_median, 'Sensitive', 'Resistant')

# Columns used as model inputs.
features = [
    'TCGA_DESC',
    'GDSC Tissue descriptor 1',
    'GDSC Tissue descriptor 2',
    'Cancer Type (matching TCGA label)',
    'Microsatellite instability Status (MSI)',
    'Growth Properties',
    'CNA',
    'Gene Expression',
    'Methylation',
    'TARGET',
    'TARGET_PATHWAY',
]

# Feature matrix and label vector consumed by the train/test split.
X = data[features]
y = data['Sensitivity']

In [7]:
# Column groups: numeric columns get standardised, categorical columns get one-hot encoded.
numerical_features = ['CNA', 'Gene Expression', 'Methylation']
categorical_features = [
    'TCGA_DESC',
    'GDSC Tissue descriptor 1',
    'GDSC Tissue descriptor 2',
    'Cancer Type (matching TCGA label)',
    'Microsatellite instability Status (MSI)',
    'Growth Properties',
    'TARGET',
    'TARGET_PATHWAY',
]

# 80/20 hold-out split, stratified on the label so both parts keep the same
# class balance; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Preprocessing steps, numeric block first and categorical block second.
# handle_unknown='ignore' lets the encoder accept categories at transform time
# that it did not see while fitting.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [8]:
# Fit the preprocessor on training data to get transformed feature names
# Only the training split is used for fitting; the test split is passed
# through this already-fitted preprocessor in a later cell.
X_train_preprocessed = preprocessor.fit_transform(X_train)

# Get feature names after preprocessing
# The names are assembled by hand: numeric column names first, then the
# one-hot names reported by the fitted encoder. This ordering has to mirror
# the ('num', 'cat') order of the transformers given to the ColumnTransformer,
# otherwise names and matrix columns would not line up.
ohe = preprocessor.named_transformers_['cat']
cat_feature_names = ohe.get_feature_names_out(categorical_features)
processed_feature_names = np.append(numerical_features, cat_feature_names)

In [9]:
# Preview the first five training rows (transposed); the index values are the original row labels kept by the split.
X_train.head().T

Unnamed: 0,173490,166374,104586,181915,83290
TCGA_DESC,HNSC,BLCA,ALL,LUAD,LUAD
GDSC Tissue descriptor 1,aero_dig_tract,urogenital_system,leukemia,lung_NSCLC,lung_NSCLC
GDSC Tissue descriptor 2,head and neck,Bladder,T_cell_leukemia,lung_NSCLC_adenocarcinoma,lung_NSCLC_adenocarcinoma
Cancer Type (matching TCGA label),HNSC,BLCA,ALL,LUAD,LUAD
Microsatellite instability Status (MSI),MSS/MSI-L,MSS/MSI-L,MSI-H,MSS/MSI-L,MSS/MSI-L
Growth Properties,Adherent,Adherent,Suspension,Adherent,Adherent
CNA,1,1,1,1,1
Gene Expression,1,1,1,1,1
Methylation,1,1,1,1,1
TARGET,TERT,Unknown,JAK1,PMRT5,"BAZ2A, BAZ2B"


In [10]:
# Preview the first five test rows (transposed), in the same layout as the training preview.
X_test.head().T

Unnamed: 0,22635,170739,183345,163720,144318
TCGA_DESC,HNSC,SKCM,HNSC,COREAD,UNCLASSIFIED
GDSC Tissue descriptor 1,aero_dig_tract,skin,aero_dig_tract,large_intestine,leukemia
GDSC Tissue descriptor 2,head and neck,melanoma,head and neck,large_intestine,B_cell_leukemia
Cancer Type (matching TCGA label),HNSC,SKCM,HNSC,COAD/READ,Unknown
Microsatellite instability Status (MSI),MSS/MSI-L,MSS/MSI-L,MSS/MSI-L,MSS/MSI-L,MSI-H
Growth Properties,Adherent,Adherent,Adherent,Adherent,Suspension
CNA,1,1,1,1,1
Gene Expression,1,1,1,1,1
Methylation,1,1,1,1,1
TARGET,JNK,"G9A, GLP",Unknown,MCL1,"PI3Kalpha, PI3Kdelta, PI3Kbeta, PI3Kgamma"


In [11]:
# Show the training labels; the Series repr is truncated and reports the length.
y_train

173490    Resistant
166374    Resistant
104586    Resistant
181915    Resistant
83290     Resistant
            ...    
57986     Resistant
6480      Sensitive
129385    Sensitive
56110     Sensitive
107433    Resistant
Name: Sensitivity, Length: 158673, dtype: object

In [12]:
# Show the test labels; the Series repr is truncated and reports the length.
y_test

22635     Resistant
170739    Resistant
183345    Sensitive
163720    Resistant
144318    Sensitive
            ...    
3195      Resistant
82721     Resistant
36399     Sensitive
140638    Resistant
151919    Resistant
Name: Sensitivity, Length: 39669, dtype: object

In [13]:
# Persist the four train/test splits as CSV files under output_dir.
# Paths are built with os.path.join so the result is well-formed whether or
# not output_dir ends with a separator (string-concatenating "/<file>" onto a
# slash-terminated directory yields "//" in the path and in the log message).
# index=False drops the original row labels; each X/y pair stays aligned by row order.
split_exports = [
    ("X_train", "DT_Xtrain.csv", X_train),
    ("X_test", "DT_Xtest.csv", X_test),
    ("y_train", "DT_ytrain.csv", y_train),
    ("y_test", "DT_ytest.csv", y_test),
]

for split_label, csv_name, split_frame in split_exports:
    csv_path = os.path.join(output_dir, csv_name)
    split_frame.to_csv(csv_path, index=False)
    print(f"{split_label} dataset saved to {csv_path}")

X_train dataset saved to /kaggle/working//DT_Xtrain.csv
X_test dataset saved to /kaggle/working//DT_Xtest.csv
y_train dataset saved to /kaggle/working//DT_ytrain.csv
y_test dataset saved to /kaggle/working//DT_ytest.csv


In [14]:
# Three decision-tree variants trained on the same preprocessed training matrix.
# Every tree uses random_state=42 so repeated runs build the same tree.

# Model 1: library defaults
dt1 = DecisionTreeClassifier(random_state=42)

# Model 2: entropy split criterion, depth capped at 15 levels
dt2 = DecisionTreeClassifier(criterion="entropy", max_depth=15, random_state=42)

# Model 3: Gini split criterion, a node needs at least 20 samples to be split (no depth cap)
dt3 = DecisionTreeClassifier(criterion="gini", min_samples_split=20, random_state=42)

# Fit in order; the final fit call stays the cell's last expression.
dt1.fit(X_train_preprocessed, y_train)
dt2.fit(X_train_preprocessed, y_train)
dt3.fit(X_train_preprocessed, y_train)

In [15]:
# Push the held-out rows through the preprocessor that was fitted on the training split.
X_test_preprocessed = preprocessor.transform(X_test)

# Predicted labels from each fitted tree, evaluated in model order.
y_pred1, y_pred2, y_pred3 = (
    fitted_tree.predict(X_test_preprocessed) for fitted_tree in (dt1, dt2, dt3)
)

In [16]:
# Evaluate models and save confusion matrices
def evaluate_model(y_true, y_pred, model_name,
                   labels=('Resistant', 'Sensitive'), output_dir='/kaggle/working/'):
    """Print evaluation metrics for one model and save its confusion matrix as a PNG.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        model_name: Human-readable model name; used in the printed header, the
            plot title and (with spaces replaced by underscores) the file name.
        labels: Class labels in the order they should appear on the matrix
            rows/columns and on the heatmap axes.
        output_dir: Directory that receives ``confusion_matrix_<model>.png``.

    Returns:
        Tuple ``(accuracy, conf_matrix)`` with the accuracy score and the
        confusion matrix array.
    """
    class_labels = list(labels)

    accuracy = accuracy_score(y_true, y_pred)
    # Pin the label order explicitly so the matrix rows/columns always match
    # the tick labels drawn on the heatmap below, instead of relying on the
    # order inferred from whatever labels happen to occur in the data.
    conf_matrix = confusion_matrix(y_true, y_pred, labels=class_labels)

    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(classification_report(y_true, y_pred))
    print("\n")

    # Plot and save confusion matrix
    filename = os.path.join(
        output_dir, f'confusion_matrix_{model_name.replace(" ", "_")}.png'
    )
    fig = plt.figure(figsize=(8, 6))
    try:
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_labels,
                    yticklabels=class_labels)
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title(f'Confusion Matrix - {model_name}')
        plt.tight_layout()
        plt.savefig(filename, dpi=300, bbox_inches='tight')
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)

    return accuracy, conf_matrix

# Score each tree on the held-out labels; every call prints its metrics and
# writes a confusion-matrix image named after the model.
accuracy1, cm1 = evaluate_model(
    y_true=y_test,
    y_pred=y_pred1,
    model_name="Standard Decision Tree",
)
accuracy2, cm2 = evaluate_model(
    y_true=y_test,
    y_pred=y_pred2,
    model_name="Decision Tree with Max Depth 15 and criterion as Entropy",
)
accuracy3, cm3 = evaluate_model(
    y_true=y_test,
    y_pred=y_pred3,
    model_name="Decision Tree with Min Split 20 and criterion as Gini",
)

Model: Standard Decision Tree
Accuracy: 0.7423
Confusion Matrix:
[[14658  5177]
 [ 5044 14790]]
Classification Report:
              precision    recall  f1-score   support

   Resistant       0.74      0.74      0.74     19835
   Sensitive       0.74      0.75      0.74     19834

    accuracy                           0.74     39669
   macro avg       0.74      0.74      0.74     39669
weighted avg       0.74      0.74      0.74     39669



Model: Decision Tree with Max Depth 15 and criterion as Entropy
Accuracy: 0.6282
Confusion Matrix:
[[16305  3530]
 [11217  8617]]
Classification Report:
              precision    recall  f1-score   support

   Resistant       0.59      0.82      0.69     19835
   Sensitive       0.71      0.43      0.54     19834

    accuracy                           0.63     39669
   macro avg       0.65      0.63      0.61     39669
weighted avg       0.65      0.63      0.61     39669



Model: Decision Tree with Min Split 20 and criterion as Gini
Accuracy:

In [17]:
# Visualize the decision trees and save to files
def visualize_tree(dt_model, feature_names, class_names, max_depth=3, model_name="Decision Tree",
                   output_dir="/kaggle/working/"):
    """Save a truncated rendering of a fitted tree as a PNG and as a Graphviz .dot file.

    Args:
        dt_model: Fitted decision tree estimator.
        feature_names: Names of the columns the tree was trained on.
        class_names: Names of the target classes, passed through to the plot
            and to the .dot export.
        max_depth: Number of tree levels to draw; deeper levels are omitted.
        model_name: Label used in the plot title and (with spaces replaced by
            underscores) in both output file names.
        output_dir: Directory that receives ``tree_viz_<model>.png`` and
            ``tree_<model>.dot``.

    Returns:
        None. A failed .dot export is reported with a printed message and does
        not raise.
    """
    safe_name = model_name.replace(" ", "_")

    fig = plt.figure(figsize=(40, 20))
    try:
        plot_tree(dt_model,
                  max_depth=max_depth,
                  feature_names=feature_names,
                  class_names=class_names,
                  filled=True,
                  rounded=True,
                  fontsize=9)
        plt.title(f"{model_name} Visualization (Max Depth={max_depth})")
        plt.tight_layout()

        # Save to file
        plt.savefig(os.path.join(output_dir, f'tree_viz_{safe_name}.png'),
                    dpi=300, bbox_inches='tight')
    finally:
        # The canvas is very large (40x20 inches); always release it, even
        # when drawing or saving raises.
        plt.close(fig)

    # For more detailed visualization, also save using export_graphviz.
    # This step is best-effort: a failure is reported but does not abort the run.
    try:
        dot_data = export_graphviz(
            dt_model,
            max_depth=max_depth,
            feature_names=feature_names,
            class_names=class_names,
            filled=True,
            rounded=True,
            special_characters=True,
            out_file=None  # return the dot source as a string instead of writing it
        )
        # Save the dot file for later use
        with open(os.path.join(output_dir, f'tree_{safe_name}.dot'), 'w') as f:
            f.write(dot_data)
    except Exception as e:
        print(f"Could not export dot file: {e}")

# Target class names handed to the tree renderer.
# NOTE(review): the order is expected to match the fitted trees' class order — confirm.
class_names = ['Resistant', 'Sensitive']

# Render every fitted tree, pairing each model with the tag used in its file names.
for fitted_tree, file_tag in (
    (dt1, "Standard_DT"),
    (dt2, "DT_MaxDepth15_Entropy"),
    (dt3, "DT_MinSamplesSplit20_Gini"),
):
    visualize_tree(fitted_tree, processed_feature_names, class_names, model_name=file_tag)

In [18]:
# Feature importance analysis and save to files
def plot_feature_importance(dt_model, feature_names, model_name="Decision Tree",
                            top_n=10, output_dir="/kaggle/working/"):
    """Save a bar chart and a CSV of a fitted tree's most important features.

    Args:
        dt_model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Names aligned with the importance vector. Positions
            beyond the end of this sequence are labelled ``Feature_<index>``.
        model_name: Label used in the plot title and (with spaces replaced by
            underscores) in both output file names.
        top_n: Maximum number of features to report, highest importance first.
        output_dir: Directory that receives ``feature_importance_<model>.png``
            and ``feature_importance_<model>.csv``.

    Returns:
        None.

    Raises:
        ValueError: If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n}")

    # Get feature importances
    importances = dt_model.feature_importances_

    # Rank positions by descending importance and keep at most top_n of them
    # (slicing already copes with models that have fewer features).
    top_indices = np.argsort(importances)[::-1][:top_n]

    # Use only available names; fall back to a positional label otherwise.
    valid_features = [
        feature_names[i] if i < len(feature_names) else f"Feature_{i}"
        for i in top_indices
    ]
    valid_importances = [importances[i] for i in top_indices]

    # Create DataFrame
    importance_df = pd.DataFrame({
        'Feature': valid_features,
        'Importance': valid_importances
    })

    safe_name = model_name.replace(" ", "_")

    # Plot
    fig = plt.figure(figsize=(12, 8))
    try:
        sns.barplot(x='Importance', y='Feature', data=importance_df)
        # Report the number of bars actually drawn rather than a fixed "10".
        plt.title(f'Top {len(top_indices)} Feature Importances - {model_name}')
        plt.tight_layout()

        # Save to file
        plt.savefig(os.path.join(output_dir, f'feature_importance_{safe_name}.png'),
                    dpi=300, bbox_inches='tight')
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)

    # Also save the feature importance data to CSV for reference
    importance_df.to_csv(os.path.join(output_dir, f'feature_importance_{safe_name}.csv'),
                         index=False)

# Produce the importance chart and CSV for each fitted tree, tagged by model.
for fitted_tree, file_tag in (
    (dt1, "Standard_DT"),
    (dt2, "DT_MaxDepth15_Entropy"),
    (dt3, "DT_MinSamplesSplit20_Gini"),
):
    plot_feature_importance(fitted_tree, processed_feature_names, file_tag)

In [19]:
# Compare model performances and save to file
model_names = ['Standard DT', 'DT with Max Depth 15 and criterion as Entropy', 'DT with Min Samples Split 20 and criterion as Gini']
accuracies = [accuracy1, accuracy2, accuracy3]

# Hand seaborn a DataFrame with named columns instead of bare Python lists:
# with list input seaborn falls back to pd.unique() on the raw list to work
# out the category order, which emits a pandas warning.
accuracy_df = pd.DataFrame({'Model': model_names, 'Accuracy': accuracies})

plt.figure(figsize=(10, 6))
sns.barplot(data=accuracy_df, x='Model', y='Accuracy')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Comparison of Decision Tree Model Accuracies')
plt.ylim(0, 1)  # accuracy is a fraction, so show the full 0-1 range
plt.tight_layout()

# Save accuracy comparison
plt.savefig('/kaggle/working/model_accuracy_comparison.png', dpi=300, bbox_inches='tight')
plt.close()

  order = pd.unique(vector)
