In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cleaned-genomics/cleaned_data_no_outliers.csv


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
# Load the dataset from the CSV listed in the Kaggle input directory
dataset_path = '/kaggle/input/cleaned-genomics/cleaned_data_no_outliers.csv'
gdsc_data = pd.read_csv(dataset_path)

In [4]:
# Turn the continuous LN_IC50 response into a binary classification target.
# Lower LN_IC50 values indicate higher drug sensitivity, so rows strictly below
# the median are labelled sensitive (1) and all other rows resistant (0).
ln_ic50 = gdsc_data['LN_IC50']
sensitivity_threshold = ln_ic50.median()
gdsc_data['Drug_Sensitivity'] = ln_ic50.lt(sensitivity_threshold).astype(int)

class_distribution = gdsc_data['Drug_Sensitivity'].value_counts()
print(f"Using LN_IC50 threshold of {sensitivity_threshold:.4f} for classification")
print(f"Class distribution: {class_distribution}")

Using LN_IC50 threshold of 3.5575 for classification
Class distribution: Drug_Sensitivity
1    99171
0    99171
Name: count, dtype: int64


In [5]:
# Columns used as model inputs, grouped by how the preprocessor treats them.

# Numeric columns (standardised later).
# NOTE(review): AUC and Z_SCORE look like drug-response measurements, just like
# the LN_IC50 column the target is derived from — confirm they do not leak the label.
numeric_features = [
    'AUC',
    'Z_SCORE',
    'CNA',
    'Gene Expression',
    'Methylation',
]

# Categorical columns (one-hot encoded later).
categorical_features = [
    'GDSC Tissue descriptor 1',
    'Cancer Type (matching TCGA label)',
    'Microsatellite instability Status (MSI)',
    'TARGET',
    'TARGET_PATHWAY',
]

In [6]:
# Feature matrix: numeric columns first, then categorical ones; target: the binary label
feature_columns = [*numeric_features, *categorical_features]
X = gdsc_data[feature_columns]
y = gdsc_data['Drug_Sensitivity']

In [7]:
# Persist the untransformed feature matrix (without the index column) for later inspection
raw_features_path = 'features_before_transformation.csv'
X.to_csv(raw_features_path, index=False)
print(f"Saved original feature set before transformation to '{raw_features_path}'")

Saved original feature set before transformation to 'features_before_transformation.csv'


In [8]:
# Column-wise preprocessing:
#   * numeric columns     -> StandardScaler
#   * categorical columns -> OneHotEncoder; sparse_output=False returns a dense array,
#     and handle_unknown='ignore' encodes categories unseen during fit as all zeros
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    categorical_features,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [9]:
# 70/30 stratified split with a fixed seed so the partition is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

for split_label, split_frame in (("Training", X_train), ("Testing", X_test)):
    print(f"{split_label} set shape: {split_frame.shape}")

# Keep a copy of both splits exactly as they were before any transformation
for split_frame, split_path in (
    (X_train, 'X_train_before_transformation.csv'),
    (X_test, 'X_test_before_transformation.csv'),
):
    split_frame.to_csv(split_path, index=False)
print("Saved train and test feature sets before transformation")

Training set shape: (138839, 10)
Testing set shape: (59503, 10)
Saved train and test feature sets before transformation


In [10]:
# First five training rows, transposed so each feature is a row
X_train.head(5).transpose()

Unnamed: 0,184213,125178,141898,62145,55132
AUC,0.988816,0.961416,0.974135,0.988955,0.968834
Z_SCORE,-0.422874,-0.800332,-0.471554,1.096069,-0.479448
CNA,1,1,1,1,1
Gene Expression,1,1,1,1,1
Methylation,1,1,1,1,1
GDSC Tissue descriptor 1,lymphoma,aero_dig_tract,bone,lung_NSCLC,lymphoma
Cancer Type (matching TCGA label),Unknown,ESCA,Unknown,LUSC,Unknown
Microsatellite instability Status (MSI),MSS/MSI-L,MSS/MSI-L,MSS/MSI-L,MSS/MSI-L,MSS/MSI-L
TARGET,Unknown,ESR,"TNKS1, TNKS2",DNA alkylating agent,ESR
TARGET_PATHWAY,Unclassified,Hormone-related,WNT signaling,DNA replication,Hormone-related


In [11]:
# First five test rows, transposed so each feature is a row
X_test.head(5).transpose()

Unnamed: 0,175400,173880,147597,188281,113681
AUC,0.81517,0.886714,0.726359,0.94894,0.983236
Z_SCORE,-0.989111,-0.398444,-1.133064,-0.006103,0.658191
CNA,1,1,1,1,1
Gene Expression,1,1,1,1,1
Methylation,1,1,1,1,1
GDSC Tissue descriptor 1,lung_NSCLC,skin,urogenital_system,nervous_system,nervous_system
Cancer Type (matching TCGA label),LUAD,SKCM,CESC,GBM,GBM
Microsatellite instability Status (MSI),MSS/MSI-L,MSS/MSI-L,MSS/MSI-L,MSS/MSI-L,MSS/MSI-L
TARGET,"CHEK1, CHEK2, CDK2",G-quadruplex stabiliser,"AKT1, AKT2, AKT3",EP300,"ADRA1A, ADRB1"
TARGET_PATHWAY,Cell cycle,DNA replication,PI3K/MTOR signaling,Chromatin histone methylation,Other


In [12]:
# Transform the data: fit the preprocessor on the training split only, then
# apply the fitted transformation to both splits
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Get feature names after transformation
numeric_feature_names = numeric_features
# Look the fitted encoder up by its step name rather than by position
cat_encoder = preprocessor.named_transformers_['cat']
try:
    categorical_feature_names = cat_encoder.get_feature_names_out(categorical_features)
except AttributeError:
    # Fallback only for encoders that lack get_feature_names_out; any other
    # error is a real problem and is allowed to surface instead of being hidden
    categorical_feature_names = [f"cat_feature_{i}" for i in range(X_train_transformed.shape[1] - len(numeric_features))]

# Column names follow the ColumnTransformer order: numeric block, then one-hot block
transformed_feature_names = list(numeric_feature_names) + list(categorical_feature_names)

# Guard against a name/column count mismatch before building the DataFrames
if len(transformed_feature_names) != X_train_transformed.shape[1]:
    print(f"Warning: Feature name count ({len(transformed_feature_names)}) doesn't match feature count ({X_train_transformed.shape[1]})")
    # Create generic column names if there's a mismatch
    transformed_feature_names = [f"feature_{i}" for i in range(X_train_transformed.shape[1])]

# Create DataFrames with the transformed data
X_train_transformed_df = pd.DataFrame(X_train_transformed, columns=transformed_feature_names)
X_test_transformed_df = pd.DataFrame(X_test_transformed, columns=transformed_feature_names)

# Save the transformed train and test feature sets
X_train_transformed_df.to_csv('X_train_after_transformation.csv', index=False)
X_test_transformed_df.to_csv('X_test_after_transformation.csv', index=False)
print("Saved train and test feature sets after transformation")

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")
print(f"Transformed training set shape: {X_train_transformed.shape}")
print(f"Transformed testing set shape: {X_test_transformed.shape}")

Saved train and test feature sets after transformation
Training set shape: (138839, 10)
Testing set shape: (59503, 10)
Transformed training set shape: (138839, 268)
Transformed testing set shape: (59503, 268)


In [13]:
# First five transformed training rows, transposed so each feature is a row
X_train_transformed_df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
AUC,0.907884,0.518976,0.699506,0.909857,0.624265
Z_SCORE,-0.601823,-1.025983,-0.656526,1.105057,-0.665396
CNA,0.000000,0.000000,0.000000,0.000000,0.000000
Gene Expression,0.000000,0.000000,0.000000,0.000000,0.000000
Methylation,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...
TARGET_PATHWAY_Protein stability and degradation,0.000000,0.000000,0.000000,0.000000,0.000000
TARGET_PATHWAY_RTK signaling,0.000000,0.000000,0.000000,0.000000,0.000000
TARGET_PATHWAY_Unclassified,1.000000,0.000000,0.000000,0.000000,0.000000
TARGET_PATHWAY_WNT signaling,0.000000,0.000000,1.000000,0.000000,0.000000


In [14]:
# First five transformed test rows, transposed so each feature is a row
X_test_transformed_df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
AUC,-1.556796,-0.541322,-2.817353,0.341896,0.828683
Z_SCORE,-1.238119,-0.574370,-1.399884,-0.133485,0.613001
CNA,0.000000,0.000000,0.000000,0.000000,0.000000
Gene Expression,0.000000,0.000000,0.000000,0.000000,0.000000
Methylation,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...
TARGET_PATHWAY_Protein stability and degradation,0.000000,0.000000,0.000000,0.000000,0.000000
TARGET_PATHWAY_RTK signaling,0.000000,0.000000,0.000000,0.000000,0.000000
TARGET_PATHWAY_Unclassified,0.000000,0.000000,0.000000,0.000000,0.000000
TARGET_PATHWAY_WNT signaling,0.000000,0.000000,0.000000,0.000000,0.000000


In [15]:
# Example of polynomial kernel transformation for a 2D point
def polynomial_kernel_transformation(x1, x2, r=1, d=2):
    """
    Demonstrate the explicit feature map of the degree-2 polynomial kernel.

    For K(x, y) = (x·y + r)**2 on 2D inputs, the map
        Φ(x) = [r, √(2r)·x₁, √(2r)·x₂, x₁², √2·x₁x₂, x₂²]
    satisfies Φ(x)·Φ(y) = K(x, y). With the default r=1 this is
    [1, √2x₁, √2x₂, x₁², √2x₁x₂, x₂²].

    The function prints the original and transformed points, saves a
    two-panel figure to 'polynomial_kernel_transformation.png' (left: the
    point in the original plane; right: the three quadratic coordinates of
    the transformed point in 3D), and closes the figure.

    Parameters
    ----------
    x1, x2 : float
        Coordinates of the 2D point.
    r : float, default 1
        Constant term of the kernel; must be non-negative so that √(2r) is real.
    d : int, default 2
        Kernel degree. Only d=2 is implemented.

    Returns
    -------
    numpy.ndarray
        The 6-dimensional transformed point.

    Raises
    ------
    NotImplementedError
        If d is not 2.
    ValueError
        If r is negative.
    """
    if d != 2:
        raise NotImplementedError(
            f"Only the degree-2 feature map is implemented (got d={d})"
        )
    if r < 0:
        raise ValueError(f"r must be non-negative (got r={r})")

    # Original 2D point
    point = np.array([x1, x2])
    print(f"Original 2D point: {point}")

    # Expanding (x·y + r)² gives r² + 2r(x·y) + (x·y)², hence the r and √(2r) factors
    linear_scale = np.sqrt(2 * r)
    transformed = np.array([
        r,
        linear_scale * x1,
        linear_scale * x2,
        x1**2,
        np.sqrt(2) * x1 * x2,
        x2**2
    ])

    print(f"Transformed point (6D): {transformed}")

    # Visualize the transformation
    fig = plt.figure(figsize=(10, 5))

    # Original 2D space
    ax_2d = fig.add_subplot(1, 2, 1)
    ax_2d.scatter(x1, x2, color='red', s=100)
    ax_2d.grid(True)
    ax_2d.set_xlim(-2, 2)
    ax_2d.set_ylim(-2, 2)
    ax_2d.set_title('Original 2D Space')
    ax_2d.set_xlabel('x₁')
    ax_2d.set_ylabel('x₂')

    # Visualize 3 dimensions of the transformed space.
    # The 3D axes is created directly in the right-hand subplot slot; a bare
    # plt.axes(projection='3d') would add a full-window axes over both panels.
    ax_3d = fig.add_subplot(1, 2, 2, projection='3d')
    ax_3d.scatter3D(x1**2, np.sqrt(2)*x1*x2, x2**2, color='blue', s=100)
    ax_3d.set_xlabel('x₁²')
    ax_3d.set_ylabel('√2x₁x₂')
    ax_3d.set_zlabel('x₂²')
    ax_3d.set_title('3D Projection of 6D Transformed Space')

    fig.tight_layout()
    fig.savefig('polynomial_kernel_transformation.png')
    plt.close(fig)

    return transformed


# Run the demonstration on the point (0.5, -0.3) with r=1, d=2
print("\nDemonstration of polynomial kernel transformation:")
polynomial_kernel_transformation(x1=0.5, x2=-0.3, r=1, d=2)


Demonstration of polynomial kernel transformation:
Original 2D point: [ 0.5 -0.3]
Transformed point (6D): [ 1.          0.70710678 -0.42426407  0.25       -0.21213203  0.09      ]


array([ 1.        ,  0.70710678, -0.42426407,  0.25      , -0.21213203,
        0.09      ])

In [16]:
# Hyper-parameter grid: every kernel is trained once per C value
kernels = ['linear', 'poly', 'rbf']
C_values = [0.1, 1, 10, 100]

# Filled in by the training loop: kernel name -> list of per-C result dicts
results = dict()

In [17]:
# Given the large dataset size, we'll use a subset for initial model training
# This is a common approach for computational efficiency
sample_size = min(30000, len(X_train))  # Use at most 30,000 samples

# Draw the subsample from a seeded generator so that the training subset, and
# therefore every accuracy reported below, is reproducible across kernel restarts
rng = np.random.default_rng(42)
indices = rng.choice(len(X_train), size=sample_size, replace=False)

# Positional indexing keeps features and labels aligned row for row
X_train_sample = X_train.iloc[indices]
y_train_sample = y_train.iloc[indices]

print(f"\nUsing {sample_size} samples for model training")


Using 30000 samples for model training


In [18]:
# Decision boundaries can only be drawn in two dimensions, so build a reduced
# dataset from the first two numeric features (AUC and Z_SCORE) and split it
# with the same proportions, seed and stratification as the main split
first_feature, second_feature = numeric_features[0], numeric_features[1]
X_2d = X[[first_feature, second_feature]]

X_2d_train, X_2d_test, y_2d_train, y_2d_test = train_test_split(
    X_2d,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [19]:
# Standardize the 2D dataset: statistics come from the training split only
# and are then applied unchanged to both splits
scaler_2d = StandardScaler().fit(X_2d_train)
X_2d_train_scaled = scaler_2d.transform(X_2d_train)
X_2d_test_scaled = scaler_2d.transform(X_2d_test)

In [20]:
# For SVM decision boundary visualization
def plot_decision_boundary(X, y, model, title, filename):
    """
    Plot a fitted 2-feature classifier's decision regions and save the figure.

    The model is evaluated on a regular grid (step 0.02) that extends one unit
    beyond the data range on each side; the predicted classes are drawn as
    filled contours with the points of X, coloured by y, on top. The axes are
    labelled with the first two entries of the module-level `numeric_features`.

    Parameters
    ----------
    X : array indexable as X[:, 0] and X[:, 1]
        The two feature columns to plot.
    y : array-like
        Values used to colour the scatter points.
    model : object with a `predict` method
        Classifier evaluated at every grid point.
    title : str
        Figure title.
    filename : str
        Path the figure is saved to.
    """
    step = .02  # step size in the mesh
    margin = 1
    first_col, second_col = X[:, 0], X[:, 1]

    # Regular grid covering the data range plus the margin
    grid_x = np.arange(first_col.min() - margin, first_col.max() + margin, step)
    grid_y = np.arange(second_col.min() - margin, second_col.max() + margin, step)
    xx, yy = np.meshgrid(grid_x, grid_y)

    # One prediction per grid point, reshaped back onto the grid
    mesh_points = np.column_stack((xx.ravel(), yy.ravel()))
    Z = model.predict(mesh_points).reshape(xx.shape)

    fig, ax = plt.subplots(figsize=(10, 8))

    # Decision regions underneath, data points on top
    ax.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.RdBu)
    scatter = ax.scatter(first_col, second_col, c=y, edgecolors='k', cmap=plt.cm.RdBu)

    ax.set_title(title)
    ax.set_xlabel(numeric_features[0])
    ax.set_ylabel(numeric_features[1])
    fig.colorbar(scatter)
    fig.savefig(filename)
    plt.close(fig)

In [21]:
# Train one SVM per (kernel, C) pair on the training subsample, score it on the
# test split, and save a confusion-matrix figure and a classification-report
# figure for each model.
#
# NOTE(review): accuracy is measured on the test split and later used to choose
# the best C, so the "best" scores are optimistically biased; a separate
# validation split or cross-validation would avoid that.
# NOTE: every pipeline holds the same `preprocessor` object, so each fit() call
# refits that shared object in place on X_train_sample.
for kernel in kernels:
    print(f"\n--- SVM with {kernel} kernel ---")
    results[kernel] = []

    for C in C_values:
        # probability=True is deliberately not set: predict_proba is never used,
        # and enabling it makes SVC run an additional internal 5-fold
        # cross-validation during fit
        svm_pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', SVC(kernel=kernel, C=C, random_state=42))
        ])

        # Fit on training data
        svm_pipeline.fit(X_train_sample, y_train_sample)

        # Predict on test set
        y_pred = svm_pipeline.predict(X_test)

        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)

        # Store results (the fitted pipeline is kept for the later coefficient analysis)
        results[kernel].append({
            'C': C,
            'accuracy': accuracy,
            'predictions': y_pred,
            'pipeline': svm_pipeline
        })

        print(f"C = {C}, Accuracy: {accuracy:.4f}")

        # Confusion matrix; rows are actual classes, columns are predicted classes
        cm = confusion_matrix(y_test, y_pred)

        # Calculate sensitivity and specificity
        tn, fp, fn, tp = cm.ravel()
        sensitivity = tp / (tp + fn)
        specificity = tn / (tn + fp)

        # Display confusion matrix as heatmap
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f"{kernel.capitalize()} Kernel (C={C})\n" +
                  f"Accuracy: {accuracy:.4f}\n" +
                  f"Sensitivity: {sensitivity:.4f}, Specificity: {specificity:.4f}")
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.tight_layout()
        plt.savefig(f'confusion_matrix_{kernel}_C_{C}.png')
        plt.close()

        # Classification report as a DataFrame (one row per class / average)
        report = classification_report(y_test, y_pred, output_dict=True)
        report_df = pd.DataFrame(report).transpose()

        # Plot only the 0-1 metrics: the 'support' column holds row counts that
        # would dominate the colour scale, and the 'accuracy' row just repeats
        # one number in every column
        report_metrics = (
            report_df
            .drop(index='accuracy', errors='ignore')
            .drop(columns='support', errors='ignore')
            .astype(float)
        )

        # Visualize classification report on a fixed 0-1 scale so that the
        # figures of different models are directly comparable
        plt.figure(figsize=(10, 6))
        sns.heatmap(report_metrics, annot=True, cmap='viridis', vmin=0, vmax=1)
        plt.title(f'Classification Report: {kernel.capitalize()} Kernel (C={C})')
        plt.tight_layout()
        plt.savefig(f'classification_report_{kernel}_C_{C}.png')
        plt.close()


--- SVM with linear kernel ---
C = 0.1, Accuracy: 0.8920
C = 1, Accuracy: 0.9000
C = 10, Accuracy: 0.9009
C = 100, Accuracy: 0.9009

--- SVM with poly kernel ---
C = 0.1, Accuracy: 0.8699
C = 1, Accuracy: 0.9059
C = 10, Accuracy: 0.9066
C = 100, Accuracy: 0.9012

--- SVM with rbf kernel ---
C = 0.1, Accuracy: 0.8700
C = 1, Accuracy: 0.9059
C = 10, Accuracy: 0.9104
C = 100, Accuracy: 0.9089


In [22]:
# For every kernel keep the result entry with the highest accuracy
# (on ties the entry that was trained first wins)
best_models = {}
for kernel in kernels:
    best_models[kernel] = max(results[kernel], key=lambda result: result['accuracy'])
    print(f"\nBest {kernel} model: C={best_models[kernel]['C']}, Accuracy={best_models[kernel]['accuracy']:.4f}")


Best linear model: C=100, Accuracy=0.9009

Best poly model: C=10, Accuracy=0.9066

Best rbf model: C=10, Accuracy=0.9104


In [23]:
# Refit each kernel on the two-feature dataset, reusing the C value that was
# best for that kernel, then report test accuracy and save the boundary plot
print("\n--- Training 2D models with best C values for decision boundary visualization ---")
for kernel in kernels:
    best_C = best_models[kernel]['C']
    kernel_label = kernel.capitalize()

    # Train SVM model on 2D data with best C value for this kernel
    model_2d = SVC(kernel=kernel, C=best_C, random_state=42)
    model_2d.fit(X_2d_train_scaled, y_2d_train)

    # Calculate accuracy on test set
    y_2d_pred = model_2d.predict(X_2d_test_scaled)
    accuracy = accuracy_score(y_2d_test, y_2d_pred)
    print(f"2D {kernel_label} SVM with best C={best_C}: Accuracy = {accuracy:.4f}")

    # Plot decision boundary over the scaled training points
    plot_decision_boundary(
        X=X_2d_train_scaled,
        y=y_2d_train,
        model=model_2d,
        title=f"{kernel_label} SVM Decision Boundary (C={best_C}, Accuracy={accuracy:.4f})",
        filename=f"{kernel}_svm_best_decision_boundary.png",
    )


--- Training 2D models with best C values for decision boundary visualization ---
2D Linear SVM with best C=100: Accuracy = 0.8050
2D Poly SVM with best C=10: Accuracy = 0.7979
2D Rbf SVM with best C=10: Accuracy = 0.8051


In [24]:
# One figure with a confusion-matrix panel for the best model of each kernel
fig = plt.figure(figsize=(18, 6))

for i, kernel in enumerate(kernels):
    ax = fig.add_subplot(1, 3, i + 1)
    best_entry = best_models[kernel]
    cm = confusion_matrix(y_test, best_entry['predictions'])

    # Sensitivity = TP / (TP + FN), specificity = TN / (TN + FP)
    tn, fp, fn, tp = cm.ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)

    # Heatmap of the raw counts with the summary metrics in the panel title
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f"{kernel.capitalize()} Kernel (C={best_entry['C']})\n" +
                 f"Accuracy: {best_entry['accuracy']:.4f}\n" +
                 f"Sensitivity: {sensitivity:.4f}, Specificity: {specificity:.4f}")
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

fig.tight_layout()
fig.savefig('svm_confusion_matrices.png')
plt.close(fig)

In [25]:
# Accuracy as a function of C (log-scaled axis), one line per kernel
fig, ax = plt.subplots(figsize=(10, 6))
for kernel in kernels:
    accuracies = [result['accuracy'] for result in results[kernel]]
    ax.plot(C_values, accuracies, marker='o', label=kernel)

ax.set_xscale('log')
ax.set_xlabel('Cost Parameter (C)')
ax.set_ylabel('Accuracy')
ax.set_title('SVM Performance Comparison Across Kernels')
ax.legend()
ax.grid(True)
fig.savefig('svm_kernel_comparison.png')
plt.close(fig)

In [26]:
# Feature importance analysis
# SVMs don't provide feature importance directly; for the linear kernel the
# absolute value of each coefficient is used as a proxy.
if 'linear' in best_models:
    try:
        linear_model = best_models['linear']['pipeline']
        # Extract the SVC classifier from the pipeline
        svc = linear_model.named_steps['classifier']

        # Take the names from the preprocessor step of this very pipeline, so
        # they always describe the columns the classifier was trained on
        fitted_preprocessor = linear_model.named_steps['preprocessor']

        # For numerical features, names remain the same
        numeric_feature_names = numeric_features

        # For categorical features, we need to get the encoded feature names
        cat_encoder = fitted_preprocessor.named_transformers_['cat']
        try:
            categorical_feature_names = cat_encoder.get_feature_names_out(categorical_features)
        except AttributeError:
            # Fallback for encoders without get_feature_names_out: the encoder
            # emits one output column per learned category
            n_encoded = sum(len(categories) for categories in cat_encoder.categories_)
            categorical_feature_names = [f"cat_{i}" for i in range(n_encoded)]

        # Get coefficients (only applicable for linear kernel)
        coefficients = svc.coef_[0]

        # Check dimensions before combining
        print(f"Number of features after preprocessing: {len(numeric_feature_names) + len(categorical_feature_names)}")
        print(f"Number of coefficients: {len(coefficients)}")

        # Names follow the ColumnTransformer output order: numeric block first,
        # then the one-hot block
        all_feature_names = list(numeric_feature_names) + list(categorical_feature_names)

        # In case of dimension mismatch, use indices instead of names
        if len(all_feature_names) != len(coefficients):
            all_feature_names = [f"Feature_{i}" for i in range(len(coefficients))]

        # Create feature importance DataFrame, largest |coefficient| first
        feature_importance = pd.DataFrame({
            'Feature': all_feature_names,
            'Importance': np.abs(coefficients)
        }).sort_values('Importance', ascending=False)

        # Plot top features
        plt.figure(figsize=(12, 8))
        top_n = min(20, len(feature_importance))
        sns.barplot(x='Importance', y='Feature', data=feature_importance.head(top_n))
        plt.title(f'Top {top_n} Features (Linear SVM)')
        plt.tight_layout()
        plt.savefig('svm_feature_importance.png')
        plt.close()
    except Exception as e:
        # Best-effort analysis: report the problem and let the notebook continue
        print(f"Error in feature importance analysis: {e}")

Number of features after preprocessing: 268
Number of coefficients: 268


In [27]:
# Text summary for the best model of each kernel: accuracy, per-class report,
# confusion matrix, sensitivity and specificity
print("\n--- Detailed Classification Reports ---")
for kernel in kernels:
    best_entry = best_models[kernel]
    best_C = best_entry['C']
    best_preds = best_entry['predictions']

    print(f"\n{kernel.upper()} KERNEL (C={best_C})")
    print(f"Accuracy: {best_entry['accuracy']:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, best_preds))

    cm = confusion_matrix(y_test, best_preds)
    print("\nConfusion Matrix:")
    print(cm)

    # Unpack the 2x2 matrix row by row: [[tn, fp], [fn, tp]]
    (tn, fp), (fn, tp) = cm
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    print(f"Sensitivity (True Positive Rate): {sensitivity:.4f}")
    print(f"Specificity (True Negative Rate): {specificity:.4f}")


--- Detailed Classification Reports ---

LINEAR KERNEL (C=100)
Accuracy: 0.9009

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.91      0.90     29752
           1       0.91      0.89      0.90     29751

    accuracy                           0.90     59503
   macro avg       0.90      0.90      0.90     59503
weighted avg       0.90      0.90      0.90     59503


Confusion Matrix:
[[27015  2737]
 [ 3157 26594]]
Sensitivity (True Positive Rate): 0.8939
Specificity (True Negative Rate): 0.9080

POLY KERNEL (C=10)
Accuracy: 0.9066

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.91      0.91     29752
           1       0.91      0.90      0.91     29751

    accuracy                           0.91     59503
   macro avg       0.91      0.91      0.91     59503
weighted avg       0.91      0.91      0.91     59503


Confusion Matrix:
[[27189  2563]
 [ 2992 26759]

In [28]:
# Pick the kernel whose best model has the highest accuracy
# (on ties the kernel that comes first in best_models wins)
best_kernel, best_overall = max(
    best_models.items(),
    key=lambda item: item[1]['accuracy'],
)
print("\n=== OVERALL BEST MODEL ===")
print(f"Kernel: {best_kernel}")
print(f"C value: {best_overall['C']}")
print(f"Accuracy: {best_overall['accuracy']:.4f}")


=== OVERALL BEST MODEL ===
Kernel: rbf
C value: 10
Accuracy: 0.9104
