In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cleaned-genomics/cleaned_data_no_outliers.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the dataset
Data = pd.read_csv('/kaggle/input/cleaned-genomics/cleaned_data_no_outliers.csv')

In [4]:
output_dir = "/kaggle/working/"

In [5]:
# Derive a binary target from LN_IC50: rows strictly below the median are
# labelled 1 (lower LN_IC50 = higher sensitivity), all other rows 0.
# The median is taken over every row of Data, i.e. before the train/test split.
ln_ic50 = Data['LN_IC50']
median_ln_ic50 = ln_ic50.median()
Data['DRUG_SENSITIVITY'] = ln_ic50.lt(median_ln_ic50).astype(int)
class_counts = Data['DRUG_SENSITIVITY'].value_counts()
print(f"Classification threshold (median LN_IC50): {median_ln_ic50}")
print(f"Class distribution: {class_counts}")

Classification threshold (median LN_IC50): 3.5575039999999998
Class distribution: DRUG_SENSITIVITY
1    99171
0    99171
Name: count, dtype: int64


In [6]:
# Feature selection: a mix of genomic and cell-line descriptors.
# Columns holding category labels (integer-encoded in the next step).
categorical_features = [
    'TCGA_DESC',
    'GDSC Tissue descriptor 1',
    'GDSC Tissue descriptor 2',
    'Microsatellite instability Status (MSI)',
    'Growth Properties',
    'Screen Medium',
    'TARGET',
    'TARGET_PATHWAY',
]
# Columns copied into the feature matrix unchanged.
# NOTE(review): Z_SCORE and AUC may be derived from the same drug-response
# measurement as LN_IC50 (the source of the target); if so they leak the
# label into the features -- confirm against the dataset documentation.
numerical_features = [
    'CNA',
    'Gene Expression',
    'Methylation',
    'Z_SCORE',
    'AUC',
]

# Containers filled by the encoding step: the fitted encoder for each
# categorical column, and the assembled feature table.
label_encoders = {}
encoded_features = pd.DataFrame()

In [7]:
# Integer-encode every categorical column, keeping each fitted encoder so the
# codes can be mapped back to the original labels later.
for cat_col in categorical_features:
    encoder = LabelEncoder()
    encoded_features[cat_col] = encoder.fit_transform(Data[cat_col])
    label_encoders[cat_col] = encoder

# Numerical columns are carried over as-is (aligned on the row index).
encoded_features[numerical_features] = Data[numerical_features]

In [8]:
# Assemble the feature matrix and the binary target vector
X = encoded_features
y = Data['DRUG_SENSITIVITY']

# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set: {len(X_train)} samples")
print(f"Testing set: {len(X_test)} samples")

Training set: 138839 samples
Testing set: 59503 samples


In [9]:
X_train.head().T

Unnamed: 0,184213,125178,141898,62145,55132
TCGA_DESC,31.0,8.0,31.0,17.0,31.0
GDSC Tissue descriptor 1,11.0,1.0,2.0,9.0,11.0
GDSC Tissue descriptor 2,10.0,42.0,43.0,32.0,4.0
Microsatellite instability Status (MSI),1.0,1.0,1.0,1.0,1.0
Growth Properties,2.0,0.0,0.0,0.0,2.0
Screen Medium,1.0,1.0,0.0,0.0,1.0
TARGET,172.0,63.0,163.0,47.0,63.0
TARGET_PATHWAY,21.0,11.0,22.0,7.0,11.0
CNA,1.0,1.0,1.0,1.0,1.0
Gene Expression,1.0,1.0,1.0,1.0,1.0


In [10]:
y_train.head()

184213    1
125178    0
141898    0
62145     0
55132     1
Name: DRUG_SENSITIVITY, dtype: int64

In [11]:
X_test.head().T

Unnamed: 0,175400,173880,147597,188281,113681
TCGA_DESC,16.0,27.0,4.0,9.0,9.0
GDSC Tissue descriptor 1,9.0,16.0,19.0,13.0,13.0
GDSC Tissue descriptor 2,28.0,38.0,14.0,20.0,20.0
Microsatellite instability Status (MSI),1.0,1.0,1.0,1.0,1.0
Growth Properties,0.0,0.0,0.0,0.0,0.0
Screen Medium,0.0,1.0,0.0,0.0,1.0
TARGET,44.0,70.0,7.0,58.0,3.0
TARGET_PATHWAY,2.0,7.0,18.0,4.0,16.0
CNA,1.0,1.0,1.0,1.0,1.0
Gene Expression,1.0,1.0,1.0,1.0,1.0


In [12]:
y_test.head()

175400    1
173880    1
147597    1
188281    0
113681    0
Name: DRUG_SENSITIVITY, dtype: int64

In [13]:
# Persist the raw train/test splits as CSV files (row index not written).
# os.path.join is used because output_dir already ends with "/", so the
# previous f"{output_dir}/..." form produced paths containing "//".
nb_split_files = {
    "X_train": (X_train, "NB_Xtrain.csv"),
    "X_test": (X_test, "NB_Xtest.csv"),
    "y_train": (y_train, "NB_ytrain.csv"),
    "y_test": (y_test, "NB_ytest.csv"),
}
for split_name, (split_data, file_name) in nb_split_files.items():
    split_path = os.path.join(output_dir, file_name)
    split_data.to_csv(split_path, index=False)
    print(f"{split_name} dataset saved to {split_path}")

X_train dataset saved to /kaggle/working//NB_Xtrain.csv
X_test dataset saved to /kaggle/working//NB_Xtest.csv
y_train dataset saved to /kaggle/working//NB_ytrain.csv
y_test dataset saved to /kaggle/working//NB_ytest.csv


In [14]:
# For Multinomial NB: Features must be non-negative
# Work on copies so the original splits stay untouched.
X_train_multinomial = X_train.copy()
X_test_multinomial = X_test.copy()

# Any column whose training minimum is negative is shifted so that the
# training minimum becomes 0 (a pure shift, not a min-max rescale).
# The same training-derived shift is applied to the test split; test values
# that fall below the training minimum would still be negative after the
# shift, so they are clipped to 0 to keep the non-negativity guarantee.
for col in X_train_multinomial.columns:
    min_val = X_train_multinomial[col].min()
    if min_val < 0:
        X_train_multinomial[col] = X_train_multinomial[col] - min_val
        X_test_multinomial[col] = (X_test_multinomial[col] - min_val).clip(lower=0)

In [15]:
X_train_multinomial.head().T

Unnamed: 0,184213,125178,141898,62145,55132
TCGA_DESC,31.0,8.0,31.0,17.0,31.0
GDSC Tissue descriptor 1,11.0,1.0,2.0,9.0,11.0
GDSC Tissue descriptor 2,10.0,42.0,43.0,32.0,4.0
Microsatellite instability Status (MSI),1.0,1.0,1.0,1.0,1.0
Growth Properties,2.0,0.0,0.0,0.0,2.0
Screen Medium,1.0,1.0,0.0,0.0,1.0
TARGET,172.0,63.0,163.0,47.0,63.0
TARGET_PATHWAY,21.0,11.0,22.0,7.0,11.0
CNA,1.0,1.0,1.0,1.0,1.0
Gene Expression,1.0,1.0,1.0,1.0,1.0


In [16]:
X_test_multinomial.head().T

Unnamed: 0,175400,173880,147597,188281,113681
TCGA_DESC,16.0,27.0,4.0,9.0,9.0
GDSC Tissue descriptor 1,9.0,16.0,19.0,13.0,13.0
GDSC Tissue descriptor 2,28.0,38.0,14.0,20.0,20.0
Microsatellite instability Status (MSI),1.0,1.0,1.0,1.0,1.0
Growth Properties,0.0,0.0,0.0,0.0,0.0
Screen Medium,0.0,1.0,0.0,0.0,1.0
TARGET,44.0,70.0,7.0,58.0,3.0
TARGET_PATHWAY,2.0,7.0,18.0,4.0,16.0
CNA,1.0,1.0,1.0,1.0,1.0
Gene Expression,1.0,1.0,1.0,1.0,1.0


In [17]:
# Persist the MultinomialNB-specific feature matrices as CSV files.
# os.path.join is used because output_dir already ends with "/", so the
# previous f"{output_dir}/..." form produced paths containing "//".
mnb_split_files = {
    "X_train_multinomial": (X_train_multinomial, "MNB_Xtrain.csv"),
    "X_test_multinomial": (X_test_multinomial, "MNB_Xtest.csv"),
}
for split_name, (split_data, file_name) in mnb_split_files.items():
    split_path = os.path.join(output_dir, file_name)
    split_data.to_csv(split_path, index=False)
    print(f"{split_name} dataset saved to {split_path}")

X_train_multinomial dataset saved to /kaggle/working//MNB_Xtrain.csv
X_test_multinomial dataset saved to /kaggle/working//MNB_Xtest.csv


In [18]:
# Bernoulli NB works on binary inputs: each value becomes 1 when it is
# strictly greater than the training-set median of its column, otherwise 0.
# The test split is binarized with the same training medians.
train_medians = X_train.median()
X_train_bernoulli = X_train.gt(train_medians, axis='columns').astype(int)
X_test_bernoulli = X_test.gt(train_medians, axis='columns').astype(int)

# Gaussian NB is fitted on unmodified copies of the splits.
X_train_gaussian, X_test_gaussian = X_train.copy(), X_test.copy()

In [19]:
X_train_bernoulli.head().T

Unnamed: 0,184213,125178,141898,62145,55132
TCGA_DESC,1,0,1,0,1
GDSC Tissue descriptor 1,1,0,0,0,1
GDSC Tissue descriptor 2,0,1,1,1,0
Microsatellite instability Status (MSI),0,0,0,0,0
Growth Properties,1,0,0,0,1
Screen Medium,0,0,0,0,0
TARGET,1,0,1,0,0
TARGET_PATHWAY,1,0,1,0,0
CNA,0,0,0,0,0
Gene Expression,0,0,0,0,0


In [20]:
X_test_bernoulli.head().T

Unnamed: 0,175400,173880,147597,188281,113681
TCGA_DESC,0,1,0,0,0
GDSC Tissue descriptor 1,0,1,1,1,1
GDSC Tissue descriptor 2,0,1,0,0,0
Microsatellite instability Status (MSI),0,0,0,0,0
Growth Properties,0,0,0,0,0
Screen Medium,0,0,0,0,0
TARGET,0,0,0,0,0
TARGET_PATHWAY,0,0,1,0,1
CNA,0,0,0,0,0
Gene Expression,0,0,0,0,0


In [21]:
# Persist the binarized (BernoulliNB) feature matrices as CSV files.
# os.path.join is used because output_dir already ends with "/", so the
# previous f"{output_dir}/..." form produced paths containing "//".
bnb_split_files = {
    "X_train_bernoulli": (X_train_bernoulli, "BNB_Xtrain.csv"),
    "X_test_bernoulli": (X_test_bernoulli, "BNB_Xtest.csv"),
}
for split_name, (split_data, file_name) in bnb_split_files.items():
    split_path = os.path.join(output_dir, file_name)
    split_data.to_csv(split_path, index=False)
    print(f"{split_name} dataset saved to {split_path}")

X_train_bernoulli dataset saved to /kaggle/working//BNB_Xtrain.csv
X_test_bernoulli dataset saved to /kaggle/working//BNB_Xtest.csv


In [22]:
# Function to evaluate and visualize model performance
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model``, score it on the test split and save a confusion-matrix plot.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place.
    X_train, y_train
        Features and targets used for fitting.
    X_test, y_test
        Features and targets used for scoring.
    model_name : str
        Label used in the printed report, the plot title and the PNG file
        name (spaces replaced by underscores, lower-cased).

    Returns
    -------
    tuple
        ``(accuracy, cm)`` -- the test accuracy and the confusion matrix.

    Side effects
    ------------
    Prints accuracy, confusion matrix and classification report, and writes
    ``<model_name>_confusion_matrix.png`` to the current working directory.
    """
    # Train the model and predict on the held-out split
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    # Labels are pinned explicitly (sorted union of true and predicted
    # labels, which is also confusion_matrix's default) so the matrix rows/
    # columns and the heatmap tick labels are guaranteed to agree.
    class_labels = np.unique(
        np.concatenate([np.asarray(y_test).ravel(), np.asarray(y_pred).ravel()])
    )
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    # Print results
    print(f"\n{model_name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print("\nConfusion Matrix:")
    print(cm)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Plot the confusion matrix on an explicit Figure/Axes so that only this
    # figure is saved and closed, regardless of other open figures.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=class_labels, yticklabels=class_labels, ax=ax)
    ax.set_title(f'Confusion Matrix - {model_name}')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.tight_layout()
    fig.savefig(f'{model_name.replace(" ", "_").lower()}_confusion_matrix.png')
    plt.close(fig)

    return accuracy, cm

In [23]:
# Each Naive Bayes variant is fitted and scored on the feature matrices
# prepared for it; the targets are shared by all three runs.

# 1. Multinomial Naive Bayes (non-negative features)
mnb = MultinomialNB()
mnb_accuracy, mnb_cm = evaluate_model(
    model=mnb,
    X_train=X_train_multinomial,
    X_test=X_test_multinomial,
    y_train=y_train,
    y_test=y_test,
    model_name="Multinomial Naive Bayes",
)

# 2. Gaussian Naive Bayes (unmodified features)
gnb = GaussianNB()
gnb_accuracy, gnb_cm = evaluate_model(
    model=gnb,
    X_train=X_train_gaussian,
    X_test=X_test_gaussian,
    y_train=y_train,
    y_test=y_test,
    model_name="Gaussian Naive Bayes",
)

# 3. Bernoulli Naive Bayes (median-binarized features)
bnb = BernoulliNB()
bnb_accuracy, bnb_cm = evaluate_model(
    model=bnb,
    X_train=X_train_bernoulli,
    X_test=X_test_bernoulli,
    y_train=y_train,
    y_test=y_test,
    model_name="Bernoulli Naive Bayes",
)


Multinomial Naive Bayes Results:
Accuracy: 0.5824

Confusion Matrix:
[[19012 10740]
 [14111 15640]]

Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.64      0.60     29752
           1       0.59      0.53      0.56     29751

    accuracy                           0.58     59503
   macro avg       0.58      0.58      0.58     59503
weighted avg       0.58      0.58      0.58     59503


Gaussian Naive Bayes Results:
Accuracy: 0.7993

Confusion Matrix:
[[25961  3791]
 [ 8150 21601]]

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.87      0.81     29752
           1       0.85      0.73      0.78     29751

    accuracy                           0.80     59503
   macro avg       0.81      0.80      0.80     59503
weighted avg       0.81      0.80      0.80     59503


Bernoulli Naive Bayes Results:
Accuracy: 0.7654

Confusion Matrix:
[[20792  8960]
 [ 5000 24751]]


In [29]:
# Compare the performance of the three models
models = ['Multinomial NB', 'Gaussian NB', 'Bernoulli NB']
accuracies = [mnb_accuracy, gnb_accuracy, bnb_accuracy]

# Hand seaborn a DataFrame instead of plain Python lists: the recorded output
# of this cell shows a pandas warning raised at `order = pd.unique(vector)`,
# which is reached when the categorical axis is given as a list.
comparison_df = pd.DataFrame({'Model': models, 'Accuracy': accuracies})

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=comparison_df, x='Model', y='Accuracy', ax=ax)
ax.set_ylim(0, 1)
ax.set_title('Accuracy Comparison of Naive Bayes Models')
ax.set_xlabel('')  # the bars are self-describing; keep the x axis unlabeled
ax.set_ylabel('Accuracy')
fig.tight_layout()
fig.savefig('nb_model_comparison.png')
plt.close(fig)

  order = pd.unique(vector)


In [30]:
# Visualization of feature importance (taking Gaussian NB as example)
# For Gaussian NB, we can use the variance of each feature for each class
# as a simple measure of feature importance
def plot_feature_importance_gnb(model, feature_names, top_n=10):
    """Plot the features with the largest class-averaged variance of a fitted Gaussian NB.

    Parameters
    ----------
    model
        Fitted estimator exposing ``var_``, indexed by class position
        (one row of per-feature variances for each class).
    feature_names
        Feature labels, in the same order as the columns of ``var_``.
    top_n : int, default 10
        Number of highest-variance features to show.

    Returns
    -------
    None
        The bar chart is written to ``gnb_feature_importance.png`` in the
        current working directory and the figure is closed.

    Notes
    -----
    Variance depends on each feature's scale, so this ranking is only a rough
    proxy and is not a measure of predictive importance on unscaled features.
    """
    # Average the per-class variances feature by feature
    avg_var = np.asarray(model.var_).mean(axis=0)

    # Rank the features and keep the top_n largest
    importance_df = (
        pd.DataFrame({'Feature': list(feature_names), 'Importance': avg_var})
        .sort_values('Importance', ascending=False)
        .head(top_n)
    )

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=importance_df, ax=ax)
    ax.set_title(f'Top {top_n} Features by Variance (Gaussian NB)')
    fig.tight_layout()
    fig.savefig('gnb_feature_importance.png')
    plt.close(fig)

plot_feature_importance_gnb(gnb, X_train.columns)

In [31]:
# Visualizing the distribution of correctly and incorrectly classified samples
def plot_classification_distribution(actual, predicted, feature_data, feature_name):
    """Plot a stacked histogram of one feature, split by prediction correctness.

    Parameters
    ----------
    actual
        True labels, one per sample.
    predicted
        Predicted labels, in the same sample order as ``actual``.
    feature_data
        Values of the feature to histogram, in the same sample order.
    feature_name : str
        Used for the x-axis label, the title and the output file name.

    Returns
    -------
    None
        The figure is written to ``classification_distribution_<feature_name>.png``
        in the current working directory and then closed.

    Raises
    ------
    ValueError
        If the three inputs do not have the same length.
    """
    # Convert everything to plain arrays so samples are matched by position.
    # Mixing pandas Series (index-aligned) with ndarrays (positional) could
    # otherwise pair values from different rows if the indexes ever differ.
    actual_arr = np.asarray(actual)
    predicted_arr = np.asarray(predicted)
    feature_arr = np.asarray(feature_data)
    if not (len(actual_arr) == len(predicted_arr) == len(feature_arr)):
        raise ValueError(
            "actual, predicted and feature_data must have the same length"
        )

    # One row per sample: feature value, labels and whether the prediction hit
    df = pd.DataFrame({
        'Feature': feature_arr,
        'Actual': actual_arr,
        'Predicted': predicted_arr,
        'Correct': actual_arr == predicted_arr,
    })

    # Plot the distribution
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(data=df, x='Feature', hue='Correct', multiple='stack', ax=ax)
    ax.set_title(f'Distribution of Correct and Incorrect Classifications by {feature_name}')
    ax.set_xlabel(feature_name)
    ax.set_ylabel('Count')
    fig.tight_layout()
    fig.savefig(f'classification_distribution_{feature_name}.png')
    plt.close(fig)

# Example: Plot distribution for Z_SCORE
# (the unused positional lookup `z_score_index` was dropped; the column is
# selected by name from the test split instead)
plot_classification_distribution(y_test, gnb.predict(X_test_gaussian),
                                 X_test['Z_SCORE'], 'Z_SCORE')

  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)
