In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.stats.multitest as smm
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB


Part 1

In [2]:
# Input files: the expression matrix and the per-sample metadata table.
# Both are tab-separated and use their first column as the row index.
expression_path = 'ERP009868.tsv'
metadata_path = 'updated_metadata_ERP009868.tsv'

expression_data = pd.read_csv(expression_path, sep='\t', index_col=0)
metadata = pd.read_csv(metadata_path, sep='\t', index_col=0)


In [3]:
# Swap rows and columns of the expression matrix so that the former column
# labels become the row index (the index is later used as the sample key).
# NOTE: this rebinds `expression_data` to its own transpose, so running the
# cell a second time flips the matrix back.
expression_data = expression_data.transpose()

Part 2

In [4]:
# Rank genes by variance. After the transpose each column is one gene, so
# axis=0 yields one variance per gene, computed across the rows (samples).
variances = expression_data.var(axis=0)

# Labels of the 5,000 highest-variance genes
top_genes = variances.nlargest(5000).index

# Restrict the expression matrix to those genes (all rows, selected columns)
expression_data_subset = expression_data.loc[:, top_genes]

In [5]:
# Merge expression data and metadata on a common sample identifier.
# Name the expression index so it can be referenced as the merge key.
expression_data_subset.index.name = 'SampleID'

# Expose the metadata index as a regular column and rename the accession-code
# column to the shared key name.
metadata_reset = metadata.reset_index().rename(columns={'refinebio_accession_code': 'SampleID'})

# Inner join: only samples present in both tables are kept.
data = expression_data_subset.merge(metadata_reset, on='SampleID')

# Binary target derived from 'refinebio_title': the exact title 'Danio rerio'
# marks a reference sample, every other value (including missing) is 'mutated'.
data['mutation_status'] = np.where(
    data['refinebio_title'] == 'Danio rerio', 'reference', 'mutated'
)

# Feature matrix: the selected gene columns only. The merged frame also
# carries the metadata columns (among them 'refinebio_title', from which the
# label is computed), which must not be handed to a classifier as predictors.
X = data[top_genes]
y = data['mutation_status']  # Target variable, row-aligned with X


In [6]:
# Step 2b: Split data into training and test sets.
# The features are taken from the merged frame `data` -- the same frame `y`
# was derived from -- so each feature row is guaranteed to be paired with its
# own label. Splitting `expression_data_subset` against `y` is only correct
# when the merge kept every sample in its original order.
X_train, X_test, y_train, y_test = train_test_split(
    data[top_genes], y, test_size=0.3, random_state=42
)


In [7]:
# Step 2c: fit a Gaussian Naive Bayes classifier on the training split
# (fit() returns the estimator itself, so construction and fitting chain).
nb_model = GaussianNB().fit(X_train, y_train)

# Score the held-out split and show per-class precision / recall / F1
y_pred = nb_model.predict(X_test)
group_report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", group_report_text)

Classification Report:
               precision    recall  f1-score   support

     mutated       0.33      0.18      0.23        34
   reference       0.95      0.98      0.96       532

    accuracy                           0.93       566
   macro avg       0.64      0.58      0.60       566
weighted avg       0.91      0.93      0.92       566



In [8]:
# Same report as above, but as a nested dict so it can be tabulated
report_dict = classification_report(y_test, y_pred, output_dict=True)

# One row per class / summary entry, one column per metric
report_df = pd.DataFrame(report_dict).T

# Persist the table to disk
report_df.to_csv('classification_report_naive_bayes.csv')

In [9]:
# Pair each predicted label with the true label of the same test row
# (both are in test-split order; the CSV carries no sample identifier).
y_pred_df = pd.DataFrame({
    'Predicted': y_pred,
    'Actual': y_test.values,
})

# Write the table without the positional index
y_pred_df.to_csv('naive_bayes_predictions.csv', index=False)

print("Predictions saved to 'naive_bayes_predictions.csv'")

Predictions saved to 'naive_bayes_predictions.csv'


In [10]:
# Step 2e: Repeat the classification with the cluster assignment as target.
# The 'Clusters' column is read from the merged frame `data`.
y_clusters = data['Clusters']

# Features come from `data` as well, so each feature row is paired with its
# own cluster label regardless of how the merge ordered or filtered samples.
# NOTE: this rebinds X_train / X_test / y_train / y_test and refits
# `nb_model`; from here on they refer to the cluster task.
X_train, X_test, y_train, y_test = train_test_split(
    data[top_genes], y_clusters, test_size=0.3, random_state=42
)
nb_model.fit(X_train, y_train)

# Predict and evaluate for clusters
y_pred_clusters = nb_model.predict(X_test)
print("Cluster Classification Report:\n", classification_report(y_test, y_pred_clusters))

Cluster Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96       326
           1       1.00      1.00      1.00         6
           2       0.57      1.00      0.73         4
           3       0.99      0.90      0.94       140
           4       0.80      0.82      0.81        39
           5       1.00      0.50      0.67         4
           6       0.87      1.00      0.93        47

    accuracy                           0.94       566
   macro avg       0.88      0.88      0.86       566
weighted avg       0.94      0.94      0.94       566



In [11]:
# Cluster-task report as a nested dict (y_test holds cluster labels here)
cluster_report_dict = classification_report(y_test, y_pred_clusters, output_dict=True)

# One row per cluster / summary entry, one column per metric
cluster_report_df = pd.DataFrame(cluster_report_dict).T

# Persist the table to disk
cluster_report_df.to_csv('cluster_classification_report_naive_bayes.csv')


In [12]:
# Pair each predicted cluster with the true cluster of the same test row
# (both are in test-split order; the CSV carries no sample identifier).
y_pred_clusters_df = pd.DataFrame({
    'Predicted_Cluster': y_pred_clusters,
    'Actual_Cluster': y_test.values,
})

# Write the table without the positional index
y_pred_clusters_df.to_csv('naive_bayes_cluster_predictions.csv', index=False)

print("Cluster predictions saved to 'naive_bayes_cluster_predictions.csv'")


Cluster predictions saved to 'naive_bayes_cluster_predictions.csv'


In [13]:
# Re-read both inputs from disk. `expression_data` was transposed in place
# earlier in the notebook, whereas the code that follows indexes genes along
# the rows (std(axis=1), .loc[genes]), so the original orientation is needed.
expression_path = 'ERP009868.tsv'
metadata_path = 'updated_metadata_ERP009868.tsv'

expression_data = pd.read_csv(expression_path, sep='\t', index_col=0)
metadata = pd.read_csv(metadata_path, sep='\t', index_col=0)

Part 3

In [14]:
# Read one model's prediction column from a CSV file
def load_prediction(file_path, pred_col):
    """Return the column ``pred_col`` of the CSV at ``file_path``.

    Parameters
    ----------
    file_path : str
        Path of the CSV file holding a model's predictions.
    pred_col : str
        Name of the column that stores the predicted labels.

    Returns
    -------
    pandas.Series or None
        The requested column, or ``None`` (after printing a warning) when the
        file has no column with that name.
    """
    table = pd.read_csv(file_path)
    if pred_col not in table.columns:
        print(f"Warning: Column '{pred_col}' not found in {file_path}.")
        return None
    return table[pred_col]

# (file, column) holding each model's group-label predictions
group_sources = {
    "Naive Bayes (Group)": ("naive_bayes_predictions.csv", "Predicted"),
    "KNN (Group)": ("knn_predictions.csv", "predicted"),
    "Logistic Regression (Group)": ("logistic_regression_predictions.csv", "Predicted_Label"),
    "Random Forest (Group)": ("rf_predictions.csv", "Predicted Group"),
}

# (file, column) holding each model's cluster predictions
cluster_sources = {
    "Naive Bayes (Cluster)": ("naive_bayes_cluster_predictions.csv", "Predicted_Cluster"),
    "KNN (Cluster)": ("knn_cluster_predictions.csv", "predicted_cluster"),
    "Logistic Regression (Cluster)": ("logistic_regression_cluster_predictions.csv", "Predicted_Label"),
    "Random Forest (Cluster)": ("rf_cluster_predictions.csv", "Predicted Group"),
}

# Load every source: all group files first, then all cluster files
group_loaded = {
    model: load_prediction(path, column)
    for model, (path, column) in group_sources.items()
}
cluster_loaded = {
    model: load_prediction(path, column)
    for model, (path, column) in cluster_sources.items()
}

# Drop models whose file lacked the expected column (load_prediction gave None)
group_predictions = {model: preds for model, preds in group_loaded.items() if preds is not None}
cluster_predictions = {model: preds for model, preds in cluster_loaded.items() if preds is not None}

# One column per model; rows are matched on the positional index of each file
group_predictions_df = pd.DataFrame(group_predictions)
cluster_predictions_df = pd.DataFrame(cluster_predictions)

In [15]:
# Part 3a: Calculate how many models predict each class label for each sample.
# value_counts() skips missing predictions, so a row for which fewer models
# supplied a prediction has a smaller total (and a smaller possible maximum).
class_counts = group_predictions_df.apply(lambda row: row.value_counts(), axis=1).fillna(0)
print("Number of models predicting each class label per sample:")
print(class_counts)

# Part 3b: Calculate how many models predict the same cluster for each sample
cluster_counts = cluster_predictions_df.apply(lambda row: row.value_counts(), axis=1).fillna(0)
print("\nNumber of models predicting the same cluster per sample:")
print(cluster_counts)

# Part 3c: Correlation between stability of cluster and class label prediction.
# The stability score of a sample is the size of its largest agreeing group of
# models, i.e. the row-wise maximum of the count tables above.
class_stability = class_counts.max(axis=1)
cluster_stability = cluster_counts.max(axis=1)

# Pearson correlation between class and cluster stability
stability_correlation, p_value = pearsonr(class_stability, cluster_stability)

# Bonferroni correction for 2 tests. A p-value is a probability, so the
# corrected value is capped at 1.0 instead of being allowed to reach up to 2.
alpha = 0.05
n_tests = 2
corrected_p_value = min(p_value * n_tests, 1.0)

print(f"\nStability Correlation between Group and Cluster Predictions: {stability_correlation:.2f}")
print(f"Corrected p-value: {corrected_p_value:.4f}")

if corrected_p_value < alpha:
    print("The stability of the cluster and class label prediction is significantly correlated.")
else:
    print("The stability of the cluster and class label prediction is not significantly correlated.")


Number of models predicting each class label per sample:
     mutated  reference
0        0.0        4.0
1        0.0        4.0
2        1.0        3.0
3        0.0        4.0
4        0.0        4.0
..       ...        ...
561      0.0        2.0
562      0.0        2.0
563      0.0        2.0
564      0.0        2.0
565      0.0        2.0

[566 rows x 2 columns]

Number of models predicting the same cluster per sample:
     0.0    1.0    2.0    3.0    4.0    5.0    6.0    18.0   28.0   30.0   \
0      3.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
1      1.0    0.0    1.0    0.0    0.0    0.0    1.0    0.0    0.0    0.0   
2      3.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
3      2.0    0.0    0.0    0.0    1.0    0.0    0.0    0.0    0.0    0.0   
4      2.0    0.0    0.0    1.0    0.0    0.0    0.0    0.0    0.0    0.0   
..     ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
561    1.0    0.0    0.0    0.0   

Part 4

In [None]:
# Keep only the numeric columns of the expression matrix (non-numeric columns
# are dropped, not converted).
expression_data = expression_data.select_dtypes(include=['number']).copy()

# Create sample to group mapping: accession code -> 'reference' / 'mutated'.
# `metadata` was read with index_col=0, so the accession code may sit in the
# index rather than in a column; reset_index() exposes it as a column in
# either case (the merge cell earlier relies on the same step).
metadata_flat = metadata.reset_index()
sample_to_group = {
    sample_name: ('reference' if title == "Danio rerio" else 'mutated')
    for sample_name, title in zip(
        metadata_flat['refinebio_accession_code'],
        metadata_flat['refinebio_title'],
    )
}

# Function to train Naive Bayes model with specified number of genes
def train_and_evaluate(n_genes):
    """Train and score a Gaussian Naive Bayes model on the most variable genes.

    Reads the module-level ``expression_data`` (genes along the rows, samples
    along the columns) and ``sample_to_group`` (sample name -> group label).

    Parameters
    ----------
    n_genes : int
        Number of genes to keep, ranked by standard deviation across samples.

    Returns
    -------
    tuple of (float, float)
        ``(accuracy, auc)`` on the held-out 20% split, where the AUC treats
        'mutated' as the positive class.

    Side effects
    ------------
    Writes the test-set predictions to ``predictions_{n_genes}_genes.csv``.
    """
    # Select the n most variable genes
    var_genes = expression_data.std(axis=1).sort_values(ascending=False).head(n_genes).index
    expression_subset = expression_data.loc[var_genes]

    # Prepare the data: samples as rows, one label per sample
    X = expression_subset.T
    y = [sample_to_group[sample] for sample in X.index]

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train and evaluate Naive Bayes model
    nb_model = GaussianNB()
    nb_model.fit(X_train, y_train)

    # Calculate accuracy
    accuracy = nb_model.score(X_test, y_test)

    # Calculate AUC with 'mutated' as the positive class.
    # predict_proba columns follow nb_model.classes_ (sorted labels), so the
    # 'mutated' column has to be looked up; a hard-coded column 1 would be the
    # 'reference' probability and would invert the AUC.
    y_pred_proba = nb_model.predict_proba(X_test)
    mutated_col = list(nb_model.classes_).index('mutated')
    prob_mutated = y_pred_proba[:, mutated_col]
    y_test_binary = [1 if label == 'mutated' else 0 for label in y_test]
    auc = roc_auc_score(y_test_binary, prob_mutated)

    # Save predictions
    y_pred = nb_model.predict(X_test)
    predictions_df = pd.DataFrame({
        'predicted': y_pred,
        'actual': y_test,
        'prob_mutated': prob_mutated
    }, index=X_test.index)
    predictions_df.to_csv(f'predictions_{n_genes}_genes.csv')

    return accuracy, auc

# Feature-set sizes to compare
gene_numbers = [10, 100, 1000, 10000]

# One record per size with the metrics returned by train_and_evaluate
results = []
for n_genes in gene_numbers:
    accuracy, auc = train_and_evaluate(n_genes)
    results.append(dict(n_genes=n_genes, accuracy=accuracy, auc=auc))

# Tabulate and persist the sweep
results_df = pd.DataFrame(results)
results_df.to_csv('gene_number_results.csv')

# Both metrics against the number of genes, on a logarithmic x-axis
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(results_df['n_genes'], results_df['auc'], marker='o', label='AUC')
ax.plot(results_df['n_genes'], results_df['accuracy'], marker='s', label='Accuracy')
ax.set_xscale('log')
ax.set_xlabel('Number of Genes')
ax.set_ylabel('Score')
ax.set_title('Naive Bayes Performance vs Number of Genes')
ax.legend()
ax.grid(True)
fig.savefig('performance_vs_genes.png')
plt.close(fig)

# Tabular summary
print("\nResults Summary:")
print(results_df.to_string(index=False))

# AUC difference between each size and the one before it
print("\nPerformance Changes:")
auc_deltas = results_df['auc'].diff().iloc[1:]
for prev_genes, curr_genes, auc_change in zip(gene_numbers, gene_numbers[1:], auc_deltas):
    print(f"\nFrom {prev_genes} to {curr_genes} genes:")
    print(f"AUC change: {auc_change:.4f}")


Part 5

In [None]:
# Expression matrix: first column becomes the row index
expression_data = pd.read_csv('ERP009868.tsv', sep='\t', index_col=0)

# Metadata table, read with a default positional index so that the accession
# code stays available as a regular column
metadata = pd.read_csv('metadata_ERP009868.tsv', sep='\t')

# Sample name -> group: the exact title 'Danio rerio' marks a reference
# sample, anything else counts as mutated
sample_to_group = {
    accession: ('reference' if title == 'Danio rerio' else 'mutated')
    for accession, title in zip(metadata['refinebio_accession_code'], metadata['refinebio_title'])
}

# Rank rows by variance and return the labels of the top ones
def get_variable_genes(data, n_genes):
    """Return the index labels of the ``n_genes`` highest-variance rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Matrix whose rows are ranked; variance is taken across each row.
    n_genes : int
        Maximum number of labels to return.

    Returns
    -------
    list
        Row labels ordered from highest to lowest variance.
    """
    row_variance = data.var(axis=1)
    ranked = row_variance.sort_values(ascending=False)
    top_labels = ranked.head(n_genes).index
    return top_labels.tolist()

# Collect genes from different model sizes. Every list is a prefix of the same
# variance ranking, so after de-duplication the result equals the largest list.
all_genes = []
for n_genes in [10, 100, 1000, 10000]:
    genes = get_variable_genes(expression_data, n_genes)
    all_genes.extend(genes)

# Remove duplicates while maintaining order
all_genes = list(dict.fromkeys(all_genes))

# Create expression matrix for selected genes (log2 with a pseudocount of 1)
log2_expression = np.log2(expression_data.loc[all_genes] + 1)

# Create a color mapping
color_map = {'reference': '#2ecc71', 'mutated': '#e74c3c'}  # Green for reference, Red for mutated

# Create a color list for the columns (one color per sample, in column order)
col_colors = [color_map[sample_to_group[sample]] for sample in log2_expression.columns]

# Plot the heatmap with dendrograms and side color bar.
# clustermap builds its own figure, so no plt.figure() call precedes it; a
# separate figure would stay empty and be rendered as a blank output.
g = sns.clustermap(log2_expression,
                   cmap='RdBu_r',
                   col_colors=col_colors,
                   col_cluster=True,  # Enable column clustering
                   row_cluster=True,  # Enable row clustering
                   cbar_kws={'label': 'Log2 Expression'},
                   yticklabels=True,
                   xticklabels=True,
                   dendrogram_ratio=(.1, .2),
                   figsize=(15, 12))

# Rotate x-axis labels
plt.setp(g.ax_heatmap.get_xticklabels(), rotation=45, ha='right')

# Add a legend
legend_elements = [plt.Rectangle((0, 0), 1, 1, facecolor=color_map[label], label=label.capitalize())
                   for label in color_map]
g.fig.legend(handles=legend_elements,
             title='Sample Groups',
             loc='center left',
             bbox_to_anchor=(1, 0.5))

# Set the title
g.fig.suptitle('Gene Expression Heatmap of Predictive Modeling Signatures',
               fontsize=16, y=1.02)

# Add axis labels
g.ax_heatmap.set_xlabel('Samples', fontsize=12)
g.ax_heatmap.set_ylabel('Genes', fontsize=12)

# Adjust the layout, then save and close the clustermap figure explicitly
# rather than relying on whichever figure happens to be current.
g.fig.tight_layout()
g.fig.savefig('predictive_signatures_heatmap.png', dpi=300, bbox_inches='tight')
plt.close(g.fig)

# Print summary statistics
print('\nHeatmap Generation Summary:')
print(f"Total number of genes: {len(all_genes)}")
print(f"Number of samples: {log2_expression.shape[1]}")
print("\nSample group counts:")
# Counts cover every sample listed in the metadata mapping
group_counts = pd.Series(sample_to_group.values()).value_counts()
for group, count in group_counts.items():
    print(f"{group}: {count}")




Heatmap Generation Summary:
Total number of genes: 10000
Number of samples: 1886

Sample group counts:
reference: 1766
mutated: 120


<Figure size 1500x1200 with 0 Axes>