In [10]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from scipy.stats import pearsonr
import scipy.cluster.hierarchy as sch

# Part 1

In [2]:
expression_data = pd.read_csv('ERP009868.tsv', sep='\t')
metadata = pd.read_csv('metadata_ERP009868.tsv', sep='\t')

print(expression_data.head())
print(metadata.head())


                 Gene   ERR1046065   ERR1046066   ERR1046067   ERR1046068  \
0  ENSDARG00000000001     0.000000     1.882786     0.970794     0.000000   
1  ENSDARG00000000002   108.934780   171.447810   153.786990   153.121580   
2  ENSDARG00000000018   151.847600   118.113464   133.184840   191.133770   
3  ENSDARG00000000019  2552.638200  1639.096300  1813.370100  1666.400300   
4  ENSDARG00000000068     3.655305     4.331904     3.624755     6.494877   

    ERR1046069   ERR1046070   ERR1046071   ERR1046072   ERR1046073  ...  \
0     0.984327     0.000000     0.000000     0.967930     0.000000  ...   
1   153.099240    76.990480   216.064210   206.836320    96.608505  ...   
2   132.907530   154.294390   179.296080   224.272670   190.119830  ...   
3  1432.987900  1029.325000  1830.156700  1900.549700  1884.948700  ...   
4     5.328189     3.728846     4.406043     2.764331     0.947352  ...   

    ERR999562   ERR999563   ERR999564   ERR999565   ERR999566  ERR999567  \
0    0.000

In [3]:
print(expression_data.columns)
print(metadata.columns)

Index(['Gene', 'ERR1046065', 'ERR1046066', 'ERR1046067', 'ERR1046068',
       'ERR1046069', 'ERR1046070', 'ERR1046071', 'ERR1046072', 'ERR1046073',
       ...
       'ERR999562', 'ERR999563', 'ERR999564', 'ERR999565', 'ERR999566',
       'ERR999567', 'ERR999568', 'ERR999569', 'ERR999570', 'ERR999571'],
      dtype='object', length=1887)
Index(['refinebio_accession_code', 'experiment_accession', 'refinebio_age',
       'refinebio_cell_line', 'refinebio_compound',
       'refinebio_developmental_stage', 'refinebio_disease',
       'refinebio_disease_stage', 'refinebio_genetic_information',
       'refinebio_organism', 'refinebio_platform', 'refinebio_processed',
       'refinebio_processor_id', 'refinebio_processor_name',
       'refinebio_processor_version', 'refinebio_race', 'refinebio_sex',
       'refinebio_source_archive_url', 'refinebio_source_database',
       'refinebio_specimen_part', 'refinebio_subject', 'refinebio_time',
       'refinebio_title', 'refinebio_treatment'],
      

# Part 2

In [14]:
# Part A - D

import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Load the data
expression_data = pd.read_csv('ERP009868.tsv', sep='\t')
metadata = pd.read_csv('metadata_ERP009868.tsv', sep='\t')

# Check the data types of the expression_data columns
print(expression_data.dtypes)

# Convert non-numeric columns to numeric
expression_data = expression_data.select_dtypes(include=['number']).copy()

# Subset to the 5,000 most variable genes
var_genes = expression_data.std(axis=1).sort_values(ascending=False).head(5000).index
expression_subset = expression_data.loc[var_genes]

# Create a dictionary to map sample names to their group
sample_to_group = {}
for _, row in metadata.iterrows():
    sample_name = row['refinebio_accession_code']
    if row['refinebio_title'] == "Danio rerio":
        sample_to_group[sample_name] = 'reference'
    else:
        sample_to_group[sample_name] = 'mutated'

# Subset the expression data to the samples with known group labels
X = expression_subset.T
y = [sample_to_group[sample] for sample in X.index]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Run K-Nearest Neighbors
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_score = knn.score(X_test, y_test)
print(f'K-Nearest Neighbors Accuracy: {knn_score:.2f}')


Gene           object
ERR1046065    float64
ERR1046066    float64
ERR1046067    float64
ERR1046068    float64
               ...   
ERR999567     float64
ERR999568     float64
ERR999569     float64
ERR999570     float64
ERR999571     float64
Length: 1887, dtype: object
K-Nearest Neighbors Accuracy: 0.99


In [15]:
# Get predictions for test set
y_pred = knn.predict(X_test)

# Create a DataFrame with the predictions and actual values
results_df = pd.DataFrame({
    'predicted': y_pred,
    'actual': y_test
}, index=X_test.index)

# Save to CSV file
results_df.to_csv('knn_predictions.csv')

# Optional: Display first few rows of the results
print("\nFirst few predictions:")
print(results_df.head())


First few predictions:
            predicted     actual
ERR1122721  reference  reference
ERR1122796  reference  reference
ERR1204256  reference  reference
ERR999427   reference  reference
ERR1216370  reference  reference


In [16]:
# Part E

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

scaler = StandardScaler()
scaled_data = scaler.fit_transform(expression_subset.T)

# Range of cluster numbers to test
k_values = range(2, 11)
results = {}

for n_clusters in k_values:
    spectral = SpectralClustering(n_clusters=n_clusters, affinity='nearest_neighbors', random_state=42)
    clusters = spectral.fit_predict(scaled_data)
    results[n_clusters] = clusters

# Select the number of clusters based on the results
selected_k = 5

# Retrain the K-Nearest Neighbors model to predict the clusters
X = expression_subset.T
y = results[selected_k]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_score = knn.score(X_test, y_test)
print(f'K-Nearest Neighbors Accuracy (Spectral Clusters): {knn_score:.2f}')



K-Nearest Neighbors Accuracy (Spectral Clusters): 0.97


In [17]:
# Get predictions for test set
y_pred = knn.predict(X_test)

# Create a DataFrame with the predictions and actual cluster labels
cluster_results_df = pd.DataFrame({
    'predicted_cluster': y_pred,
    'actual_cluster': y_test
}, index=X_test.index)

# Save to CSV file
cluster_results_df.to_csv('knn_cluster_predictions.csv')

# Optional: Display first few rows of the results
print("\nFirst few cluster predictions:")
print(cluster_results_df.head())


First few cluster predictions:
            predicted_cluster  actual_cluster
ERR1122721                  0               0
ERR1122796                  2               0
ERR1204256                  0               0
ERR999427                   4               4
ERR1216370                  0               0


# Part 3

In [19]:
# Part A & B

import pandas as pd
import numpy as np

# Load the prediction files
binary_predictions = pd.read_csv('knn_predictions.csv', index_col=0)
cluster_predictions = pd.read_csv('knn_cluster_predictions.csv', index_col=0)

# Create a combined matrix of all predictions
predictions_matrix = pd.DataFrame({
    'binary_pred': binary_predictions['predicted'],
    'binary_actual': binary_predictions['actual'],
    'cluster_pred': cluster_predictions['predicted_cluster'],
    'cluster_actual': cluster_predictions['actual_cluster']
})

# Initialize DataFrame to store results
sample_analysis = pd.DataFrame(index=predictions_matrix.index)

# a. Count predictions for each class label
# For binary classification (reference/mutated)
binary_counts = pd.DataFrame({
    'reference_count': (predictions_matrix['binary_pred'] == 'reference').astype(int),
    'mutated_count': (predictions_matrix['binary_pred'] == 'mutated').astype(int)
})

# For cluster classification (0-4)
for cluster in range(5):  # since selected_k = 5
    sample_analysis[f'cluster_{cluster}_count'] = (predictions_matrix['cluster_pred'] == cluster).astype(int)

# b. Count models predicting the same label/cluster
sample_analysis['binary_correct'] = (predictions_matrix['binary_pred'] == predictions_matrix['binary_actual']).astype(int)
sample_analysis['cluster_correct'] = (predictions_matrix['cluster_pred'] == predictions_matrix['cluster_actual']).astype(int)

# Calculate AUC (in this case, accuracy per sample)
sample_analysis['binary_auc'] = sample_analysis['binary_correct']
sample_analysis['cluster_auc'] = sample_analysis['cluster_correct']

# Add binary counts to final DataFrame
sample_analysis = pd.concat([sample_analysis, binary_counts], axis=1)

# Save results
sample_analysis.to_csv('sample_specific_auc.csv')

# Print summary statistics
print("\nSummary Statistics:")
print(f"Average Binary AUC: {sample_analysis['binary_auc'].mean():.3f}")
print(f"Average Cluster AUC: {sample_analysis['cluster_auc'].mean():.3f}")

# Print first few rows of the detailed analysis
print("\nFirst few rows of sample-specific analysis:")
print(sample_analysis.head())


Summary Statistics:
Average Binary AUC: 0.989
Average Cluster AUC: 0.968

First few rows of sample-specific analysis:
            cluster_0_count  cluster_1_count  cluster_2_count  \
ERR1122721                1                0                0   
ERR1122796                0                0                1   
ERR1204256                1                0                0   
ERR999427                 0                0                0   
ERR1216370                1                0                0   

            cluster_3_count  cluster_4_count  binary_correct  cluster_correct  \
ERR1122721                0                0               1                1   
ERR1122796                0                0               1                0   
ERR1204256                0                0               1                1   
ERR999427                 0                1               1                1   
ERR1216370                0                0               1                1   

   

In [20]:
# Part c

import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats.multitest import multipletests

# Load the sample-specific analysis file
sample_analysis = pd.read_csv('sample_specific_auc.csv', index_col=0)

# Calculate correlation tests
# 1. Pearson correlation
pearson_corr, pearson_pval = stats.pearsonr(
    sample_analysis['binary_auc'],
    sample_analysis['cluster_auc']
)

# 2. Spearman correlation
spearman_corr, spearman_pval = stats.spearmanr(
    sample_analysis['binary_auc'],
    sample_analysis['cluster_auc']
)

# Create array of p-values for multiple test correction
pvalues = np.array([pearson_pval, spearman_pval])

# Perform multiple test correction using Benjamini-Hochberg method
rejected, pvalues_corrected, _, _ = multipletests(pvalues, method='fdr_bh')

# Create results DataFrame
correlation_results = pd.DataFrame({
    'Test': ['Pearson', 'Spearman'],
    'Correlation': [pearson_corr, spearman_corr],
    'Original_P_value': pvalues,
    'Adjusted_P_value': pvalues_corrected,
    'Significant': rejected
})

# Save correlation results
correlation_results.to_csv('correlation_results.csv')

# Print results
print("\nCorrelation Analysis Results:")
print(correlation_results)

# Optional: Create a scatter plot to visualize the correlation
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
plt.scatter(sample_analysis['binary_auc'], sample_analysis['cluster_auc'], alpha=0.5)
plt.xlabel('Binary Classification AUC')
plt.ylabel('Cluster Classification AUC')
plt.title('Correlation between Binary and Cluster Prediction Stability')
plt.savefig('correlation_plot.png')
plt.close()

# Print interpretation
print("\nInterpretation:")
for idx, row in correlation_results.iterrows():
    print(f"\n{row['Test']} correlation:")
    print(f"Correlation coefficient: {row['Correlation']:.3f}")
    print(f"Adjusted p-value: {row['Adjusted_P_value']:.3e}")
    print(f"Statistically significant: {row['Significant']}")


Correlation Analysis Results:
       Test  Correlation  Original_P_value  Adjusted_P_value  Significant
0   Pearson     0.276208      4.791801e-08      4.791801e-08         True
1  Spearman     0.276208      4.791801e-08      4.791801e-08         True

Interpretation:

Pearson correlation:
Correlation coefficient: 0.276
Adjusted p-value: 4.792e-08
Statistically significant: True

Spearman correlation:
Correlation coefficient: 0.276
Adjusted p-value: 4.792e-08
Statistically significant: True


In [7]:
import pandas as pd

# Load each prediction file with its specific column name
def load_prediction(file_path, pred_col):
    df = pd.read_csv(file_path)
    if pred_col in df.columns:
        return df[pred_col]
    else:
        print(f"Warning: Column '{pred_col}' not found in {file_path}.")
        return None

# Group prediction columns based on actual column names
group_predictions = {
    "Naive Bayes (Group)": load_prediction("naive_bayes_predictions.csv", "Predicted"),
    "KNN (Group)": load_prediction("knn_predictions.csv", "predicted"),
    "Logistic Regression (Group)": load_prediction("logistic_predictions.csv", "Predicted_Label"),
    "Random Forest (Group)": load_prediction("rf_predictions.csv", "Predicted Group")
}

# Cluster prediction columns based on actual column names
cluster_predictions = {
    "Naive Bayes (Cluster)": load_prediction("naive_bayes_cluster_predictions.csv", "Predicted_Cluster"),
    "KNN (Cluster)": load_prediction("knn_cluster_predictions.csv", "predicted_cluster"),
    "Logistic Regression (Cluster)": load_prediction("logistic_cluster_predictions.csv", "Predicted_Label"),
    "Random Forest (Cluster)": load_prediction("rf_cluster_predictions.csv", "Predicted Group")
}

# Filter out any None values (i.e., files without the specified column)
group_predictions = {k: v for k, v in group_predictions.items() if v is not None}
cluster_predictions = {k: v for k, v in cluster_predictions.items() if v is not None}

# Combine group predictions into a DataFrame and calculate class counts
if group_predictions:
    group_matrix = pd.DataFrame(group_predictions)
    group_matrix.index.name = 'SampleID'
    group_class_counts = group_matrix.apply(pd.Series.value_counts, axis=1).fillna(0).astype(int)
    group_matrix.to_csv("group_prediction_matrix.csv")
    group_class_counts.to_csv("group_class_counts.csv")
    print("\nGroup Prediction Class Counts:")
    print(group_class_counts)

# Combine cluster predictions into a DataFrame and calculate class counts
if cluster_predictions:
    cluster_matrix = pd.DataFrame(cluster_predictions)
    cluster_matrix.index.name = 'SampleID'
    cluster_class_counts = cluster_matrix.apply(pd.Series.value_counts, axis=1).fillna(0).astype(int)
    cluster_matrix.to_csv("cluster_prediction_matrix.csv")
    cluster_class_counts.to_csv("cluster_class_counts.csv")
    print("\nCluster Prediction Class Counts:")
    print(cluster_class_counts)

# Optional: calculate correlation if both group and cluster counts are available
if 'group_class_counts' in locals() and 'cluster_class_counts' in locals():
    combined_counts = group_class_counts.add(cluster_class_counts, fill_value=0)
    stability_correlation = combined_counts.corr().iloc[0, 1]
    print("\nStability Correlation between Group and Cluster Predictions:", stability_correlation)


Group Prediction Class Counts:
          mutant  mutated  reference
SampleID                            
0              0        0          4
1              0        0          4
2              1        0          3
3              0        0          4
4              0        0          4
...          ...      ...        ...
561            0        0          2
562            0        0          2
563            0        0          2
564            0        0          2
565            0        0          2

[566 rows x 3 columns]

Cluster Prediction Class Counts:
          0.0    1.0    2.0    3.0    4.0    5.0    6.0    18.0   28.0   \
SampleID                                                                  
0             3      0      0      0      0      0      0      0      0   
1             1      0      1      0      0      0      1      0      0   
2             3      0      0      0      0      0      0      0      0   
3             2      0      0      0      1      0    

In [8]:
import pandas as pd
from scipy.stats import pearsonr

# Use the code from the previous artifact to get the prediction matrices

# A. How many models predict each class label, for each sample?
group_class_counts = group_class_counts.T
print("\nNumber of models predicting each class label per sample:")
print(group_class_counts)

# B. How many models predict the same cluster, for each sample? 
cluster_class_counts = cluster_class_counts.T 
print("\nNumber of models predicting the same cluster per sample:")
print(cluster_class_counts)

# C. Does the stability of the cluster and class label prediction correlate?
if 'group_class_counts' in locals() and 'cluster_class_counts' in locals():
    # Combine the class and cluster counts into a single DataFrame
    combined_counts = pd.concat([group_class_counts, cluster_class_counts], axis=1)
    
    # Calculate the Pearson correlation coefficient
    stability_correlation, p_value = pearsonr(combined_counts.sum(axis=1), combined_counts.sum(axis=1, numeric_only=True))
    
    # Apply Bonferroni correction for multiple tests
    alpha = 0.05
    corrected_p_value = p_value * 2  # Bonferroni correction for 2 tests
    
    print(f"\nStability Correlation between Group and Cluster Predictions: {stability_correlation:.2f}")
    print(f"Corrected p-value: {corrected_p_value:.4f}")
    
    if corrected_p_value < alpha:
        print("The stability of the cluster and class label prediction is significantly correlated.")
    else:
        print("The stability of the cluster and class label prediction is not significantly correlated.")


Number of models predicting each class label per sample:
SampleID   0    1    2    3    4    5    6    7    8    9    ...  556  557  \
mutant       0    0    1    0    0    0    1    0    0    0  ...    0    0   
mutated      0    0    0    0    0    0    0    0    0    0  ...    0    0   
reference    4    4    3    4    4    4    3    4    4    4  ...    2    2   

SampleID   558  559  560  561  562  563  564  565  
mutant       0    0    0    0    0    0    0    0  
mutated      0    0    0    0    0    0    0    0  
reference    2    2    2    2    2    2    2    2  

[3 rows x 566 columns]

Number of models predicting the same cluster per sample:
SampleID  0    1    2    3    4    5    6    7    8    9    ...  556  557  \
0.0         3    1    3    2    2    3    2    3    3    1  ...    0    0   
1.0         0    0    0    0    0    0    0    0    0    1  ...    0    0   
2.0         0    1    0    0    0    0    0    0    0    0  ...    0    0   
3.0         0    0    0    0   

# Part 4

In [4]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# Convert non-numeric columns to numeric
expression_data = expression_data.select_dtypes(include=['number']).copy()

# Create sample to group mapping
sample_to_group = {}
for _, row in metadata.iterrows():
    sample_name = row['refinebio_accession_code']
    if row['refinebio_title'] == "Danio rerio":
        sample_to_group[sample_name] = 'reference'
    else:
        sample_to_group[sample_name] = 'mutated'

# Function to train model with specified number of genes
def train_and_evaluate(n_genes):
    # Select the n most variable genes
    var_genes = expression_data.std(axis=1).sort_values(ascending=False).head(n_genes).index
    expression_subset = expression_data.loc[var_genes]
    
    # Prepare the data
    X = expression_subset.T
    y = [sample_to_group[sample] for sample in X.index]
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train and evaluate model
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    
    # Calculate accuracy
    accuracy = knn.score(X_test, y_test)
    
    # Calculate AUC
    y_pred_proba = knn.predict_proba(X_test)
    y_test_binary = [1 if label == 'mutated' else 0 for label in y_test]
    auc = roc_auc_score(y_test_binary, y_pred_proba[:, 1])
    
    # Save predictions
    y_pred = knn.predict(X_test)
    predictions_df = pd.DataFrame({
        'predicted': y_pred,
        'actual': y_test,
        'prob_mutated': y_pred_proba[:, 1]
    }, index=X_test.index)
    predictions_df.to_csv(f'predictions_{n_genes}_genes.csv')
    
    return accuracy, auc

# Test different numbers of genes
gene_numbers = [10, 100, 1000, 10000]
results = []

for n_genes in gene_numbers:
    accuracy, auc = train_and_evaluate(n_genes)
    results.append({
        'n_genes': n_genes,
        'accuracy': accuracy,
        'auc': 1 - auc
    })

# Create results DataFrame
results_df = pd.DataFrame(results)
results_df.to_csv('gene_number_results.csv')

# Plot the results
plt.figure(figsize=(10, 6))
plt.plot(results_df['n_genes'], results_df['auc'], marker='o', label='AUC')
plt.plot(results_df['n_genes'], results_df['accuracy'], marker='s', label='Accuracy')
plt.xscale('log')
plt.xlabel('Number of Genes')
plt.ylabel('Score')
plt.title('Model Performance vs Number of Genes')
plt.legend()
plt.grid(True)
plt.savefig('performance_vs_genes.png')
plt.close()

# Print results
print("\nResults Summary:")
print(results_df.to_string(index=False))

# Calculate performance changes
print("\nPerformance Changes:")
for i in range(1, len(gene_numbers)):
    prev_genes = gene_numbers[i-1]
    curr_genes = gene_numbers[i]
    auc_change = results_df.iloc[i]['auc'] - results_df.iloc[i-1]['auc']
    print(f"\nFrom {prev_genes} to {curr_genes} genes:")
    print(f"AUC change: {auc_change:.4f}")


Results Summary:
 n_genes  accuracy      auc
      10  0.970899 0.990760
     100  0.986772 0.998352
    1000  0.989418 0.999058
   10000  0.989418 0.999058

Performance Changes:

From 10 to 100 genes:
AUC change: 0.0076

From 100 to 1000 genes:
AUC change: 0.0007

From 1000 to 10000 genes:
AUC change: 0.0000


In [13]:
# Part 4 for everyones models combined

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Load data
expression_data = pd.read_csv('ERP009868.tsv', sep='\t', index_col=0)
metadata = pd.read_csv('metadata_ERP009868.tsv', sep='\t')

# Create mutation status column if it doesn't exist
if 'mutation_status' not in metadata.columns:
    metadata['mutation_status'] = metadata['refinebio_title'].apply(lambda x: 'reference' if x == 'Danio rerio' else 'mutant')

# Define models including Adam's Random Forest
naive_bayes_model = GaussianNB()
knn_model = KNeighborsClassifier(n_neighbors=5)
logistic_model = LogisticRegression(random_state=42, max_iter=1000)
rf_model = RandomForestClassifier(random_state=42)

# Store AUC scores
auc_scores = {
    'Naive Bayes': {},
    'KNN': {},
    'Logistic Regression': {},
    'Random Forest': {}
}

# Helper function to retrain and evaluate models
def retrain_and_evaluate(model, model_name, gene_count):
    top_genes = expression_data.var(axis=0).nlargest(gene_count).index
    subset_data = expression_data[top_genes].T
    subset_data.index.name = 'refinebio_accession_code'
    merged_data = subset_data.merge(metadata[['refinebio_accession_code', 'mutation_status']],
                                    left_index=True,
                                    right_on='refinebio_accession_code', how='inner')

    if merged_data.empty:
        print(f"No matching samples for gene count {gene_count}.")
        return

    X = merged_data.drop(columns=['refinebio_accession_code', 'mutation_status'])
    y = merged_data['mutation_status'].apply(lambda x: 1 if x == 'mutant' else 0)

    if (y.value_counts() < 2).any():
        print(f"Insufficient samples for {gene_count} genes. Skipping.")
        return

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
    model.fit(X_train, y_train)
    
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else model.decision_function(X_test)
    auc_score = roc_auc_score(y_test, y_pred_proba)
    auc_scores[model_name][gene_count] = auc_score
    print(f"{model_name} AUC for top {gene_count} genes: {auc_score}")

# Evaluate each model
gene_counts = [10, 100, 1000, 10000]
for n in gene_counts:
    print(f"\nEvaluating models with top {n} genes:")
    retrain_and_evaluate(naive_bayes_model, 'Naive Bayes', n)
    retrain_and_evaluate(knn_model, 'KNN', n)
    retrain_and_evaluate(logistic_model, 'Logistic Regression', n)
    retrain_and_evaluate(rf_model, 'Random Forest', n)

# Summarize AUC results
auc_summary = pd.DataFrame(auc_scores)
print("\nSummary of AUC Scores by Gene Count and Model:")
print(auc_summary)



Evaluating models with top 10 genes:
Insufficient samples for 10 genes. Skipping.
Insufficient samples for 10 genes. Skipping.
Insufficient samples for 10 genes. Skipping.
Insufficient samples for 10 genes. Skipping.

Evaluating models with top 100 genes:
Naive Bayes AUC for top 100 genes: 0.8571428571428572
KNN AUC for top 100 genes: 0.9813664596273293
Logistic Regression AUC for top 100 genes: 1.0
Random Forest AUC for top 100 genes: 1.0

Evaluating models with top 1000 genes:
Naive Bayes AUC for top 1000 genes: 0.5943396226415094
KNN AUC for top 1000 genes: 0.9947169811320755
Logistic Regression AUC for top 1000 genes: 0.991266846361186
Random Forest AUC for top 1000 genes: 1.0

Evaluating models with top 10000 genes:
Naive Bayes AUC for top 10000 genes: 0.5562368972746331
KNN AUC for top 10000 genes: 0.9973794549266247
Logistic Regression AUC for top 10000 genes: 1.0
Random Forest AUC for top 10000 genes: 1.0

Summary of AUC Scores by Gene Count and Model:
       Naive Bayes      

# Part 5

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the expression data
expression_data = pd.read_csv('ERP009868.tsv', sep='\t', index_col=0)

# Load the metadata
metadata = pd.read_csv('metadata_ERP009868.tsv', sep='\t')

# Create a dictionary to map sample names to their group
sample_to_group = {row['refinebio_accession_code']: 'reference' if row['refinebio_title'] == 'Danio rerio' else 'mutated' 
                   for _, row in metadata.iterrows()}

# Get the most variable genes for each model size
def get_variable_genes(data, n_genes):
    variances = data.var(axis=1).sort_values(ascending=False)
    return variances.head(n_genes).index.tolist()

# Collect genes from different model sizes
all_genes = []
for n_genes in [10, 100, 1000, 10000]:
    genes = get_variable_genes(expression_data, n_genes)
    all_genes.extend(genes)

# Remove duplicates while maintaining order
all_genes = list(dict.fromkeys(all_genes))

# Create expression matrix for selected genes
log2_expression = np.log2(expression_data.loc[all_genes] + 1)

# Create a color mapping
color_map = {'reference': '#2ecc71', 'mutated': '#e74c3c'}  # Green for reference, Red for mutated

# Create a color list for the columns
col_colors = [color_map[sample_to_group[sample]] for sample in log2_expression.columns]

# Create the main figure
plt.figure(figsize=(15, 12))

# Plot the heatmap with dendrograms and side color bar
g = sns.clustermap(log2_expression, 
                   cmap='RdBu_r',
                   col_colors=col_colors,
                   col_cluster=True,  # Enable column clustering
                   row_cluster=True,  # Enable row clustering
                   cbar_kws={'label': 'Log2 Expression'},
                   yticklabels=True,
                   xticklabels=True,
                   dendrogram_ratio=(.1, .2),
                   figsize=(15, 12))

# Rotate x-axis labels
plt.setp(g.ax_heatmap.get_xticklabels(), rotation=45, ha='right')

# Add a legend
legend_elements = [plt.Rectangle((0,0), 1, 1, facecolor=color_map[label], label=label.capitalize())
                  for label in color_map]
g.fig.legend(handles=legend_elements,
            title='Sample Groups',
            loc='center left',
            bbox_to_anchor=(1, 0.5))

# Set the title
g.fig.suptitle('Gene Expression Heatmap of Predictive Modeling Signatures', 
               fontsize=16, y=1.02)

# Add axis labels
g.ax_heatmap.set_xlabel('Samples', fontsize=12)
g.ax_heatmap.set_ylabel('Genes', fontsize=12)

# Adjust the layout and save the figure
plt.tight_layout()
plt.savefig('predictive_signatures_heatmap.png', dpi=300, bbox_inches='tight')
plt.close()

# Print summary statistics
print('\nHeatmap Generation Summary:')
print(f"Total number of genes: {len(all_genes)}")
print(f"Number of samples: {log2_expression.shape[1]}")
print("\nSample group counts:")
group_counts = pd.Series(sample_to_group.values()).value_counts()
for group, count in group_counts.items():
    print(f"{group}: {count}")




Heatmap Generation Summary:
Total number of genes: 10000
Number of samples: 1886

Sample group counts:
reference: 1766
mutated: 120


<Figure size 1500x1200 with 0 Axes>