# Part 1

In [11]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from scipy.stats import pearsonr
import scipy.cluster.hierarchy as sch
import numpy as np


In [2]:
# Load the TSV file
file_path = 'ERP009868.tsv'
expression_data = pd.read_csv(file_path, sep='\t', index_col=0)

# Load the metadata
metadata_path = 'metadata_ERP009868.tsv'
metadata = pd.read_csv(metadata_path, sep='\t')

# Part 2

In [3]:
# Part A)
# Subset to the 5,000 most variable genes
var_genes = expression_data.std(axis=1).sort_values(ascending=False).head(5000).index
expression_subset = expression_data.loc[var_genes]

# Part B)
# Use Random Forest to separate between mutations and normal

# Create a dictionary to map sample names to their group
sample_to_group = {}
for _, row in metadata.iterrows():
    sample_name = row['refinebio_accession_code']
    if row['refinebio_title'] == "Danio rerio":
        sample_to_group[sample_name] = 'reference'
    else:
        sample_to_group[sample_name] = 'mutated'

# Subset the expression data to the samples with known group labels
X = expression_data.T
y = [sample_to_group[sample] for sample in X.index]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the random forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate the model
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(conf_matrix)

results = pd.DataFrame({'Actual Group': y_test, 'Predicted Group': y_pred})
results.to_csv('rf_predictions.csv', index=False)

Accuracy: 0.99
Confusion Matrix:
[[ 21   3]
 [  0 354]]


In [4]:
# Part E)
# Run Gaussian Mixture Models from previous assignment
# Run Gaussian Mixture Models
scaler = MinMaxScaler()
expression_scaled = scaler.fit_transform(expression_subset.T)

gmm = GaussianMixture(n_components=2, random_state=42)
gmm.fit(expression_scaled)
gmm_labels = gmm.predict(expression_scaled)

# Split the data into training and testing sets
X = expression_subset.T
y = gmm_labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the random forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate the model
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(conf_matrix)

results = pd.DataFrame({'Actual Group': y_test, 'Predicted Group': y_pred})
results.to_csv('rf_cluster_predictions.csv', index=False)

Accuracy: 1.00
Confusion Matrix:
[[360   0]
 [  1  17]]


# Part 3

In [5]:
# Load each prediction file with its specific column name
def load_prediction(file_path, pred_col):
    df = pd.read_csv(file_path)
    if pred_col in df.columns:
        return df[pred_col]
    else:
        print(f"Warning: Column '{pred_col}' not found in {file_path}.")
        return None

# Group prediction columns based on actual column names
group_predictions = {
    "Naive Bayes (Group)": load_prediction("naive_bayes_predictions.csv", "Predicted"),
    "KNN (Group)": load_prediction("knn_predictions.csv", "predicted"),
    "Logistic Regression (Group)": load_prediction("logistic_predictions.csv", "Predicted_Label"),
    "Random Forest (Group)": load_prediction("rf_predictions.csv", "Predicted Group")
}

# Cluster prediction columns based on actual column names
cluster_predictions = {
    "Naive Bayes (Cluster)": load_prediction("naive_bayes_cluster_predictions.csv", "Predicted_Cluster"),
    "KNN (Cluster)": load_prediction("knn_cluster_predictions.csv", "predicted_cluster"),
    "Logistic Regression (Cluster)": load_prediction("logistic_cluster_predictions.csv", "Predicted_Label"),
    "Random Forest (Cluster)": load_prediction("rf_cluster_predictions.csv", "Predicted Group")
}

# Filter out any None values (i.e., files without the specified column)
group_predictions = {k: v for k, v in group_predictions.items() if v is not None}
cluster_predictions = {k: v for k, v in cluster_predictions.items() if v is not None}

# Combine group predictions into a DataFrame and calculate class counts
if group_predictions:
    group_matrix = pd.DataFrame(group_predictions)
    group_matrix.index.name = 'SampleID'
    group_class_counts = group_matrix.apply(pd.Series.value_counts, axis=1).fillna(0).astype(int)
    group_matrix.to_csv("group_prediction_matrix.csv")
    group_class_counts.to_csv("group_class_counts.csv")
    print("\nGroup Prediction Class Counts:")
    print(group_class_counts)

# Combine cluster predictions into a DataFrame and calculate class counts
if cluster_predictions:
    cluster_matrix = pd.DataFrame(cluster_predictions)
    cluster_matrix.index.name = 'SampleID'
    cluster_class_counts = cluster_matrix.apply(pd.Series.value_counts, axis=1).fillna(0).astype(int)
    cluster_matrix.to_csv("cluster_prediction_matrix.csv")
    cluster_class_counts.to_csv("cluster_class_counts.csv")
    print("\nCluster Prediction Class Counts:")
    print(cluster_class_counts)

# Optional: calculate correlation if both group and cluster counts are available
if 'group_class_counts' in locals() and 'cluster_class_counts' in locals():
    combined_counts = group_class_counts.add(cluster_class_counts, fill_value=0)
    stability_correlation = combined_counts.corr().iloc[0, 1]
    print("\nStability Correlation between Group and Cluster Predictions:", stability_correlation)


Group Prediction Class Counts:
          mutant  mutated  reference
SampleID                            
0              0        0          4
1              0        0          4
2              1        0          3
3              0        0          4
4              0        0          4
...          ...      ...        ...
561            0        0          2
562            0        0          2
563            0        0          2
564            0        0          2
565            0        0          2

[566 rows x 3 columns]

Cluster Prediction Class Counts:
          0.0    1.0    2.0    3.0    4.0    5.0    6.0    18.0   28.0   \
SampleID                                                                  
0             3      0      0      0      0      0      0      0      0   
1             1      0      1      0      0      0      1      0      0   
2             3      0      0      0      0      0      0      0      0   
3             2      0      0      0      1      0    

In [6]:
# Section a-c)
# A. How many models predict each class label, for each sample?
group_class_counts = group_class_counts.T
print("\nNumber of models predicting each class label per sample:")
print(group_class_counts)

# B. How many models predict the same cluster, for each sample? 
cluster_class_counts = cluster_class_counts.T 
print("\nNumber of models predicting the same cluster per sample:")
print(cluster_class_counts)

# C. Does the stability of the cluster and class label prediction correlate?
if 'group_class_counts' in locals() and 'cluster_class_counts' in locals():
    # Combine the class and cluster counts into a single DataFrame
    combined_counts = pd.concat([group_class_counts, cluster_class_counts], axis=1)
    
    # Calculate the Pearson correlation coefficient
    stability_correlation, p_value = pearsonr(combined_counts.sum(axis=1), combined_counts.sum(axis=1, numeric_only=True))
    
    # Apply Bonferroni correction for multiple tests
    alpha = 0.05
    corrected_p_value = p_value * 2  # Bonferroni correction for 2 tests
    
    print(f"\nStability Correlation between Group and Cluster Predictions: {stability_correlation:.2f}")
    print(f"Corrected p-value: {corrected_p_value:.4f}")
    
    if corrected_p_value < alpha:
        print("The stability of the cluster and class label prediction is significantly correlated.")
    else:
        print("The stability of the cluster and class label prediction is not significantly correlated.")


Number of models predicting each class label per sample:
SampleID   0    1    2    3    4    5    6    7    8    9    ...  556  557  \
mutant       0    0    1    0    0    0    1    0    0    0  ...    0    0   
mutated      0    0    0    0    0    0    0    0    0    0  ...    0    0   
reference    4    4    3    4    4    4    3    4    4    4  ...    2    2   

SampleID   558  559  560  561  562  563  564  565  
mutant       0    0    0    0    0    0    0    0  
mutated      0    0    0    0    0    0    0    0  
reference    2    2    2    2    2    2    2    2  

[3 rows x 566 columns]

Number of models predicting the same cluster per sample:
SampleID  0    1    2    3    4    5    6    7    8    9    ...  556  557  \
0.0         3    1    3    2    2    3    2    3    3    1  ...    0    0   
1.0         0    0    0    0    0    0    0    0    0    1  ...    0    0   
2.0         0    1    0    0    0    0    0    0    0    0  ...    0    0   
3.0         0    0    0    0   

# Part 4

In [8]:
import pandas as pd

# Retrain each predictive model using different numbers of genes
gene_counts = [10, 100, 1000, 10000]
auc_results = {}

for gene_count in gene_counts:
    print(f"Retraining models using {gene_count} genes...")
    
    # Subset the expression data to the top gene_count most variable genes
    var_genes = expression_data.std(axis=1).sort_values(ascending=False).head(gene_count).index
    expression_subset = expression_data.loc[var_genes]

    # Retrain the models
    group_predictions = {}
    cluster_predictions = {}

    # Naive Bayes
    naive_bayes = GaussianNB()
    naive_bayes.fit(X_train, y_train)
    group_predictions["Naive Bayes (Group)"] = naive_bayes.predict(X_test)
    cluster_predictions["Naive Bayes (Cluster)"] = naive_bayes.predict(expression_scaled)

    # KNN
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    group_predictions["KNN (Group)"] = knn.predict(X_test)
    cluster_predictions["KNN (Cluster)"] = knn.predict(expression_scaled)

    # Logistic Regression
    logreg = LogisticRegression(random_state=42)
    logreg.fit(X_train, y_train)
    group_predictions["Logistic Regression (Group)"] = logreg.predict(X_test)
    cluster_predictions["Logistic Regression (Cluster)"] = logreg.predict(expression_scaled)

    # Random Forest
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    group_predictions["Random Forest (Group)"] = rf.predict(X_test)
    cluster_predictions["Random Forest (Cluster)"] = rf.predict(expression_scaled)

    # Calculate AUC for each model
    group_auc = {}
    cluster_auc = {}

    for model, y_pred in group_predictions.items():
        group_auc[model] = roc_auc_score(y_test, y_pred)

    for model, y_pred in cluster_predictions.items():
        cluster_auc[model] = roc_auc_score(gmm_labels, y_pred)

    auc_results[gene_count] = {
        "Group": group_auc,
        "Cluster": cluster_auc
    }

# a. How did the number of genes affect the results?
print("\nAnalysis of how the number of genes affected the results:")
for gene_count, results in auc_results.items():
    print(f"Using {gene_count} genes:")
    print("Group Prediction AUC:")
    print(results["Group"])
    print("Cluster Prediction AUC:")
    print(results["Cluster"])
    print()

# b. What is the model performance (AUC) for each of the different versions of the model? 
# Does it increase or decrease as the number of genes included in the model changes?
print("\nSummary of model performance (AUC) for different gene counts:")
model_performance = pd.DataFrame(index=["Naive Bayes", "KNN", "Logistic Regression", "Random Forest"])
for gene_count, results in auc_results.items():
    model_performance[f"{gene_count} genes (Group)"] = [results["Group"][f"Naive Bayes (Group)"],
                                                       results["Group"][f"KNN (Group)"],
                                                       results["Group"][f"Logistic Regression (Group)"],
                                                       results["Group"][f"Random Forest (Group)"]]
    model_performance[f"{gene_count} genes (Cluster)"] = [results["Cluster"][f"Naive Bayes (Cluster)"],
                                                         results["Cluster"][f"KNN (Cluster)"],
                                                         results["Cluster"][f"Logistic Regression (Cluster)"],
                                                         results["Cluster"][f"Random Forest (Cluster)"]]

print(model_performance.round(2))

Retraining models using 10 genes...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Retraining models using 100 genes...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Retraining models using 1000 genes...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Retraining models using 10000 genes...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Analysis of how the number of genes affected the results:
Using 10 genes:
Group Prediction AUC:
{'Naive Bayes (Group)': np.float64(0.9791666666666667), 'KNN (Group)': np.float64(0.8875), 'Logistic Regression (Group)': np.float64(0.8583333333333334), 'Random Forest (Group)': np.float64(0.9722222222222222)}
Cluster Prediction AUC:
{'Naive Bayes (Cluster)': np.float64(0.5), 'KNN (Cluster)': np.float64(0.5), 'Logistic Regression (Cluster)': np.float64(0.8828053898270807), 'Random Forest (Cluster)': np.float64(0.5)}

Using 100 genes:
Group Prediction AUC:
{'Naive Bayes (Group)': np.float64(0.9791666666666667), 'KNN (Group)': np.float64(0.8875), 'Logistic Regression (Group)': np.float64(0.8583333333333334), 'Random Forest (Group)': np.float64(0.9722222222222222)}
Cluster Prediction AUC:
{'Naive Bayes (Cluster)': np.float64(0.5), 'KNN (Cluster)': np.float64(0.5), 'Logistic Regression (Cluster)': np.float64(0.8828053898270807), 'Random Forest (Cluster)': np.float64(0.5)}

Using 1000 genes:
Gr



# Part 5

In [12]:
# Load the expression data
expression_data = pd.read_csv('ERP009868.tsv', sep='\t', index_col=0)

# Load the metadata
metadata = pd.read_csv('metadata_ERP009868.tsv', sep='\t')

# Create a dictionary to map sample names to their group
sample_to_group = {row['refinebio_accession_code']: 'reference' if row['refinebio_title'] == 'Danio rerio' else 'mutated' 
                   for _, row in metadata.iterrows()}

# Get the most variable genes for each model size
def get_variable_genes(data, n_genes):
    variances = data.var(axis=1).sort_values(ascending=False)
    return variances.head(n_genes).index.tolist()

# Collect genes from different model sizes
all_genes = []
for n_genes in [10, 100, 1000, 10000]:
    genes = get_variable_genes(expression_data, n_genes)
    all_genes.extend(genes)

# Remove duplicates while maintaining order
all_genes = list(dict.fromkeys(all_genes))

# Create expression matrix for selected genes
log2_expression = np.log2(expression_data.loc[all_genes] + 1)

# Create a color mapping
color_map = {'reference': '#2ecc71', 'mutated': '#e74c3c'}  # Green for reference, Red for mutated

# Create a color list for the columns
col_colors = [color_map[sample_to_group[sample]] for sample in log2_expression.columns]

# Create the main figure
plt.figure(figsize=(15, 12))

# Plot the heatmap with dendrograms and side color bar
g = sns.clustermap(log2_expression, 
                   cmap='RdBu_r',
                   col_colors=col_colors,
                   col_cluster=True,  # Enable column clustering
                   row_cluster=True,  # Enable row clustering
                   cbar_kws={'label': 'Log2 Expression'},
                   yticklabels=True,
                   xticklabels=True,
                   dendrogram_ratio=(.1, .2),
                   figsize=(15, 12))

# Rotate x-axis labels
plt.setp(g.ax_heatmap.get_xticklabels(), rotation=45, ha='right')

# Add a legend
legend_elements = [plt.Rectangle((0,0), 1, 1, facecolor=color_map[label], label=label.capitalize())
                  for label in color_map]
g.fig.legend(handles=legend_elements,
            title='Sample Groups',
            loc='center left',
            bbox_to_anchor=(1, 0.5))

# Set the title
g.fig.suptitle('Gene Expression Heatmap of Predictive Modeling Signatures', 
               fontsize=16, y=1.02)

# Add axis labels
g.ax_heatmap.set_xlabel('Samples', fontsize=12)
g.ax_heatmap.set_ylabel('Genes', fontsize=12)

# Adjust the layout and save the figure
plt.tight_layout()
plt.savefig('predictive_signatures_heatmap.png', dpi=300, bbox_inches='tight')
plt.close()

# Print summary statistics
print('\nHeatmap Generation Summary:')
print(f"Total number of genes: {len(all_genes)}")
print(f"Number of samples: {log2_expression.shape[1]}")
print("\nSample group counts:")
group_counts = pd.Series(sample_to_group.values()).value_counts()
for group, count in group_counts.items():
    print(f"{group}: {count}")




Heatmap Generation Summary:
Total number of genes: 10000
Number of samples: 1886

Sample group counts:
reference: 1766
mutated: 120


<Figure size 1500x1200 with 0 Axes>