In [None]:
import numpy as np
import pandas as pd

# Read the multi-omics table from disk; the first CSV column is used as the
# row index of the resulting DataFrame.
file_name = 'lung_omics_data_for_clusters.csv'
multiomics_data = pd.read_csv(filepath_or_buffer=file_name, index_col=0)

# Quick sanity check of the load: show the leading five rows.
print(multiomics_data.head(5))

In [None]:
from sklearn.preprocessing import StandardScaler

# Create the scaler that is reused by the imputation cell further down.
scaler = StandardScaler()

# First standardization pass on the raw table: learn the per-column statistics,
# then apply them. This result is replaced once the data has been imputed
# (the next cell assigns `multiomics_data_scaled` again).
scaler.fit(multiomics_data)
multiomics_data_scaled = scaler.transform(multiomics_data)

In [None]:
from sklearn.impute import SimpleImputer

# Impute missing values with the per-column mean.
imputer = SimpleImputer(strategy='mean')

# SimpleImputer returns a bare NumPy array, so both the column names AND the
# row index must be restored explicitly. Without `index=` the frame falls back
# to a 0..n-1 RangeIndex and rows can no longer be matched to the identifiers
# read from the CSV's first column.
multiomics_data_imputed = pd.DataFrame(
    imputer.fit_transform(multiomics_data),
    index=multiomics_data.index,
    columns=multiomics_data.columns,
)

# Standardize the imputed data. `scaler` is the StandardScaler created in the
# previous cell; this overwrites that cell's pre-imputation result.
multiomics_data_scaled = scaler.fit_transform(multiomics_data_imputed)

In [None]:
# Autoencoder
# Step 1: Load the Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling (and therefore the 2-D embedding) are repeatable between runs.
set_random_seed(42)

# Step 2: Load and Preprocess Your Data
# A later cell adds a 'Cluster' column to `multiomics_data_imputed` in place.
# Drop it if present so that re-running this cell never feeds previously
# assigned cluster labels into the autoencoder as a feature.
data = multiomics_data_imputed.drop(columns=['Cluster'], errors='ignore')

# Standardize the data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Step 3-5: Build the Autoencoder, Train it, and Extract Encoded Features
input_dim = data_scaled.shape[1]
encoding_dim = 2  # two latent dimensions so the embedding can be plotted directly

input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)
# The reconstruction target is standardized data, which contains negative
# values and values above 1. A sigmoid output is confined to (0, 1) and cannot
# reproduce them, so the output layer must be linear.
decoded = Dense(input_dim, activation='linear')(encoded)

autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
autoencoder.fit(
    data_scaled,
    data_scaled,
    epochs=100,
    batch_size=32,
    shuffle=True,
    validation_split=0.2,
)

# The encoder shares the trained layers and outputs only the bottleneck values.
encoder = Model(input_layer, encoded)
encoded_features = encoder.predict(data_scaled)

# Step 6: Perform Clustering
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the clustering stable across environments.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
clusters = kmeans.fit_predict(encoded_features)

# Visualize the results
plt.scatter(encoded_features[:, 0], encoded_features[:, 1], c=clusters, cmap='viridis')
plt.title('Autoencoder Clustering')
plt.xlabel('Encoded dimension 1')
plt.ylabel('Encoded dimension 2')
plt.show()

In [None]:
from sklearn.cluster import KMeans

# The best clusters=5
k = 5

# Perform clustering on the standardized (imputed) feature matrix.
# The estimator is created once (the original built an identical KMeans twice).
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the assignments stable across environments.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(multiomics_data_scaled)

# Add cluster labels to the original data.
# NOTE: this mutates `multiomics_data_imputed` in place; any cell that treats
# that frame as pure feature data must exclude the 'Cluster' column.
multiomics_data_imputed['Cluster'] = cluster_labels

In [None]:
import pandas as pd

# Load the data (first CSV column becomes the row index).
data = pd.read_csv('mRNA.csv', index_col=0)

# Every column except 'Clusters' is treated as an expression column. Selecting
# by name avoids relying on 'Clusters' being the first column.
expression_cols = data.columns.drop('Clusters')

# Convert expression values to numeric; unparseable entries become NaN.
# Assigning whole columns (rather than through .iloc) lets the dtype change.
data[expression_cols] = data[expression_cols].apply(pd.to_numeric, errors='coerce')

# Mean expression of each gene within each cluster (NaNs are skipped).
cluster_means = data.groupby('Clusters')[expression_cols].mean()

# Group by 'Clusters' and find the top ten expressed genes in each cluster.
top_genes_by_cluster = pd.Series(
    {
        cluster: gene_means.nlargest(10).index.tolist()
        for cluster, gene_means in cluster_means.iterrows()
    },
    dtype=object,
).rename_axis('Clusters')

# Organize the results in a dictionary.
# Series.iteritems() was removed in pandas 2.0; .items() is the replacement.
result_dict = dict(top_genes_by_cluster.items())

# Display the result
for cluster, top_genes in result_dict.items():
    print(f"Cluster {cluster}: {', '.join(map(str, top_genes))}")

Using g:Profiler to convert gene IDs to gene names

In [None]:
import pandas as pd

# Load the data (first CSV column becomes the row index).
protein_data = pd.read_csv('protein.csv', index_col=0)

# Every column except 'Clusters' is treated as an expression column. Selecting
# by name avoids relying on 'Clusters' being the first column.
protein_cols = protein_data.columns.drop('Clusters')

# Convert expression values to numeric; unparseable entries become NaN.
# Assigning whole columns (rather than through .iloc) lets the dtype change.
protein_data[protein_cols] = protein_data[protein_cols].apply(pd.to_numeric, errors='coerce')

# Mean expression of each protein within each cluster (NaNs are skipped).
protein_cluster_means = protein_data.groupby('Clusters')[protein_cols].mean()

# Group by 'Clusters' and find the top ten expressed proteins in each cluster.
top_proteins_by_cluster = pd.Series(
    {
        cluster: protein_means.nlargest(10).index.tolist()
        for cluster, protein_means in protein_cluster_means.iterrows()
    },
    dtype=object,
).rename_axis('Clusters')

# Organize the results in a dictionary.
# Series.iteritems() was removed in pandas 2.0; .items() is the replacement.
result_dict = dict(top_proteins_by_cluster.items())

# Display the result
for cluster, top_proteins in result_dict.items():
    print(f"Cluster {cluster}: {', '.join(map(str, top_proteins))}")

In [None]:
import pandas as pd

# Read the per-sample table; the first CSV column is used as the row index.
# The table is expected to carry a 'Clusters' column, which is looked up below.
labels_data = pd.read_csv('lung_omics_data_by_cluster.csv', index_col=0)

# Lookup table from cluster identifier to the label assigned to that cluster.
cluster_label_mapping = {
    'C1': 'LUSC',
    'C2': 'LUAD',
    'C3': 'LUAD',
    'C4': 'LUSC',
    'C5': 'LUSC',
}

# Attach a 'label' column by translating each row's 'Clusters' value through
# the lookup table (identifiers missing from the table map to NaN).
labels_data = labels_data.assign(
    label=labels_data['Clusters'].map(cluster_label_mapping)
)

# Show the leading five rows of the labelled table.
print(labels_data.head(5))

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Use the labelled table built earlier in the notebook.
data = labels_data

# Separate features and labels. 'Clusters' is dropped from the features because
# 'label' is derived directly from it.
features = data.drop(['label', 'Clusters'], axis=1)
labels = data['label']

# Fit the encoder on the classification target so that `classes_` holds exactly
# the class names that appear in the confusion matrices. The original fitted it
# on 'Clusters', which yielded cluster IDs as tick labels for a matrix over
# 'label', and also overwrote data['Clusters'], mutating `labels_data` through
# the alias above. Neither side effect is needed here.
label_encoder = LabelEncoder()
label_encoder.fit(labels)
class_names = label_encoder.classes_

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# Classifiers (seeded where the estimator is stochastic, for repeatable plots)
classifiers = {
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'FFNN': MLPClassifier(random_state=42)
}

# Plot heatmaps in a 3x2 grid: one row per classifier, one column per scenario.
fig, axes = plt.subplots(3, 2, figsize=(15, 15))

for (label, classifier), ax_row in zip(classifiers.items(), axes):
    for scenario, ax in zip(['Without Holdout', 'With Holdout'], ax_row):
        if scenario == 'Without Holdout':
            # Fit and evaluate on the full dataset (training-set performance).
            classifier.fit(features, labels)
            predictions = classifier.predict(features)
            y_true = labels
            cmap = 'Blues'
        else:
            # Fit on the training split, evaluate on the held-out split.
            classifier.fit(X_train, y_train)
            predictions = classifier.predict(X_test)
            y_true = y_test
            cmap = 'Greens'

        # Passing `labels=` fixes the row/column order and keeps the matrix the
        # same shape even if a class is absent from a particular split, so it
        # always lines up with the tick labels.
        conf_matrix = confusion_matrix(y_true, predictions, labels=class_names)
        accuracy = accuracy_score(y_true, predictions)

        # Plot heatmap
        sns.heatmap(
            conf_matrix,
            annot=True,
            fmt='d',
            cmap=cmap,
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax,
        )
        ax.set_xlabel('Predicted label')
        ax.set_ylabel('True label')
        ax.set_title(f'{label} - {scenario}\nAccuracy: {accuracy:.2%}')

plt.tight_layout()
plt.show()