In [None]:
import pandas as pd
import seaborn as sns
import os
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.express as px

# Run configuration. N, K and `lags` are only used here to pick which
# precomputed feature table is loaded (they are interpolated into file names).
N, K, lags = 10, 50, True

# Folder holding the feature CSV; the HTML export at the end of the notebook
# is written to the same folder. Set the LNCRNA_FEATURES_DIR environment
# variable to run the notebook on another machine; the original
# machine-specific location is kept as the default so existing runs behave
# exactly as before.
directory_path = os.environ.get("LNCRNA_FEATURES_DIR", r"C:\Users\simeo\Documents")

# File name pattern: final_features_N<N>_K<K>[_Corr].csv
# (the "_Corr" suffix is appended only when `lags` is truthy).
output_path = os.path.join(directory_path, f'final_features_N{N}_K{K}{"_Corr" if lags else ""}.csv')

# The first CSV column becomes the row index; later cells match that index
# against transcript IDs to highlight specific rows.
all_features_df = pd.read_csv(output_path, index_col=0)

In [None]:
# Fit the PCA inside this cell so it works on a fresh kernel. The cell used to
# read `pca_transformed_data`, which is only assigned by cells further down,
# so "Restart & Run All" failed here with a NameError.
# Preprocessing mirrors the full-feature PCA cell later in the notebook:
# missing values -> 0, then per-column standardisation.
standardized_data = StandardScaler().fit_transform(all_features_df.fillna(0))
pca_transformed_data = PCA().fit_transform(standardized_data)

# One column per principal component (PC1, PC2, ...), rows keyed like the
# feature table so points can be traced back to their original index label.
pc_names = [f"PC{i+1}" for i in range(pca_transformed_data.shape[1])]
lncRNA_df_pca = pd.DataFrame(pca_transformed_data, columns=pc_names, index=all_features_df.index)

# Min-max normalise PC4 to [0, 1] for colouring. If PC4 has no spread, fall
# back to all zeros instead of dividing by zero (which would yield NaN colours).
pc4 = lncRNA_df_pca['PC4']
pc4_span = pc4.max() - pc4.min()
normalized_pc4 = (pc4 - pc4.min()) / pc4_span if pc4_span > 0 else pc4 * 0.0

# Interactive 3D scatter of the first three components, coloured by PC4
fig = px.scatter_3d(
    lncRNA_df_pca,
    x='PC1',
    y='PC2',
    z='PC3',
    color=normalized_pc4,
    color_continuous_scale='Viridis',
    title='Interactive 3D Scatter Plot of PCA Components',
    labels={'PC1': 'PC1', 'PC2': 'PC2', 'PC3': 'PC3', 'color': 'PC4'},
    opacity=0.8
)

# Explicit axis titles and a square canvas
fig.update_layout(
    scene=dict(
        xaxis_title='PC1',
        yaxis_title='PC2',
        zaxis_title='PC3'
    ),
    width=800,
    height=800
)

# Show the plot
fig.show()

In [None]:
# Split the feature columns into three families by substring match on the
# column name. A column may land in more than one family if its name contains
# keywords from several lists.
group1_columns = [col for col in all_features_df.columns if any(keyword in col for keyword in ['proportion', 'mean', 'variance', 'skew'])]
group2_columns = [col for col in all_features_df.columns if any(keyword in col for keyword in ['Frequency', 'Magnitude'])]
group3_columns = [col for col in all_features_df.columns if '_x_' in col]

# Display name -> list of columns belonging to that family
groups = {
    "Group 1 (Proportion, Mean, Variance, Skew)": group1_columns,
    "Group 2 (Frequency, Magnitude)": group2_columns,
    "Group 3 (Correlation)": group3_columns,
}

# Scaler and PCA objects are re-fitted from scratch for every group
scaler = StandardScaler()
pca = PCA()

# Upper bound on how many leading components are plotted per group
max_pcs_shown = 4

for group_name, columns in groups.items():
    # A keyword that matches nothing would make the scaler fail on a
    # zero-column frame; report it and move on to the next group.
    if not columns:
        print(f"Skipping {group_name}: no matching feature columns")
        continue

    # Missing values -> 0, then per-column standardisation for this group only
    group_standardized = scaler.fit_transform(all_features_df[columns].fillna(0))

    # Only the fitted attributes are needed here. The projected scores are not
    # stored, so this loop no longer overwrites the global
    # `pca_transformed_data` that other cells use for the full-feature PCA.
    pca.fit(group_standardized)

    # A group with fewer than `max_pcs_shown` features (or samples) yields
    # fewer components; plot only what exists instead of failing on a
    # length mismatch.
    n_shown = min(max_pcs_shown, pca.n_components_)
    pc_positions = range(1, n_shown + 1)
    pc_labels = [f"PC{i}" for i in pc_positions]

    # Scree plot
    plt.figure(figsize=(10, 6))
    plt.bar(pc_positions, pca.explained_variance_ratio_[:n_shown], alpha=0.7, color='blue')
    plt.title(f'Scree Plot for {group_name}')
    plt.xlabel('Principal Component')
    plt.ylabel('Variance Explained')
    plt.xticks(pc_positions)
    plt.show()

    # PC loadings plot: rows = features, columns = components (after transpose)
    pca_components_df = pd.DataFrame(pca.components_[:n_shown], columns=columns, index=pc_labels)
    plt.figure(figsize=(12, 8))
    sns.heatmap(pca_components_df.T, cmap='coolwarm', annot=False, cbar=True)
    plt.title(f'PC Loadings for {group_name}')
    plt.xlabel('Principal Components')
    plt.ylabel('Features')
    plt.show()

In [None]:
# Full-feature PCA: missing values are replaced by 0, every column is
# standardised, and an unrestricted PCA is fitted on the result.
scaler = StandardScaler()
standardized_data = scaler.fit_transform(all_features_df.fillna(0))

pca = PCA()
pca_transformed_data = pca.fit_transform(standardized_data)

# Share of total variance carried by each of the four leading components,
# and their combined share.
explained_variance_ratio = pca.explained_variance_ratio_[:4]
sum_first_4_pcs = explained_variance_ratio.sum()

# Bar chart with one bar per retained component (x positions start at 1).
# Figure/axes get their own names so the plotly `fig` used elsewhere is untouched.
component_numbers = range(1, len(explained_variance_ratio) + 1)
variance_fig, variance_ax = plt.subplots(figsize=(10, 6))
variance_ax.bar(component_numbers, explained_variance_ratio, color='skyblue')
variance_ax.set(
    title='PCA Explained Variance Ratio (First 4 PCs)',
    xlabel='Principal Component',
    ylabel='Explained Variance Ratio',
)
variance_ax.grid()
plt.show()

print(f"Sum of first 4 PCs: {sum_first_4_pcs:.2f}")

In [None]:
# PCA scores as a frame: columns PC1..PCn, rows keyed like the feature table
pc_column_names = [f"PC{i+1}" for i in range(pca_transformed_data.shape[1])]
lncRNA_df_pca = pd.DataFrame(pca_transformed_data, columns=pc_column_names)
lncRNA_df_pca.index = all_features_df.index

# Index labels to highlight, one list per category
Cis_Repressive = ["ENST00000626439.2", "ENST00000424094.6", "ENST00000501176.7",
"ENST00000422420.3", "ENST00000597346.1", "ENST00000447911.6", "ENSMUST00000159731.1"]

Cis_Activating  = ["ENST00000417473.7", "ENST00000434063.3", "ENST00000521028.4",
"ENST00000417262.5", "ENST00000630918.1", "ENST00000524165.7"]

H19 = ["ENST00000414790.10", "ENSMUST00000136359.7"]

Malat1 = ["ENST00000534336.4", "ENSMUST00000172812.3"]

Xist = ["ENST00000429829.6", "ENSMUST00000127786.3"]

# Tag every row with its category; anything not listed stays 'Other'.
# Labels are applied in this order, so a later category would win if an ID
# ever appeared in two lists.
highlight_groups = {
    'Cis-Repressive': Cis_Repressive,
    'Cis-Activating': Cis_Activating,
    'H19': H19,
    'Malat1': Malat1,
    'Xist': Xist,
}
lncRNA_df_pca['Highlight'] = 'Other'
for highlight_label, transcript_ids in highlight_groups.items():
    in_group = lncRNA_df_pca.index.isin(transcript_ids)
    lncRNA_df_pca.loc[in_group, 'Highlight'] = highlight_label

# Background layer: un-highlighted rows, shaded on a gray scale by their
# min-max normalised PC4 value (normalised within the 'Other' rows only)
is_other = lncRNA_df_pca['Highlight'] == 'Other'
other_points = lncRNA_df_pca[is_other]
other_pc4 = other_points['PC4']
grayscale_colors = (other_pc4 - other_pc4.min()) / (other_pc4.max() - other_pc4.min())
fig = px.scatter_3d(
    other_points,
    x='PC1',
    y='PC2',
    z='PC3',
    color=grayscale_colors,
    color_continuous_scale='gray',
    opacity=0.8
)

# Foreground layer: highlighted rows, one discrete colour per category,
# with the row's index label shown as the hover title
highlighted_points = lncRNA_df_pca[~is_other]
category_colors = {
    'Cis-Repressive': 'red',
    'Cis-Activating': 'green',
    'H19': 'yellow',
    'Malat1': 'blue',
    'Xist': 'purple',
}
highlighted_fig = px.scatter_3d(
    highlighted_points,
    x='PC1',
    y='PC2',
    z='PC3',
    color='Highlight',
    hover_name=highlighted_points.index,
    color_discrete_map=category_colors,
    opacity=0.8
)

# Merge the foreground traces into the background figure
fig.add_traces(list(highlighted_fig.data))

# Horizontal legend centred below the plot, labelled axes, square canvas
fig.update_layout(
    legend={
        "orientation": "h",
        "x": 0.5,
        "xanchor": "center",
        "y": -0.1,
    },
    scene={
        "xaxis_title": 'PC1',
        "yaxis_title": 'PC2',
        "zaxis_title": 'PC3',
    },
    width=800,
    height=800
)

# Show the plot
fig.show()

In [None]:
import plotly.io as pio

# Build the export path next to the input data, reusing the N / K / "_Corr"
# naming scheme of the feature CSV.
# NOTE(review): the file name starts with "tsne", but the figures built in
# this notebook are PCA scatters; confirm whether the prefix is intentional
# before renaming, since downstream consumers may rely on it.
html_file_name = f'tsne_interactive_plot_N{N}_K{K}{"_Corr" if lags else ""}.html'
html_output_path = os.path.join(directory_path, html_file_name)

# Write the current plotly figure as an HTML file without opening a browser tab
pio.write_html(fig, auto_open=False, file=html_output_path)

print(f"Interactive plot saved to {html_output_path}")