In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc
from scipy.stats import ttest_ind
import statistics
import os 


In [None]:

# Purpose of this cell: for every endophenotype PRS output (.best file), merge
# the scores with the case/control labels, compute a ROC curve + AUC, and save
# all curves in one figure (endophenotypes_AUC.pdf).

# Define the paths to the .best files
prs_dir = '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/Post_PRS/PRS_output'
files = {
    'IOP>22': os.path.join(prs_dir, 'IOP_over_22mmhg.best'),
    'VCDR>0.6': os.path.join(prs_dir, 'VCDRregressed_over_0_6.best'),
    'RNFL<23.47': os.path.join(prs_dir, 'rnfl_under_23_47.best'),
    # NOTE(review): the label says 69.11 but the file name says 69_1 -- confirm the threshold.
    'GCL<69.11': os.path.join(prs_dir, 'GCL_under_69_1.best'),
}

# Load case control file (whitespace-delimited, first row is the header)
df_cc = pd.read_table(
    '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/16_02_new_case_control_and_cov_file/new_case_control.txt',
    sep=r'\s+', header=0, encoding='ascii', engine='python',
)

# Per-file results collected by the loop below
roc_data = []
decile_data = []

# Process each .best file
for label, file in files.items():
    # Load the .best file
    df_prs = pd.read_table(file, sep=r'\s+', header=0, encoding='ascii', engine='python')

    # Keep rows flagged In_Regression == 'Yes' and make PRS numeric.
    # The conversion must happen BEFORE dropna: errors='coerce' turns
    # unparseable values into NaN, and those rows have to be discarded too,
    # otherwise roc_curve() below fails on NaN scores. Using .assign() on the
    # filtered frame also avoids assigning into a slice of df_prs.
    df_prs_filtered = (
        df_prs.loc[df_prs['In_Regression'] == 'Yes']
        .assign(PRS=lambda frame: pd.to_numeric(frame['PRS'], errors='coerce'))
        .dropna(subset=['PRS'])
    )

    # Merge with case control data (inner join on the sample identifiers)
    merged_df = pd.merge(df_prs_filtered, df_cc, on=['FID', 'IID'])

    # Standardize the PRS data (zero mean, unit variance within this file)
    scaler = StandardScaler()
    merged_df['PRS_standardized'] = scaler.fit_transform(merged_df[['PRS']])

    # Calculate PRS deciles (integer codes 0-9)
    merged_df['PRS_decile'] = pd.qcut(merged_df['PRS_standardized'], q=10, labels=False)

    # Add a column to indicate the source of the data
    merged_df['Source'] = label

    # Calculate ROC curve: standardized PRS is the score, Phenotypes the label
    X = merged_df['PRS_standardized'].values
    y = merged_df['Phenotypes']
    fpr, tpr, thresholds = roc_curve(y, X)
    roc_auc = auc(fpr, tpr)

    # Save ROC data for later plotting
    roc_data.append({'label': label, 'fpr': fpr, 'tpr': tpr, 'roc_auc': roc_auc})

    # Save decile data: counts per (decile, phenotype) and row-wise proportions
    decile_counts = merged_df.groupby(['PRS_decile', 'Phenotypes']).size().unstack(fill_value=0)
    decile_proportions = decile_counts.div(decile_counts.sum(axis=1), axis=0)
    decile_data.append({'label': label, 'decile_counts': decile_counts, 'decile_proportions': decile_proportions})

# Plot ROC curves for each source on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(8, 8))

# Use different markers for better visibility
markers = ['o', 's', '^', 'd']
colors = ['#4169E1', 'orange', '#228B22', '#DC143C']

for i, data in enumerate(roc_data):
    label = data['label']
    fpr = data['fpr']
    tpr = data['tpr']
    roc_auc = data['roc_auc']

    # marker= was previously missing, so markersize/markevery had no effect.
    # The modulo keeps the loop working if more files than styles are listed.
    ax.plot(
        fpr, tpr,
        lw=1,
        marker=markers[i % len(markers)],
        markersize=6,
        markevery=0.1,
        color=colors[i % len(colors)],
        label=f'{label} = {roc_auc:.2f}',
    )

# Chance-level diagonal for reference
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate (FPR)', fontsize=12)
ax.set_ylabel('True Positive Rate (TPR)', fontsize=12)
ax.legend(loc='lower right')
fig.tight_layout()

# Save the figure
folder_path = '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/Graph_Images/facet_plots/'
file_path = os.path.join(folder_path, 'endophenotypes_AUC.pdf')

# Ensure the folder exists and save the figure
os.makedirs(folder_path, exist_ok=True)
fig.savefig(file_path)

# Close explicitly so the figure is neither displayed inline nor kept in memory
plt.close(fig)



In [None]:

# Purpose of this cell: for every endophenotype PRS output (.best file), merge
# the scores with the case/control labels, bin the standardized PRS into
# deciles, and draw a 2x2 grid of stacked case/control proportion bars.

# Define the paths to the .best files
prs_dir = '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/Post_PRS/PRS_output'
files = {
    'IOP': os.path.join(prs_dir, 'IOP_over_22mmhg.best'),
    'VCDR': os.path.join(prs_dir, 'VCDRregressed_over_0_6.best'),
    'RNFL': os.path.join(prs_dir, 'rnfl_under_23_47.best'),
    'GCL': os.path.join(prs_dir, 'GCL_under_69_1.best'),
}

# Load case control file (whitespace-delimited, first row is the header)
df_cc = pd.read_table(
    '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/16_02_new_case_control_and_cov_file/new_case_control.txt',
    sep=r'\s+', header=0, encoding='ascii', engine='python',
)

# Per-file results collected by the loop below
roc_data = []
decile_data = []

# Process each .best file
for label, file in files.items():
    # Load the .best file
    df_prs = pd.read_table(file, sep=r'\s+', header=0, encoding='ascii', engine='python')

    # Keep rows flagged In_Regression == 'Yes' and make PRS numeric.
    # The conversion must happen BEFORE dropna: errors='coerce' turns
    # unparseable values into NaN, and those rows have to be discarded too,
    # otherwise roc_curve() below fails on NaN scores. Using .assign() on the
    # filtered frame also avoids assigning into a slice of df_prs.
    df_prs_filtered = (
        df_prs.loc[df_prs['In_Regression'] == 'Yes']
        .assign(PRS=lambda frame: pd.to_numeric(frame['PRS'], errors='coerce'))
        .dropna(subset=['PRS'])
    )

    # Merge with case control data (inner join on the sample identifiers)
    merged_df = pd.merge(df_prs_filtered, df_cc, on=['FID', 'IID'])

    # Standardize the PRS data (zero mean, unit variance within this file)
    scaler = StandardScaler()
    merged_df['PRS_standardized'] = scaler.fit_transform(merged_df[['PRS']])

    # Calculate PRS deciles (integer codes 0-9)
    merged_df['PRS_decile'] = pd.qcut(merged_df['PRS_standardized'], q=10, labels=False)

    # Add a column to indicate the source of the data
    merged_df['Source'] = label

    # Calculate ROC curve: standardized PRS is the score, Phenotypes the label
    X = merged_df['PRS_standardized'].values
    y = merged_df['Phenotypes']
    fpr, tpr, thresholds = roc_curve(y, X)
    roc_auc = auc(fpr, tpr)

    # Save ROC data for later plotting
    roc_data.append({'label': label, 'fpr': fpr, 'tpr': tpr, 'roc_auc': roc_auc})

    # Save decile data: counts per (decile, phenotype) and row-wise proportions
    decile_counts = merged_df.groupby(['PRS_decile', 'Phenotypes']).size().unstack(fill_value=0)
    decile_proportions = decile_counts.div(decile_counts.sum(axis=1), axis=0)
    decile_data.append({'label': label, 'decile_counts': decile_counts, 'decile_proportions': decile_proportions})

# Plot PRS deciles in a 2x2 facet grid with larger figure size
fig, axes = plt.subplots(2, 2, figsize=(20, 16), sharey=True)

for data, ax in zip(decile_data, axes.flatten()):
    label = data['label']
    decile_counts = data['decile_counts']
    decile_proportions = data['decile_proportions']

    # Stacked bars: case proportion (column 1) at the bottom, control
    # proportion (column 0) stacked on top of it via bottom=.
    sns.barplot(x=decile_proportions.index, y=decile_proportions[1], color='darkorange', label='Cases', ax=ax)
    sns.barplot(x=decile_proportions.index, y=decile_proportions[0], color='skyblue', label='Controls', bottom=decile_proportions[1], ax=ax)

    # Read the limits from THIS axes. The pyplot helpers (plt.ylim, plt.text,
    # plt.gca) target the most recently created subplot, not the one being
    # drawn in this iteration.
    ymin, ymax = ax.get_ylim()
    y_span = ymax - ymin
    controls_y = ymax - 0.1 * y_span   # just below the top edge
    cases_y = ymax - 0.99 * y_span     # just above the bottom edge

    # ax.patches holds the case bars first (one per decile); zip() stops after
    # them, which is fine because the control bars share the same x positions.
    for patch, control_count, case_count in zip(ax.patches, decile_counts[0], decile_counts[1]):
        x_center = patch.get_x() + patch.get_width() / 2
        ax.text(x_center, controls_y, f'{control_count}', ha='center', fontsize=12)
        ax.text(x_center, cases_y, f'{case_count}', ha='center', fontsize=12)

    ax.set_title(f'         {label}', loc='left', fontweight='bold', fontsize=16)
    ax.set_xlabel('PRS Decile', fontsize=16)
    ax.set_ylabel('Proportion', fontsize=16)
    # Drop the automatic legend; the rotated colour-coded labels below replace it
    ax.legend().remove()

    # Custom legend for Cases and Controls, placed just right of this panel
    ax.text(1.02, 0.1, 'Cases', color='darkorange', fontsize=16, va='center', ha='left', transform=ax.transAxes, rotation=90, fontweight='bold')
    ax.text(1.02, 0.8, 'Controls', color='skyblue', fontsize=16, va='center', ha='left', transform=ax.transAxes, rotation=90, fontweight='bold')

# Display the combined figure
fig.tight_layout()

# Save the figure (disabled; uncomment to write the PDF)
#folder_path = '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/Graph_Images/facet_plots/'
#file_path = os.path.join(folder_path, 'endophenotypes_deciles.pdf')

# Ensure the folder exists and save the figure
#os.makedirs(folder_path, exist_ok=True)
#fig.savefig(file_path)

plt.show()

