In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc
from scipy.stats import ttest_ind
import statistics
import os
from sklearn.utils import resample
from matplotlib.patches import Rectangle


In [None]:
# Load the PRSice-2 `.best` output file. The file is whitespace-delimited, so
# the separator is a regex matching one or more whitespace characters.
df_prs = pd.read_table(
    '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/16_02_new_case_control_and_cov_file/PRS_output/trialcheck2.best',
    sep=r'\s+',
    header=0,
    encoding='ascii',
    engine='python',
)


In [None]:
# Load the case/control phenotype file (whitespace-delimited, header in row 0).
df_cc = pd.read_table(
    '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/16_02_new_case_control_and_cov_file/new_case_control.txt',
    sep=r'\s+',
    header=0,
    encoding='ascii',
    engine='python',
)

In [None]:
# ID list for the mutation being analysed. To switch mutation, replace the
# path below with one of:
#   /mnt/shared_folders/eResearch_glaucoma_project/Sirithi/Post_PRS/APOE4_IDs.txt
#   /mnt/shared_folders/eResearch_glaucoma_project/Sirithi/Post_PRS/MYOC_ids.txt
#   /mnt/shared_folders/eResearch_glaucoma_project/Sirithi/Post_PRS/LHON_IDs.txt

mutation_data = pd.read_csv(
    '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/Post_PRS/LHON_IDs.txt',
    sep=r'\s+',
)
print(mutation_data)

In [None]:
# Load the covariate file (whitespace-delimited, header in row 0); presumably
# the source of the Age_at_Recruitment column used further down — verify.
# The separator is a raw string: '\s' in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
age_data = pd.read_csv('/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/16_02_new_case_control_and_cov_file/new_cov.txt', sep=r'\s+', header=0)

In [None]:
# Inspect the covariate table that was just loaded.
print(age_data)

In [None]:
#count_ones = (df_cc['Phenotypes'] == 1).sum()

#print("Number of occurrences of 1 in the 'Phenotype' column:", count_ones)
# Preview the first rows of the case/control table. `head` has to be called:
# printing the bare attribute shows the bound method, not a five-row preview.
print(df_cc.head())

In [None]:
# Only keep the samples flagged as used in the regression model
# (In_Regression == 'Yes').
# .copy() detaches the subset from df_prs, so the filtered frame is an
# independent object rather than a slice of the raw table.
df_prs_filtered = df_prs[df_prs['In_Regression'] == 'Yes'].copy()
print(df_prs_filtered.head())

In [None]:
# Discard rows whose PRS value is missing.
df_prs_filtered = df_prs_filtered.dropna(subset=['PRS'])

In [None]:
# Convert the PRS column to numeric; entries that cannot be parsed become NaN
# (errors='coerce').
# .assign builds a new frame instead of writing a column into a filtered
# frame (which pandas reports as SettingWithCopyWarning), and the trailing
# dropna removes rows whose PRS only became NaN through the coercion.
df_prs_filtered = (
    df_prs_filtered
    .assign(PRS=pd.to_numeric(df_prs_filtered['PRS'], errors='coerce'))
    .dropna(subset=['PRS'])
)

In [None]:
# Join the filtered PRS rows to the case/control table; rows are matched on
# the FID + IID pair.
merged_df = df_prs_filtered.merge(df_cc, on=['FID', 'IID'])

In [None]:
# Split the PRS values by phenotype label (1 -> cases, 0 -> controls).
is_case = merged_df['Phenotypes'] == 1
is_control = merged_df['Phenotypes'] == 0
cases = merged_df.loc[is_case, 'PRS']
controls = merged_df.loc[is_control, 'PRS']

In [None]:
# Inspect the merged PRS + case/control table.
print(merged_df)

In [None]:
# Split the merged cohort by whether the sample's IID appears in the mutation
# ID list. The membership mask is computed once and reused for both groups.
# .copy() makes each group an independent frame, so adding the PRS_decile
# column below does not write into a slice of merged_df
# (SettingWithCopyWarning).
is_carrier = merged_df['IID'].isin(mutation_data['IID'])
mutation_carriers = merged_df[is_carrier].copy()
non_carriers = merged_df[~is_carrier].copy()

# PRS deciles are computed separately within each group; qcut's 0-based bin
# labels are shifted so that 1 is the lowest decile and 10 the highest.
mutation_carriers['PRS_decile'] = pd.qcut(mutation_carriers['PRS'], 10, labels=False) + 1
non_carriers['PRS_decile'] = pd.qcut(non_carriers['PRS'], 10, labels=False) + 1


In [None]:
# NOTE(review): PRS_decile is added to mutation_carriers / non_carriers, not
# to merged_df, so this printout does not show the deciles — confirm this is
# the frame that was meant to be inspected.
print(merged_df)

In [None]:

# Split the mutation carriers by phenotype label (1 -> case, 0 -> control).
carrier_phenotype = mutation_carriers['Phenotypes']
mutation_carrier_cases = mutation_carriers[carrier_phenotype == 1]
mutation_carrier_controls = mutation_carriers[carrier_phenotype == 0]

# Row counts: all carriers, then carriers per phenotype.
total_mutation_carriers = mutation_carriers.shape[0]
num_carrier_cases = mutation_carrier_cases.shape[0]
num_carrier_controls = mutation_carrier_controls.shape[0]

# Report the tallies.
print(f"Total mutation carriers: {total_mutation_carriers}")
print(f"Mutation carrier cases: {num_carrier_cases}")
print(f"Mutation carrier controls: {num_carrier_controls}")


In [None]:

# Function to calculate cumulative risk
def calculate_cumulative_risk(df, age_df=None):
    """Return ``df`` joined to ages, ordered by age, with cumulative-risk columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``FID``, ``IID`` and ``Phenotypes`` columns.
    age_df : pandas.DataFrame, optional
        Table with ``FID``, ``IID`` and ``Age_at_Recruitment``. When omitted,
        the notebook-level ``age_data`` frame is used, which keeps existing
        one-argument calls working.

    Returns
    -------
    pandas.DataFrame
        The inner join of ``df`` and the age table, sorted by
        ``Age_at_Recruitment``, with two added columns:
        ``cumulative_cases`` (running sum of ``Phenotypes``) and
        ``cumulative_risk`` (``cumulative_cases`` divided by the number of
        rows in the joined frame).
    """
    if age_df is None:
        age_df = age_data

    # A stable sort keeps rows with equal ages in their merge order, so the
    # running sum within a tie is the same on every run.
    risk_df = (
        df.merge(age_df, on=['FID', 'IID'])
        .sort_values('Age_at_Recruitment', kind='mergesort')
    )
    risk_df['cumulative_cases'] = risk_df['Phenotypes'].cumsum()
    risk_df['cumulative_risk'] = risk_df['cumulative_cases'] / len(risk_df)
    return risk_df


# Function to bootstrap cumulative risk
def bootstrap_cumulative_risk(df, n_bootstrap=1000, random_state=None, age_df=None):
    """Bootstrap a 95% percentile band for the cumulative-risk curve of ``df``.

    Each replicate resamples the age-joined rows with replacement, orders
    them by age and computes the running sum of ``Phenotypes`` divided by the
    number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``FID``, ``IID`` and ``Phenotypes`` columns.
    n_bootstrap : int, default 1000
        Number of bootstrap replicates.
    random_state : int, numpy.random.Generator or None, default None
        Seed for the resampling. ``None`` draws fresh entropy on every call;
        pass an integer to make the band reproducible.
    age_df : pandas.DataFrame, optional
        Table with ``FID``, ``IID`` and ``Age_at_Recruitment``. When omitted,
        the notebook-level ``age_data`` frame is used.

    Returns
    -------
    numpy.ndarray
        Shape ``(2, n_rows)``: row 0 is the 2.5th percentile and row 1 the
        97.5th percentile, per position in the age-ordered frame.
    """
    if age_df is None:
        age_df = age_data

    # Ages do not change between replicates, so the join and sort are done
    # once instead of inside the loop. Resampling the joined rows also keeps
    # every replicate the same length, which np.percentile requires.
    merged = (
        df.merge(age_df, on=['FID', 'IID'])
        .sort_values('Age_at_Recruitment', kind='mergesort')
    )
    phenotypes = merged['Phenotypes'].to_numpy()
    n_rows = len(phenotypes)
    if n_rows == 0:
        return np.empty((2, 0))

    rng = np.random.default_rng(random_state)
    bootstrapped_risks = np.empty((n_bootstrap, n_rows), dtype=float)
    for replicate in range(n_bootstrap):
        # Positions index the age-ordered frame, so sorting the drawn
        # positions yields the resample already ordered by age.
        picks = np.sort(rng.integers(0, n_rows, size=n_rows))
        bootstrapped_risks[replicate] = np.cumsum(phenotypes[picks]) / n_rows

    # NOTE(review): the band is aligned by rank (i-th youngest row of each
    # resample), not by age value; confirm this is the intended band when it
    # is drawn against the observed ages.
    return np.percentile(bootstrapped_risks, [2.5, 97.5], axis=0)

# For each group, take the extreme PRS deciles (10 = top, 1 = bottom), then
# compute the observed cumulative-risk curve and its bootstrapped band.

# --- Mutation carriers ---
carrier_decile = mutation_carriers['PRS_decile']
top_decile_carriers = mutation_carriers[carrier_decile == 10]
bottom_decile_carriers = mutation_carriers[carrier_decile == 1]

top_cumulative_risk_carriers = calculate_cumulative_risk(top_decile_carriers)
ci_top_carriers = bootstrap_cumulative_risk(top_decile_carriers)

bottom_cumulative_risk_carriers = calculate_cumulative_risk(bottom_decile_carriers)
ci_bottom_carriers = bootstrap_cumulative_risk(bottom_decile_carriers)

# --- Non-carriers ---
non_carrier_decile = non_carriers['PRS_decile']
top_decile_non_carriers = non_carriers[non_carrier_decile == 10]
bottom_decile_non_carriers = non_carriers[non_carrier_decile == 1]

top_cumulative_risk_non_carriers = calculate_cumulative_risk(top_decile_non_carriers)
ci_top_non_carriers = bootstrap_cumulative_risk(top_decile_non_carriers)

bottom_cumulative_risk_non_carriers = calculate_cumulative_risk(bottom_decile_non_carriers)
ci_bottom_non_carriers = bootstrap_cumulative_risk(bottom_decile_non_carriers)


In [None]:
# Plot the cumulative risk with bootstrapped CI.
# (`os` is already imported in the notebook's import cell, so it is not
# imported again here.)

# One entry per curve: legend label, cumulative-risk frame, bootstrap band.
curves = [
    ('Top PRS Decile (Carriers)', top_cumulative_risk_carriers, ci_top_carriers),
    ('Bottom PRS Decile (Carriers)', bottom_cumulative_risk_carriers, ci_bottom_carriers),
    ('Top PRS Decile (Non-Carriers)', top_cumulative_risk_non_carriers, ci_top_non_carriers),
    ('Bottom PRS Decile (Non-Carriers)', bottom_cumulative_risk_non_carriers, ci_bottom_non_carriers),
]

fig, ax = plt.subplots(figsize=(12, 6))

for label, risk_df, ci in curves:
    # estimator=None draws every row as-is. With the default estimator,
    # seaborn averages rows that share an age and adds its own confidence
    # band, which would sit on top of the bootstrapped band drawn below.
    sns.lineplot(
        data=risk_df,
        x='Age_at_Recruitment',
        y='cumulative_risk',
        label=label,
        estimator=None,
        ax=ax,
    )
    # ci[0] / ci[1] are the lower / upper bounds of the bootstrapped band.
    ax.fill_between(risk_df['Age_at_Recruitment'], ci[0], ci[1], alpha=0.3)

ax.set_xlabel('Age')
ax.set_ylabel('Cumulative Risk of Glaucoma')
ax.legend()
fig.tight_layout()

# Save the figure
folder_path = '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/Graph_Images/facet_plots/'
file_path = os.path.join(folder_path, 'LHON_cumulative.pdf')

# Ensure the folder exists and save the figure
os.makedirs(folder_path, exist_ok=True)
fig.savefig(file_path)

plt.show()



In [None]:
# Extract cumulative risk at a given age (60 by default)
def get_cumulative_risk_at_age(df, age=60):
    """Return the cumulative risk reached by ``age``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Age_at_Recruitment`` and ``cumulative_risk`` columns.
    age : number, default 60
        Age at which to read the curve.

    Returns
    -------
    float
        The largest ``cumulative_risk`` among rows with
        ``Age_at_Recruitment <= age``; ``0.0`` when no row is that young.

    Notes
    -----
    Matching on ``<=`` rather than ``==`` means the lookup does not fail when
    nobody is exactly ``age`` years old. When several rows share the age, the
    value after all of them is returned rather than the first one.
    """
    reached = df.loc[df['Age_at_Recruitment'] <= age, 'cumulative_risk']
    if reached.empty:
        # Nobody at or below this age: no risk has accumulated yet.
        return np.float64(0.0)
    return reached.max()

# Read both carrier curves at the helper's default age (60).
top_cumulative_risk_carriers_60, bottom_cumulative_risk_carriers_60 = (
    get_cumulative_risk_at_age(curve)
    for curve in (top_cumulative_risk_carriers, bottom_cumulative_risk_carriers)
)

# Absolute gap between the top and bottom PRS decile among mutation carriers.
diff_top_bottom_carriers = (
    top_cumulative_risk_carriers_60 - bottom_cumulative_risk_carriers_60
)

print(f"Difference in cumulative risk at age 60 (Mutation Carriers): {diff_top_bottom_carriers:.4f}")


In [None]:
# Calculate the fold difference (top-decile risk divided by bottom-decile risk).
# A bottom-decile risk of 0 makes the ratio undefined, so that case is handled
# explicitly instead of relying on a division by zero.
if bottom_cumulative_risk_carriers_60 == 0:
    # 0/0 has no meaningful value (nan); a positive risk over 0 is unbounded (inf).
    fold_difference = float('nan') if top_cumulative_risk_carriers_60 == 0 else float('inf')
    print("Bottom-decile cumulative risk at age 60 is 0; the fold difference is not finite.")
else:
    fold_difference = top_cumulative_risk_carriers_60 / bottom_cumulative_risk_carriers_60
print(f"Fold difference in cumulative risk at age 60 (Mutation Carriers): {fold_difference:.4f}")