In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc



In [None]:
# Directory that holds the covariate, case/control and PRS output files for this run.
DATA_DIR = '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/16_02_new_case_control_and_cov_file'

# Age-at-recruitment / covariate file (whitespace-delimited, first row is the header).
# The separator must be a raw string: a plain '\s+' is an invalid escape sequence
# and triggers a warning on modern Python versions.
age_df = pd.read_csv(f'{DATA_DIR}/new_cov.txt', sep=r'\s+', header=0)

# PRS output and case/control file (both whitespace-delimited with a header row).
# Presumably df_prs supplies the 'PRS' column and df_cc the 'Phenotypes' column used
# further down -- confirm against the files.
df_prs = pd.read_table(f'{DATA_DIR}/PRS_output/trial04.best', sep=r'\s+', header=0, encoding='ascii', engine='python')
df_cc = pd.read_table(f'{DATA_DIR}/new_case_control.txt', sep=r'\s+', header=0, encoding='ascii', engine='python')


In [None]:
# Preview only the first rows; leaving the frame as the cell's last expression
# uses the notebook's rich table rendering instead of a plain-text dump.
age_df.head()

In [None]:

# Join the PRS table to the case/control table, then attach the age/covariate table.
# Both joins use pandas' default inner join on (FID, IID), so an individual that is
# missing from any of the three tables does not appear in merged_df.
merged_df = (
    df_prs
    .merge(df_cc, on=['FID', 'IID'])
    .merge(age_df, on=['FID', 'IID'])
)

# Raw PRS values split by phenotype code (1 -> cases, 0 -> controls)
cases = merged_df.loc[merged_df['Phenotypes'] == 1, 'PRS']
controls = merged_df.loc[merged_df['Phenotypes'] == 0, 'PRS']


In [None]:

# Standardize the PRS (zero mean, unit variance across the whole merged cohort).
# fit_transform returns an (n, 1) array, so flatten it before storing as a column.
scaler = StandardScaler()
merged_df['PRS_standardized'] = scaler.fit_transform(merged_df['PRS'].values.reshape(-1, 1)).ravel()

# Overlaid density histograms + KDE curves for cases and controls.
# The hue is a readable label derived from the phenotype code (1 = case, 0 = control,
# matching how `cases`/`controls` are split elsewhere in this notebook) so that seaborn
# builds the legend itself. Overriding labels through plt.legend(labels=[...]) does not
# follow seaborn's hue levels and can attach the wrong name to a colour.
# Rows whose phenotype is neither 0 nor 1 get no label and are left out of the plot.
group_labels = merged_df['Phenotypes'].map({1: 'Cases', 0: 'Controls'})
plt.figure(figsize=(10, 6))
sns.histplot(
    data=merged_df.assign(Group=group_labels),
    x='PRS_standardized',
    bins=30,
    hue='Group',
    hue_order=['Cases', 'Controls'],
    kde=True,
    stat='density',
    common_norm=False,  # normalise each group separately so unequal group sizes stay comparable
)
plt.title('Standardized Normal Distribution of PRS for Cases and Controls for UKBiobank Data')
plt.xlabel('Standardized PRS')
plt.ylabel('Density')
plt.show()


In [None]:
# Box plot of the standardized PRS, one box per phenotype code
box_fig, box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=merged_df, x='Phenotypes', y='PRS_standardized', palette="Set3", ax=box_ax)
box_ax.set_title('Box Plot of Standardized PRS for Cases and Controls in UKBiobank Data')
box_ax.set_xlabel('Phenotypes')
box_ax.set_ylabel('Standardized PRS')
plt.show()

In [None]:
# Per-phenotype descriptive statistics of the standardized PRS.
# Named aggregation assigns the final column labels directly, in this order.
summary_stats = merged_df.groupby('Phenotypes')['PRS_standardized'].agg(**{
    'Mean': 'mean',
    'Median': 'median',
    'Standard Deviation': 'std',
    '25th Percentile (Q1)': lambda prs: prs.quantile(0.25),
    '75th Percentile (Q3)': lambda prs: prs.quantile(0.75),
})

# Interquartile range = Q3 - Q1
summary_stats['IQR'] = summary_stats['75th Percentile (Q3)'].sub(summary_stats['25th Percentile (Q1)'])

# Show the resulting table
print(summary_stats)

In [None]:
# Assign every individual to one of 100 equal-sized PRS bins (integer codes starting at 0)
merged_df['Percentile'] = pd.qcut(merged_df['PRS_standardized'], 100, labels=False)

# Share of each phenotype code inside every percentile bin (each row sums to 1).
# NOTE: these two tables are not consumed by the plot below.
percentile_counts = merged_df.groupby(['Percentile', 'Phenotypes']).size().unstack(fill_value=0)
percentile_prevalence = percentile_counts.div(percentile_counts.sum(axis=1), axis=0)

# Strip (dot) plot: one dot per individual, x = percentile bin, y = standardized PRS
plt.figure(figsize=(18, 10))
sns.stripplot(x='Percentile', y='PRS_standardized', hue='Phenotypes', data=merged_df, dodge=True, palette=['skyblue', 'darkorange'], jitter=0.3, alpha=0.7)

# The y ticks stay in standardized-PRS units: the plotted values are PRS scores, not
# prevalences, so percentage tick labels would misrepresent the axis.

plt.title('Dot Plot of PRS Percentiles for Cases and Controls')
plt.xlabel('PRS Percentile')
plt.ylabel('Standardized PRS')
plt.legend(title='Group', loc='upper left', bbox_to_anchor=(1, 1))

plt.show()

In [None]:

## NEXT - proportion of individuals vs PRS decile graph

# Integer decile code per individual (starting at 0); bins whose edges coincide are merged
# rather than raising, because duplicates='drop' is set.
merged_df['PRS_decile'] = pd.qcut(
    merged_df['PRS_standardized'],
    q=10,
    labels=False,
    duplicates='drop',
)

# Number of individuals per (PRS decile, age at recruitment), with one column per
# phenotype code; combinations that never occur are filled with 0.
decile_age_phenotype_sizes = merged_df.groupby(
    ['PRS_decile', 'Age_at_Recruitment', 'Phenotypes']
).size()
decile_counts = decile_age_phenotype_sizes.unstack(fill_value=0)


In [None]:
# Preview only the first rows; leaving the frame as the cell's last expression
# uses the notebook's rich table rendering instead of a plain-text dump.
merged_df.head()

In [None]:
# PRS decile code per individual (qcut with labels=False yields integer codes from 0)
merged_df['PRS_decile'] = pd.qcut(merged_df['PRS_standardized'], q=10, labels=False)

# Age bands. pd.cut uses right-closed intervals by default, e.g. '30-40' covers (30, 40].
age_bins = [0, 30, 40, 50, 60, np.inf]
age_labels = ['Under 30', '30-40', '40-50', '50-60', 'Over 60']
merged_df['Age_Group'] = pd.cut(
    merged_df['Age_at_Recruitment'],
    bins=age_bins,
    labels=age_labels,
)

# Grouped bars: the height is the mean of 'Phenotypes' within each (decile, age group)
# cell (seaborn's default estimator), drawn without error bars.
decile_fig, decile_ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=merged_df, x='PRS_decile', y='Phenotypes', hue='Age_Group', ci=None, ax=decile_ax)
decile_ax.set_title('Proportion of Cases and Controls in PRS Deciles by Age Group')
decile_ax.set_xlabel('PRS Decile')
decile_ax.set_ylabel('Proportion')
decile_ax.legend(title='Age Group')
plt.show()

In [None]:
#MYOC
# Step 1: Read the file containing the IDs of patients with the MYOC mutation
# (one ID per line, no header row).
myoc_patients_df = pd.read_csv('/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/Post_PRS/MYOC_ids.txt', header=None, names=['MYOC_ID'])

# Step 2: Keep only the individuals whose IID appears in the MYOC list (inner join).
# The deciles/percentiles already on merged_df were computed on the full cohort and
# are carried over unchanged.
merged_df_myoc = pd.merge(merged_df, myoc_patients_df, left_on='IID', right_on='MYOC_ID', how='inner')

# Step 3: Bin these patients into age groups
merged_df_myoc['Age_Group'] = pd.cut(merged_df_myoc['Age_at_Recruitment'], bins=age_bins, labels=age_labels)

# Step 4: Plot the risk decile plot, color-coding the patients based on their age bins.
# `order` pins all ten decile codes (0-9) to fixed x positions. Without it, a decile
# that has no MYOC carrier would be dropped from the axis and the remaining bars would
# slide under the wrong '1'..'10' tick labels set below.
# (assumes PRS_decile holds integer codes 0-9 -- TODO confirm there are no missing PRS values)
plt.figure(figsize=(12, 6))
sns.barplot(data=merged_df_myoc, x='PRS_decile', y='Phenotypes', hue='Age_Group', ci=None, order=range(0, 10))
plt.title('Proportion of MYOC Mutation Patients in PRS Deciles by Age Group')
plt.xlabel('PRS Decile')
plt.ylabel('Proportion')
plt.xticks(np.arange(0, 10), ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
plt.legend(title='Age Group')
plt.show()


In [None]:
# Preview only the first rows; leaving the frame as the cell's last expression
# uses the notebook's rich table rendering instead of a plain-text dump.
merged_df_myoc.head()

In [None]:
# Identify MYOC carriers who are cases (Phenotypes == 1) and controls (Phenotypes == 0).
# .copy() makes each subset an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning or depend on view-vs-copy behaviour.
myoc_cases = merged_df_myoc[merged_df_myoc['Phenotypes'] == 1].copy()
myoc_controls = merged_df_myoc[merged_df_myoc['Phenotypes'] == 0].copy()


In [None]:
# Bin these patients into age groups.
# right=False makes every band left-closed: '40-50' covers [40, 50).
# Ages below 40 or at/above 80 fall outside all bins, get a missing Age_Group and are
# therefore excluded from the counts below (groupby drops missing keys).
age_bins = [40, 50, 60, 70, 80]
age_labels = ['40-50', '50-60', '60-70', '70-80']

# .assign returns a new frame instead of writing a column onto a filtered slice,
# which avoids SettingWithCopyWarning.
myoc_cases = myoc_cases.assign(
    Age_Group=pd.cut(myoc_cases['Age_at_Recruitment'], bins=age_bins, labels=age_labels, right=False)
)
myoc_controls = myoc_controls.assign(
    Age_Group=pd.cut(myoc_controls['Age_at_Recruitment'], bins=age_bins, labels=age_labels, right=False)
)

# Count the number of cases and controls in each PRS decile and age group.
# observed=False keeps every age band for each decile present in the data (with Count 0
# where nobody falls in it) and states explicitly what older pandas did by default.
case_count_data = myoc_cases.groupby(['PRS_decile', 'Age_Group'], observed=False).size().reset_index(name='Count')
control_count_data = myoc_controls.groupby(['PRS_decile', 'Age_Group'], observed=False).size().reset_index(name='Count')


In [None]:
# Ensure PRS_decile is treated as categorical to avoid misalignment of bars.
# Decile codes come from a 10-bin qcut with labels=False, i.e. 0..9, so there are
# exactly ten categories; an eleventh category would only add an empty bar slot.
case_count_data['PRS_decile'] = pd.Categorical(case_count_data['PRS_decile'], categories=range(0, 10))
control_count_data['PRS_decile'] = pd.Categorical(control_count_data['PRS_decile'], categories=range(0, 10))



In [None]:
# Small aggregated table: show it as the cell's last expression for rich rendering.
control_count_data

In [None]:
# Number of MYOC-carrier controls per PRS decile, split by age group.
# `order` lists exactly the ten decile codes (0-9) so the ten tick labels below line
# up one-to-one with the bar groups; an eleventh slot would be empty and unlabeled.
plt.figure(figsize=(12, 6))
sns.barplot(data=control_count_data, x='PRS_decile', y='Count', hue='Age_Group', ci=None, order=range(0, 10))
# The title names what is plotted: counts (not proportions) of the control subset.
plt.title('Count of MYOC Mutation Controls in PRS Deciles by Age Group')
plt.xlabel('PRS Decile')
plt.ylabel('Count')
plt.xticks(np.arange(0, 10), ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
plt.legend(title='Age Group')
plt.show()

In [None]:

import matplotlib.ticker as ticker
import os


def _plot_myoc_decile_counts(ax, count_data, panel_label, ylabel, y_max=110):
    """Draw one panel of MYOC-carrier counts per PRS decile, split by age group.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    count_data : pandas.DataFrame
        Table with 'PRS_decile', 'Age_Group' and 'Count' columns.
    panel_label : str
        Bold caption placed just above the top-left corner of the axes.
    ylabel : str
        Y-axis label (pass '' to leave the axis unlabeled).
    y_max : float, optional
        Upper y limit, shared by the panels so they are directly comparable.
        Bars taller than this are clipped.
    """
    sns.barplot(data=count_data, x='PRS_decile', y='Count', hue='Age_Group',
                palette='Paired', ci=None, order=range(0, 10), ax=ax)
    ax.set_xlabel('PRS Decile', fontsize=15)
    ax.set_ylabel(ylabel, fontsize=15)
    # Decile codes 0-9 are shown to the reader as 1-10
    ax.set_xticks(np.arange(0, 10))
    ax.set_xticklabels(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
    ax.legend(title='Age Group')
    ax.text(-0.05, 1.02, panel_label, color='black', fontsize=10, va='center', ha='left',
            transform=ax.transAxes, fontweight='bold')
    ax.set_ylim(0, y_max)


sns.set_style("white")

# Side-by-side panels; the wide figure leaves room between the two plots
fig, (ax_cases, ax_controls) = plt.subplots(1, 2, figsize=(16, 7))

# Panel A: cases, Panel B: controls (the y label is shown on the left panel only)
_plot_myoc_decile_counts(ax_cases, case_count_data, '(A) MYOC Cases', 'Count')
_plot_myoc_decile_counts(ax_controls, control_count_data, '(B)  MYOC Controls', '')

fig.tight_layout()

# Save the figure as a PDF, creating the output folder if it does not exist yet
folder_path = '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/Graph_Images/facet_plots/'
file_path = os.path.join(folder_path, 'MYOC_age_bins.pdf')
os.makedirs(folder_path, exist_ok=True)
fig.savefig(file_path)

# Close the figure so it is not also rendered inline and its memory is released
plt.close(fig)

In [None]:
# Number of MYOC carriers classified as controls (row count of the subset)
print(myoc_controls.shape[0])