In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc
from scipy.stats import ttest_ind
import statistics
from matplotlib.patches import Rectangle


In [None]:
# To analyse a different subgroup, point df_prs (next cell) at one of these PRSice-2 .best files:
#/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/PRS_for_over_60/age_over_60_3.0.best
#/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/PRS_for_over_60/age_under_60_final.best
#/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/PRS_for_over_60/UKBB_females.best
#/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/PRS_for_over_60/UKBB_males.best


In [None]:
# Load the PRSice-2 ".best" output. The file is whitespace-delimited, so the
# separator is a regular expression matching one or more whitespace characters;
# the first row holds the column names.
df_prs = pd.read_csv(
    '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/PRS_for_over_60/age_over_60_3.0.best',
    sep=r'\s+',
    header=0,
    encoding='ascii',
    engine='python',
)


In [None]:
# Load the case/control table (whitespace-delimited, header in the first row).
df_cc = pd.read_csv(
    '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/16_02_new_case_control_and_cov_file/new_case_control.txt',
    sep=r'\s+',
    header=0,
    encoding='ascii',
    engine='python',
)

In [None]:
#count_ones = (df_cc['Phenotypes'] == 1).sum()

#print("Number of occurrences of 1 in the 'Phenotype' column:", count_ones)

# Preview the first rows of the PRS table. head must be *called*: printing the
# bare attribute only shows the bound-method repr, not the first five rows.
print(df_prs.head())

In [None]:
# Keep only the rows flagged as used in the regression model
# (In_Regression == 'Yes').
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not write into a slice of df_prs (SettingWithCopyWarning).
df_prs_filtered = df_prs.loc[df_prs['In_Regression'] == 'Yes'].copy()
print(df_prs_filtered)

In [None]:
# Discard every row whose PRS value is missing.
df_prs_filtered = df_prs_filtered.dropna(axis='index', how='any', subset=['PRS'])

In [None]:
# Convert the PRS column to numeric.
# errors='coerce' turns unparseable entries into NaN, so missing values are
# dropped again *after* the conversion; otherwise they would leak into the
# statistics and plots below.
# assign() returns a new frame instead of writing into what may be a slice of
# df_prs, which avoids SettingWithCopyWarning.
df_prs_filtered = (
    df_prs_filtered
    .assign(PRS=pd.to_numeric(df_prs_filtered['PRS'], errors='coerce'))
    .dropna(subset=['PRS'])
)

In [None]:
# Inner-join the filtered PRS table with the case/control table on the
# FID and IID columns.
merged_df = df_prs_filtered.merge(df_cc, how='inner', on=['FID', 'IID'])

In [None]:
# Inspect the merged PRS + case/control table.
print(merged_df)

In [None]:
# Split the PRS values by phenotype code: 1 -> cases, 0 -> controls.
cases = merged_df.loc[merged_df['Phenotypes'] == 1, 'PRS']
controls = merged_df.loc[merged_df['Phenotypes'] == 0, 'PRS']

In [None]:
# Density histograms (with KDE overlays) of the raw PRS for cases and controls.
sns.set_style("darkgrid")

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(cases, kde=True, label='Cases', color='orange', stat='density', common_norm=False, ax=ax)
sns.histplot(controls, kde=True, label='Controls', color='dodgerblue', stat='density', common_norm=False, ax=ax)

# The label= values above are only displayed once a legend is drawn.
ax.legend()

# Removing axis labels
ax.set_xlabel('')
ax.set_ylabel('')

# Grey banner just above the axes, with the title centred inside it.
title_text = 'Trial04'
box_bottom, box_height = 1.03, 0.05
title_box = Rectangle(
    (0, box_bottom), 1, box_height,
    # facecolor (not color): `color` would override the edgecolor setting.
    facecolor='lightgrey',
    edgecolor='none',
    alpha=0.5,
    transform=ax.transAxes,
    # The banner lies outside the axes area; patches added to an Axes are
    # clipped to it by default, so clipping must be disabled to see the box.
    clip_on=False,
)
ax.add_patch(title_box)
ax.text(0.5, box_bottom + box_height / 2, title_text,
        horizontalalignment='center', verticalalignment='center',
        transform=ax.transAxes, fontsize=14)

# Leave room above the axes for the banner.
fig.subplots_adjust(top=0.9)

# Removing the border
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# To save the figure, uncomment the line below; it must run BEFORE plt.show(),
# otherwise an empty canvas is written.
#fig.savefig('/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/Graph_Images/Normal_distribution/Trial04.png')

plt.show()

In [None]:
import os
import matplotlib.ticker as ticker

# Standardise the PRS (zero mean, unit variance) over all merged samples and
# store it as a new column; later cells read merged_df['PRS_standardized'].
scaler = StandardScaler()
merged_df['PRS_standardized'] = scaler.fit_transform(merged_df[['PRS']]).ravel()

# Map the phenotype codes to readable group names so seaborn builds the legend
# itself. Relabelling afterwards with plt.legend(labels=[...]) depends on the
# artists' draw order and can silently attach the wrong name to a colour.
# Rows with a phenotype other than 0/1 get no group and are left out of the plot.
group_names = {1: 'Cases', 0: 'Controls'}
plot_df = merged_df.assign(Group=merged_df['Phenotypes'].map(group_names))

# Per-group density histograms (common_norm=False normalises each group on its
# own) with KDE overlays.
sns.set_style("white")
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=plot_df,
    x='PRS_standardized',
    bins=30,
    hue='Group',
    hue_order=['Controls', 'Cases'],
    kde=True,
    stat='density',
    common_norm=False,
    ax=ax,
)

ax.set_xlabel('Standardized PRS')
ax.set_ylabel('Density')

# Set y-axis format to one decimal place and multiples of 0.1
ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%.1f'))
ax.yaxis.set_major_locator(ticker.MultipleLocator(0.1))

# Display the figure (plt.show must be called, not merely referenced).
plt.show()

In [None]:
import numpy as np
import scipy.stats as stats

# Hand-computed Welch's t-test (unequal variances) on the standardized PRS,
# consistent with ttest_ind(..., equal_var=False) used elsewhere in this notebook.
prs_cases = merged_df.loc[merged_df['Phenotypes'] == 1, 'PRS_standardized']
prs_controls = merged_df.loc[merged_df['Phenotypes'] == 0, 'PRS_standardized']

# Group means and *sample* standard deviations (ddof=1, the unbiased estimator
# that the t-test formulas assume).
mean_cases = prs_cases.mean()
mean_controls = prs_controls.mean()
std_cases = prs_cases.std(ddof=1)
std_controls = prs_controls.std(ddof=1)

# Sample size for cases and controls
n_cases = len(prs_cases)
n_controls = len(prs_controls)

# Unpooled (Welch) standard error of the difference in means.
var_term_cases = std_cases**2 / n_cases
var_term_controls = std_controls**2 / n_controls
se_mean_diff = np.sqrt(var_term_cases + var_term_controls)

# t-statistic for (cases - controls)
mean_diff = mean_cases - mean_controls
t_statistic = mean_diff / se_mean_diff

# Degrees of freedom (a number, not a DataFrame). The unpooled standard error
# above must be paired with the Welch-Satterthwaite approximation; the pooled
# n1 + n2 - 2 only applies to the equal-variance test.
df = (var_term_cases + var_term_controls) ** 2 / (
    var_term_cases**2 / (n_cases - 1) + var_term_controls**2 / (n_controls - 1)
)

# Two-sided p-value
p_value = stats.t.sf(np.abs(t_statistic), df) * 2

# 95% confidence interval for the mean difference
diff_ci = stats.t.interval(0.95, df, loc=mean_diff, scale=se_mean_diff)

# Format the output. The p-value uses general format so that very small values
# appear in scientific notation instead of being rounded to 0.0000.
output = f"Mean PRS for Cases: {mean_cases:.4f} ± {std_cases:.4f}\n" \
         f"Mean PRS for Controls: {mean_controls:.4f} ± {std_controls:.4f}\n" \
         f"Mean difference (Cases - Controls): {mean_diff:.4f}\n" \
         f"95% CI for the mean difference: ({diff_ci[0]:.4f}, {diff_ci[1]:.4f})\n" \
         f"P-value: {p_value:.4g}"

print(output)


In [None]:
## NEXT: plot the proportion of individuals (cases vs. controls) in each PRS decile

In [None]:
# Bin the standardized PRS into ten equal-frequency groups; labels=False stores
# the integer bin index (0 = lowest decile, 9 = highest).
merged_df['PRS_decile'] = pd.qcut(merged_df['PRS_standardized'], 10, labels=False)


In [None]:
# Quick look at the first rows of merged_df, which by now also carries the
# PRS_standardized and PRS_decile columns added in earlier cells.
print (merged_df.head())

In [None]:
# Count the individuals per (decile, phenotype) pair and pivot the phenotype
# into columns: one row per decile, one column per phenotype code, 0 where a
# combination does not occur.
decile_counts = (
    merged_df
    .groupby(['PRS_decile', 'Phenotypes'])
    .size()
    .unstack(fill_value=0)
)


In [None]:
# Turn the counts into within-decile proportions: divide every row by its total.
decile_totals = decile_counts.sum(axis='columns')
decile_proportions = decile_counts.div(decile_totals, axis='index')


In [None]:
# Stacked bar chart: for each PRS decile, the proportion of cases (bottom
# segment) and controls (stacked on top), annotated with the raw counts.
fig, ax = plt.subplots(figsize=(12, 6))

deciles = decile_proportions.index
x_pos = np.arange(len(deciles))
case_props = decile_proportions[1].to_numpy()
control_props = decile_proportions[0].to_numpy()

# Plain matplotlib bars make the stacking explicit through `bottom=` instead of
# relying on seaborn forwarding that keyword to matplotlib.
ax.bar(x_pos, case_props, color='darkorange', label='Cases')
ax.bar(x_pos, control_props, bottom=case_props, color='skyblue', label='Controls')
ax.set_xticks(x_pos)
ax.set_xticklabels(deciles)

# The y-limits do not change while annotating, so compute the text heights once:
# case counts sit just above the bottom edge, control counts near the top.
ymin, ymax = ax.get_ylim()
case_text_y = ymax - 0.99 * (ymax - ymin)
control_text_y = ymax - 0.1 * (ymax - ymin)

for x, case_count, control_count in zip(x_pos, decile_counts[1], decile_counts[0]):
    ax.text(x, case_text_y, f'{case_count}', ha='center')
    ax.text(x, control_text_y, f'{control_count}', ha='center')

ax.set_title('Proportion of Cases and Controls in PRS Deciles with Counts')
ax.set_xlabel('PRS Decile')
ax.set_ylabel('Proportion')
# Place the legend outside the axes, to the right.
ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.show()


In [None]:
## ROC and AUC: inputs

# Single-column feature matrix (PRS) and the phenotype labels.
X = merged_df.loc[:, ['PRS']]
y = merged_df.loc[:, 'Phenotypes']

# Standardize the PRS data (fit the scaler on X, then transform the same X).
scaler = StandardScaler()
X_standardized = scaler.fit(X).transform(X)

In [None]:

# ROC curve of the standardized PRS against the phenotype labels, and the area
# under that curve.
fpr, tpr, thresholds = roc_curve(y, X_standardized)
roc_auc = auc(fpr, tpr)

# Draw the ROC curve together with the diagonal reference line.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'AUC = {roc_auc:.2f}')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate (FPR)')
ax.set_ylabel('True Positive Rate (TPR)')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()



In [None]:
# Two-sample t-test on the raw PRS of cases vs. controls.
# ttest_ind is the unpaired test (cases and controls are independent groups);
# equal_var=False selects Welch's variant, which does not assume equal variances.
t_statistic, p_value = ttest_ind(a=cases, b=controls, equal_var=False)

for report_line in (f'T-stat: {t_statistic:.4f}', f'P-value: {p_value:.10g}'):
    print(report_line)
p_value

# NOTE(review): the author found this p-value suspiciously "perfect". With very
# large groups even a small mean difference can give an extremely small
# p-value, and a value printed as exactly 0 may be floating-point underflow
# rather than an error. TODO: confirm against the t-statistic and group sizes.

In [None]:
# Mean raw PRS of cases, then of controls.
# Series.mean() is vectorised; statistics.mean() walks the Series element by
# element in pure Python using exact fractions, which is slow on large cohorts.
print(cases.mean())
print(controls.mean())