In [None]:
import pandas as pd
import os

# Load the PGS000137 scoring table; later cells read its 'rsID' and
# 'effect_weight' columns.
craig_df = pd.read_csv(
    "/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/Compare_PRS/PGS000137_rsid_with_effect_allele.csv",
    sep=',',
)

# Show the loaded table as a quick sanity check
print(craig_df)

In [None]:
# Directory where the ukbb files are located. The loading cell below scans it
# for files whose names start with "ukbb_snp" and end with ".raw".
ukbb_directory = "/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/Compare_PRS"

## Using MTAG SNP scores to calculate PRS for each individual in UKBB

In [None]:
# Identifier/covariate columns present in every UKBB .raw file. They are kept
# from the first file only so they appear exactly once in the combined frame.
id_columns = ['FID', 'IID', 'PAT', 'MAT', 'SEX', 'PHENOTYPE']

# Tracks whether the identifier columns have already been kept
columns_retained = False

# Per-file frames are collected and concatenated once at the end; concatenating
# inside the loop would re-copy the growing frame on every iteration.
ukbb_frames = []
reference_iids = None

# os.listdir() returns entries in arbitrary order, so sort the names to get a
# reproducible "first file" and a stable SNP column order between runs.
for filename in sorted(os.listdir(ukbb_directory)):
    if not (filename.startswith("ukbb_snp") and filename.endswith(".raw")):
        continue

    # Read the UKBB file
    ukbb_df = pd.read_csv(os.path.join(ukbb_directory, filename), delimiter='\t')

    # Column-wise concatenation pairs rows by position, so every file must list
    # the same individuals in the same order. Fail loudly instead of silently
    # attaching genotypes to the wrong person.
    file_iids = ukbb_df['IID'].astype(str).tolist()
    if reference_iids is None:
        reference_iids = file_iids
    elif file_iids != reference_iids:
        raise ValueError(
            f"Sample order in {filename} differs from the first UKBB file; "
            "the files cannot be combined column-wise."
        )

    if not columns_retained:
        # First file: keep the identifier columns plus every SNP ("rs...") column
        columns_to_keep = id_columns + [col for col in ukbb_df.columns if col.startswith("rs")]
        ukbb_df = ukbb_df[columns_to_keep]
        columns_retained = True
    else:
        # Subsequent files: drop the identifier columns, keep everything else
        ukbb_df = ukbb_df.drop(columns=id_columns)

    ukbb_frames.append(ukbb_df)

if not ukbb_frames:
    raise FileNotFoundError(f"No 'ukbb_snp*.raw' files found in {ukbb_directory}")

# Single column-wise concatenation of all per-file frames
combined_ukbb_df = pd.concat(ukbb_frames, axis=1)

# Print the combined DataFrame
print(combined_ukbb_df)

In [None]:
# Scale every genotype column by the effect weight of its SNP.
# NOTE: combined_ukbb_df is modified in place, so re-running this cell without
# rebuilding combined_ukbb_df applies the weights a second time.

# An rsID listed more than once would have its column multiplied repeatedly;
# apply each rsID once (first occurrence) and report the duplicates.
duplicated_rsids = craig_df.loc[craig_df['rsID'].duplicated(), 'rsID'].unique()
if len(duplicated_rsids) > 0:
    print(f"Warning: {len(duplicated_rsids)} rsIDs appear more than once in craig_df; "
          f"only the first weight of each is applied: {list(duplicated_rsids[:10])}")
unique_weights = craig_df.drop_duplicates(subset='rsID', keep='first')

matched_rsids = set()
for rsid, effect_weight in zip(unique_weights['rsID'], unique_weights['effect_weight']):
    # Only SNPs that are present as a column in the genotype frame can be weighted
    if rsid in combined_ukbb_df.columns:
        combined_ukbb_df[rsid] *= effect_weight
        matched_rsids.add(rsid)

print(f"Applied weights to {len(matched_rsids)} of {len(unique_weights)} SNPs listed in craig_df")

# SNP columns without a weight keep their raw values and would still be added
# to the PRS by the later column sum, so make them visible.
unweighted_columns = [
    col for col in combined_ukbb_df.columns
    if str(col).startswith('rs') and col not in matched_rsids
]
if unweighted_columns:
    print(f"Warning: {len(unweighted_columns)} SNP columns have no matching weight "
          f"and were left unscaled, e.g. {unweighted_columns[:10]}")

In [None]:
# Preview the first rows after weighting. head must be *called*; printing the
# bare attribute shows the bound-method repr instead of the first rows.
print(combined_ukbb_df.head())

In [None]:

# Remove previously derived score columns so the PRS can be recomputed from the
# SNP columns alone. Besides every column whose name starts with 'PRS', this
# also drops 'Standardized_PRS', which the prefix test misses and which would
# otherwise be summed into a recomputed PRS.
derived_column_mask = (
    combined_ukbb_df.columns.str.startswith('PRS')
    | (combined_ukbb_df.columns == 'Standardized_PRS')
)
combined_ukbb_df = combined_ukbb_df.loc[:, ~derived_column_mask]

# Print the DataFrame without the derived score columns
print(combined_ukbb_df)



In [None]:
# Sum the weighted SNP columns of each individual to obtain the raw PRS.
# Columns are selected by name rather than by position (iloc[:, 6:]) so that
# identifier columns and already-derived score columns can never leak into the
# sum when this cell is re-run.
non_snp_columns = ['FID', 'IID', 'PAT', 'MAT', 'SEX', 'PHENOTYPE', 'PRS', 'Standardized_PRS']
snp_columns = [col for col in combined_ukbb_df.columns if col not in non_snp_columns]

# DataFrame.sum skips NaN by default, i.e. a missing genotype contributes 0
combined_ukbb_df['PRS'] = combined_ukbb_df[snp_columns].sum(axis=1)

# Print the DataFrame with PRS column
print(combined_ukbb_df)


In [None]:
# Extract the SNP values for the first individual (first row).
# Position 6 onwards skips the six leading identifier columns; any score column
# appended after the SNP columns (e.g. 'PRS') is part of this slice as well.
first_individual_snps = combined_ukbb_df.iloc[0, 6:]

# Print the SNP values for the first individual
print(first_individual_snps)


In [None]:
# Sanity check: drop the last entry of the slice (expected to be 'PRS') and sum
# the remaining values for the first individual.
# NOTE(review): assumes 'PRS' is the final column; if 'Standardized_PRS' has
# already been added, the entry dropped here is that column instead — confirm
# the cell order before relying on this number.
sum_of_snps_first_individual = first_individual_snps.iloc[:-1].sum()

# Print the sum of SNP values for the first individual
print("Sum of SNP values for the first individual:", sum_of_snps_first_individual)



In [None]:
from sklearn.preprocessing import StandardScaler

# Raw PRS column that is about to be z-scored
prs_values = combined_ukbb_df['PRS']

# StandardScaler works on 2-D input, hence the single-column reshape
prs_as_column = prs_values.to_numpy().reshape(-1, 1)
scaler = StandardScaler()
standardized_prs_values = scaler.fit_transform(prs_as_column)

# Store the z-scored PRS next to the raw score
combined_ukbb_df['Standardized_PRS'] = standardized_prs_values

# Show the frame including the new standardized PRS column
print(combined_ukbb_df)


In [None]:
# Save the standardized PRS (IID + score) to a file for William in the glaucoma folder
saving_prs = combined_ukbb_df.loc[:, ['IID', 'Standardized_PRS']]
saving_prs.to_csv(
    '/mnt/shared_folders/eResearch_glaucoma_project/Craig_PRS.txt',
    index=False,
)



In [None]:
# Load the whitespace-delimited case/control table (first line is the header)
df_cc = pd.read_csv(
    '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/16_02_new_case_control_and_cov_file/new_case_control.txt',
    sep=r'\s+',
    header=0,
    encoding='ascii',
    engine='python',
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Join the genotype-derived scores with the case/control table on (FID, IID).
# The default join is inner, so only individuals present in both tables remain.
merged_df = combined_ukbb_df.merge(df_cc, on=['FID', 'IID'])

In [None]:
# Split the raw PRS by phenotype code: 1 -> cases, 0 -> controls
is_case = merged_df['Phenotypes'] == 1
is_control = merged_df['Phenotypes'] == 0
cases = merged_df.loc[is_case, 'PRS']
controls = merged_df.loc[is_control, 'PRS']

In [None]:
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib.patches import Rectangle

sns.set_style("darkgrid")

# Overlay density histograms (with KDE curves) of the raw PRS for cases and controls
plt.figure(figsize=(10, 6))
sns.histplot(cases, kde=True, label='Cases', color='orange', stat='density', common_norm=False)
sns.histplot(controls, kde=True, label='Controls', color='dodgerblue', stat='density', common_norm=False)

# The label= arguments above only become visible once a legend is drawn
plt.legend()

# Removing axis labels
plt.xlabel('')
plt.ylabel('')

# Adding a grey block with the title in the center, just above the axes.
# - facecolor (not color) is used so that edgecolor='none' stays in effect;
#   passing color together with edgecolor makes matplotlib override the edge.
# - clip_on=False is required because patches added to an Axes are clipped to
#   the axes area by default, which would hide a banner placed at y > 1.
title_text = 'Normal'
title_box = Rectangle((0, 1.03), 1, 0.05, fill=True, facecolor='lightgrey', alpha=0.5,
                      edgecolor='none', transform=plt.gca().transAxes, clip_on=False)
plt.gca().add_patch(title_box)
plt.text(0.5, 1.05, title_text, horizontalalignment='center', verticalalignment='center', transform=plt.gca().transAxes, fontsize=14)

# Leave head-room above the axes for the banner
plt.subplots_adjust(top=0.9)

# Removing the border
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

plt.show()

#plt.savefig('/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/Graph_Images/Normal_distribution/Untreated_diagnosed.png')

In [None]:
# Standardize the PRS data (z-score across all merged individuals)
scaler = StandardScaler()
merged_df['PRS_standardized'] = scaler.fit_transform(merged_df['PRS'].values.reshape(-1, 1))

# Readable group names for the legend. Letting seaborn build the legend from
# these names keeps colours and labels in sync; calling
# plt.legend(labels=[...]) afterwards would rebuild the legend from the first
# bar patches and detach the labels from the hue colours.
# hue_order preserves the colour assignment of the numeric codes (0 first, 1 second).
group_labels = merged_df['Phenotypes'].map({0: 'Controls', 1: 'Cases'})

# Plot the standardized distributions for both groups
plt.figure(figsize=(10, 6))
ax = sns.histplot(data=merged_df, x='PRS_standardized', hue=group_labels,
                  hue_order=['Controls', 'Cases'], bins=30, kde=True,
                  stat='density', common_norm=False)

plt.title('Standardized Normal Distribution of PRS for Cases and Controls for UKBiobank Data')
plt.xlabel('Standardized PRS')
plt.ylabel('Density')

# Re-title seaborn's own legend instead of replacing it
group_legend = ax.get_legend()
if group_legend is not None:
    group_legend.set_title('Group')
plt.show()

In [None]:
# Assign each individual to a PRS decile: pd.qcut splits the standardized PRS
# into 10 quantile-based bins, and labels=False returns integer bin codes
# (0 = lowest scores ... 9 = highest) instead of interval labels.
merged_df['PRS_decile']= pd.qcut(merged_df['PRS_standardized'], q=10, labels=False)


In [None]:
# Preview the first rows of merged_df to check the columns added so far
print (merged_df.head())

In [None]:
# Count individuals per (decile, phenotype) pair, then pivot the phenotype codes
# into columns; combinations that never occur are filled with 0
pair_sizes = merged_df.groupby(['PRS_decile', 'Phenotypes']).size()
decile_counts = pair_sizes.unstack(fill_value=0)


In [None]:
# Convert the counts into within-decile proportions (each row sums to 1)
decile_totals = decile_counts.sum(axis='columns')
decile_proportions = decile_counts.div(decile_totals, axis='index')


In [None]:
import numpy as np
import scipy.stats as stats

# Standardized PRS split by phenotype code (1 -> cases, 0 -> controls)
prs_cases = merged_df.loc[merged_df['Phenotypes'] == 1, 'PRS_standardized']
prs_controls = merged_df.loc[merged_df['Phenotypes'] == 0, 'PRS_standardized']

# Group means
mean_cases = prs_cases.mean()
mean_controls = prs_controls.mean()

# Sample standard deviations (ddof=1); a t-test needs the unbiased estimate,
# not the population form (ddof=0)
std_cases = prs_cases.std(ddof=1)
std_controls = prs_controls.std(ddof=1)

# Sample size for cases and controls
n_cases = len(prs_cases)
n_controls = len(prs_controls)

# Unpooled (Welch) standard error of the mean difference: the two group
# variances are NOT assumed equal
var_term_cases = std_cases**2 / n_cases
var_term_controls = std_controls**2 / n_controls
se_mean_diff = np.sqrt(var_term_cases + var_term_controls)

# Calculate t-statistic
t_statistic = (mean_cases - mean_controls) / se_mean_diff

# Welch-Satterthwaite degrees of freedom, the counterpart of the unpooled
# standard error above (n_cases + n_controls - 2 belongs to the pooled test)
df = (var_term_cases + var_term_controls) ** 2 / (
    var_term_cases**2 / (n_cases - 1) + var_term_controls**2 / (n_controls - 1)
)

# Calculate two-sided p-value
p_value = stats.t.sf(np.abs(t_statistic), df) * 2

# Calculate 95% confidence interval for the mean difference
diff_ci = stats.t.interval(0.95, df, loc=(mean_cases - mean_controls), scale=se_mean_diff)

# Format the output; the p-value uses scientific notation so that very small
# values are not displayed as 0.0000
mean_diff = mean_cases - mean_controls
output = f"Mean PRS for Cases: {mean_cases:.4f} ± {std_cases:.4f}\n" \
         f"Mean PRS for Controls: {mean_controls:.4f} ± {std_controls:.4f}\n" \
         f"Mean difference (Cases - Controls): {mean_diff:.4f}\n" \
         f"95% CI for the mean difference: {diff_ci}\n" \
         f"P-value: {p_value:.3e}"

print(output)


In [None]:
# Stacked bar chart: proportion of cases (bottom) and controls (top) per PRS decile
plt.figure(figsize=(12, 6))
barplot = sns.barplot(x=decile_proportions.index, y=decile_proportions[1], color='darkorange', label='Cases')
sns.barplot(x=decile_proportions.index, y=decile_proportions[0], color='skyblue', label='Controls', bottom=decile_proportions[1])

# Count labels sit at fixed heights: case counts near the bottom of the axes,
# control counts near the top
ymin, ymax = plt.ylim()
case_label_y = ymax - 0.99 * (ymax - ymin)
control_label_y = ymax - 0.1 * (ymax - ymin)

# Both barplot calls draw on the same axes, so barplot.patches holds the case
# bars first and the control bars after them. zip() stops after one count per
# decile, so only the case bars provide the x positions; the control bars
# share those positions.
for p, case_count, control_count in zip(barplot.patches, decile_counts[1], decile_counts[0]):
    x_center = p.get_x() + p.get_width() / 2
    barplot.text(x_center, case_label_y, f'{case_count}', ha='center')
    barplot.text(x_center, control_label_y, f'{control_count}', ha='center')

plt.title('Proportion of Cases and Controls in PRS Deciles with Counts')
plt.xlabel('PRS Decile')
plt.ylabel('Proportion')
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.show()

In [None]:
# Single-feature matrix (raw PRS) and the binary outcome column
X = merged_df.loc[:, ['PRS']]
y = merged_df.loc[:, 'Phenotypes']

# z-score the feature matrix
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

In [None]:


# Function to calculate AUC and its confidence interval using bootstrapping
def calculate_auc_with_ci(y, X, n_bootstraps=1000, ci_percentile=95):
    """Compute the ROC AUC and a percentile-bootstrap confidence interval.

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        Binary outcome labels.
    X : array-like of shape (n_samples,) or (n_samples, 1)
        Scores used to rank the samples.
    n_bootstraps : int, default 1000
        Number of bootstrap resamples drawn with replacement.
    ci_percentile : float, default 95
        Width of the confidence interval in percent.

    Returns
    -------
    tuple
        ``(actual_auc, lower_bound, upper_bound)``: the AUC on the full data
        and the lower/upper percentile bounds of the bootstrapped AUCs.

    Raises
    ------
    ValueError
        If ``y`` and ``X`` differ in length, or if no bootstrap resample
        contained both classes.
    """
    # Work on plain NumPy arrays so that `indices` is always positional.
    # Indexing a pandas Series with an integer array is label-based and would
    # fail or misalign whenever the index is not 0..n-1.
    labels = np.asarray(y).reshape(-1)
    scores = np.asarray(X).reshape(-1)
    if labels.shape[0] != scores.shape[0]:
        raise ValueError(
            f"y has {labels.shape[0]} entries but X has {scores.shape[0]}; "
            "X must hold exactly one score per sample."
        )

    n_samples = labels.shape[0]
    bootstrapped_aucs = []
    rng = np.random.RandomState(42)  # Seed for reproducibility

    for _ in range(n_bootstraps):
        # Bootstrap sampling: draw n_samples positions with replacement
        indices = rng.randint(0, n_samples, n_samples)
        if len(np.unique(labels[indices])) < 2:
            # Skip iteration if not enough classes
            continue

        fpr, tpr, _ = roc_curve(labels[indices], scores[indices])
        bootstrapped_aucs.append(auc(fpr, tpr))

    if not bootstrapped_aucs:
        raise ValueError("No bootstrap resample contained both classes; cannot build a confidence interval.")

    # Percentile interval; np.percentile does not need pre-sorted input
    tail = (100 - ci_percentile) / 2
    lower_bound = np.percentile(bootstrapped_aucs, tail)
    upper_bound = np.percentile(bootstrapped_aucs, 100 - tail)

    # Calculate the actual AUC on the full data
    fpr, tpr, _ = roc_curve(labels, scores)
    actual_auc = auc(fpr, tpr)

    return actual_auc, lower_bound, upper_bound

# Calculate AUC and 95% confidence interval for the standardized PRS, using the
# function's defaults (1000 bootstrap resamples, 95% interval), and print both
# rounded to two decimals
actual_auc, lower_bound, upper_bound = calculate_auc_with_ci(y, X_standardized)
print(f'AUC: {actual_auc:.2f}')
print(f'95% CI for AUC: ({lower_bound:.2f}, {upper_bound:.2f})')



In [None]:

# ROC curve of the standardized PRS against case/control status
fpr, tpr, thresholds = roc_curve(y, X_standardized)

# Area under that curve
roc_auc = auc(fpr, tpr)

# Draw the curve on an explicit axes, with the chance diagonal for reference
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'AUC = {roc_auc:.2f}')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate (FPR)')
ax.set_ylabel('True Positive Rate (TPR)')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()


In [None]:
# Row labels of the lowest and highest PRS decile
lowest_decile = decile_counts.index.min()
highest_decile = decile_counts.index.max()

# Case (column 1) and control (column 0) counts in those two deciles
cases_1st_decile = decile_counts.loc[lowest_decile, 1]
controls_1st_decile = decile_counts.loc[lowest_decile, 0]
cases_10th_decile = decile_counts.loc[highest_decile, 1]
controls_10th_decile = decile_counts.loc[highest_decile, 0]

# Odds of being a case within each decile
odds_1st_decile = cases_1st_decile / controls_1st_decile
odds_10th_decile = cases_10th_decile / controls_10th_decile

# Odds ratio: highest versus lowest decile
odds_ratio = odds_10th_decile / odds_1st_decile

# Log odds of each decile
log_odds_1st_decile = np.log(odds_1st_decile)
log_odds_10th_decile = np.log(odds_10th_decile)

# Standard error of the log odds ratio: square root of the summed reciprocal cell counts
reciprocal_count_sum = 1 / cases_1st_decile + 1 / controls_1st_decile + 1 / cases_10th_decile + 1 / controls_10th_decile
SE_log_OR = np.sqrt(reciprocal_count_sum)

# 95% CI built on the log scale (z = 1.96) and transformed back
log_odds_ratio = np.log(odds_ratio)
CI_lower = np.exp(log_odds_ratio - 1.96 * SE_log_OR)
CI_upper = np.exp(log_odds_ratio + 1.96 * SE_log_OR)

print(f'Odds Ratio between the highest and lowest risk deciles: {odds_ratio:.2f}')
print(f'95% CI: [{CI_lower:.2f}, {CI_upper:.2f}]')

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc
from scipy.stats import ttest_ind
import statistics
import os
from sklearn.utils import resample
from matplotlib.patches import Rectangle
import os 

In [None]:
# Create a new three-panel figure: (A) PRS distribution, (B) decile
# proportions, (C) ROC curve
sns.set_style("white")
fig, (ax_dist, ax_decile, ax_roc) = plt.subplots(1, 3, figsize=(21, 6))

# Plot 1: Standardized Normal Distribution of PRS
scaler = StandardScaler()
merged_df['PRS_standardized'] = scaler.fit_transform(merged_df['PRS'].values.reshape(-1, 1))

# Readable group names let seaborn build a legend whose colours match the
# groups; plt.legend(labels=[...]) would rebuild the legend from the first bar
# patches and detach the labels from the hue colours. hue_order keeps the
# colour assignment of the numeric codes (0 first, 1 second).
group_labels = merged_df['Phenotypes'].map({0: 'Controls', 1: 'Cases'})
sns.histplot(data=merged_df, x='PRS_standardized', hue=group_labels,
             hue_order=['Controls', 'Cases'], bins=30, kde=True,
             stat='density', common_norm=False, ax=ax_dist)
ax_dist.set_xlabel('Standardized PRS', fontsize=16)
ax_dist.set_ylabel('Density', fontsize=16)
ax_dist.tick_params(axis='both', which='major', labelsize=12)
dist_legend = ax_dist.get_legend()
if dist_legend is not None:
    # Same look as before: no title, no frame, 12pt entries
    dist_legend.set_title('')
    dist_legend.set_frame_on(False)
    for legend_text in dist_legend.get_texts():
        legend_text.set_fontsize(12)
ax_dist.text(-0.1, 1, '(A)', color='black', fontsize=16, va='center', ha='left', transform=ax_dist.transAxes, fontweight='bold')

# Plot 2: Proportion of Cases and Controls in PRS Deciles
barplot = sns.barplot(x=decile_proportions.index, y=decile_proportions[1], color='darkorange', label='Cases', ax=ax_decile)
sns.barplot(x=decile_proportions.index, y=decile_proportions[0], color='skyblue', label='Controls', bottom=decile_proportions[1], ax=ax_decile)

# Count labels at fixed heights: case counts near the bottom, control counts near the top
ymin, ymax = ax_decile.get_ylim()
case_label_y = ymax - 0.99 * (ymax - ymin)
control_label_y = ymax - 0.1 * (ymax - ymin)

# The case bars come first in barplot.patches and share their x positions with
# the control bars; zip() pairs them positionally with the counts, so the
# number of deciles is not hard-coded and no label-based lookup is needed.
for p, case_count, control_count in zip(barplot.patches, decile_counts[1], decile_counts[0]):
    x_center = p.get_x() + p.get_width() / 2
    barplot.text(x_center, case_label_y, f'{case_count}', ha='center')
    barplot.text(x_center, control_label_y, f'{control_count}', ha='center')

# Show 1-based decile numbers on the x axis
n_deciles = len(decile_proportions)
ax_decile.set_xticks(np.arange(n_deciles))
ax_decile.set_xticklabels([str(k) for k in range(1, n_deciles + 1)])
ax_decile.set_xlabel('PRS Decile', fontsize=16)
ax_decile.set_ylabel('Proportion', fontsize=16)
ax_decile.tick_params(axis='both', which='major', labelsize=12)
ax_decile.text(-0.1, 1, '(B)', color='black', fontsize=16, va='center', ha='left', transform=ax_decile.transAxes, fontweight='bold')

# Custom legend for Cases and Controls: empty legend box, coloured rotated text instead
ax_decile.legend([], frameon=False)
ax_decile.text(1.02, 0.1, 'Cases', color='darkorange', fontsize=14, va='center', ha='left', transform=ax_decile.transAxes, rotation=90, fontweight='bold')
ax_decile.text(1.02, 0.8, 'Controls', color='skyblue', fontsize=14, va='center', ha='left', transform=ax_decile.transAxes, rotation=90, fontweight='bold')

# Plot 3: ROC Curve
fpr, tpr, thresholds = roc_curve(y, X_standardized)
roc_auc = auc(fpr, tpr)
ax_roc.plot(fpr, tpr, color='darkorange', lw=2, label=f'AUC = {roc_auc:.2f}')
ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel('False Positive Rate (FPR)', fontsize=16)
ax_roc.set_ylabel('True Positive Rate (TPR)', fontsize=16)
ax_roc.tick_params(axis='both', which='major', labelsize=12)
ax_roc.legend(loc='lower right', frameon=False, fontsize=14)  # Increase font size for the legend
ax_roc.text(-0.1, 1, '(C)', color='black', fontsize=16, va='center', ha='left', transform=ax_roc.transAxes, fontweight='bold')

# Tighten the layout of the combined figure
fig.tight_layout()

# Save the figure
folder_path = '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/Graph_Images/facet_plots/'
file_path = os.path.join(folder_path, 'Craig_scores(MTAG_PRS).pdf')

# Ensure the folder exists and save the figure
os.makedirs(folder_path, exist_ok=True)
fig.savefig(file_path)

# Release the figure; it is written to disk rather than displayed
plt.close(fig)
