In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import gzip
import seaborn as sns
import numpy as np
import os 
import matplotlib.lines as mlines



In [None]:
# Input file locations on the shared project drive
craig_snp_path = '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/Compare_PRS/craig_rsid_list.txt'
combined_snp_path = '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/combining_GWAS/combined_snp_list.txt'
gwas_sumstats_path = '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/Test01_withdata/GCST90011766_buildGRCh37.txt.gz'

# Both SNP lists are read without a header row; the column is named 'variant_id'
craig_snp_list = pd.read_csv(craig_snp_path, names=['variant_id'], header=None)
combined_snp_list = pd.read_csv(combined_snp_path, names=['variant_id'], header=None)

# GWAS table: gzip-compressed, tab-separated text
gwas_data = pd.read_csv(gwas_sumstats_path, compression='gzip', sep='\t')


In [None]:
# Clumped POAG SNP list, read without a header row into a 'variant_id' column
POAG_clumped_snplist = pd.read_csv(
    '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/clumping/clumped_SNP.txt',
    names=['variant_id'],
    header=None,
)

In [None]:
# Print the combined SNP list as a quick visual check of what was loaded
print(combined_snp_list)

In [None]:
# Membership masks: for every variant in one list, is it also in the other?
craig_in_combined = craig_snp_list['variant_id'].isin(combined_snp_list['variant_id'])
combined_in_craig = combined_snp_list['variant_id'].isin(craig_snp_list['variant_id'])

# Variants that appear in only one of the two lists
unique_craig_snps = craig_snp_list.loc[~craig_in_combined]
unique_combined_snps = combined_snp_list.loc[~combined_in_craig]

# Variants shared by both lists (rows are taken from the Craig list)
common_snps = craig_snp_list.loc[craig_in_combined]

# Report the size of each group
for label, snps in (
    ("Number of unique SNPs in Craig SNP list:", unique_craig_snps),
    ("Number of unique SNPs in Combined SNP list:", unique_combined_snps),
    ("Number of common SNPs between Craig and Combined SNP lists:", common_snps),
):
    print(label, len(snps))

In [None]:
# Keep the GWAS rows whose variant_id is one of the Craig-only SNPs
in_unique_craig = gwas_data['variant_id'].isin(unique_craig_snps['variant_id'])
unique_craig_snps_in_gwas = gwas_data.loc[in_unique_craig]

# Report how many Craig-only SNPs exist and how many GWAS rows matched them
n_unique_craig = len(unique_craig_snps)
n_unique_craig_in_gwas = len(unique_craig_snps_in_gwas)
print("Number of unique SNPs in Craig SNP list:", n_unique_craig)
print("Number of unique SNPs in Craig SNP list present in GWAS dataset:", n_unique_craig_in_gwas)

In [None]:
# Take the first 10 matched GWAS rows for a quick look
first_10_unique_snps = unique_craig_snps_in_gwas.head(10)

# Print a heading followed by the rows themselves
print(
    "First 10 unique SNPs in the GWAS dataset from the Craig SNP list:",
    first_10_unique_snps,
    sep='\n',
)


In [None]:
# Print the GWAS table as a quick visual check of its columns and contents
print(gwas_data)

In [None]:
# Genomic coordinates (chromosome + base-pair position) of the GWAS rows
# whose variant_id appears in each SNP list
position_columns = ['chromosome', 'base_pair_location']

in_craig_list = gwas_data['variant_id'].isin(craig_snp_list['variant_id'])
snp_positions_craig = gwas_data.loc[in_craig_list, position_columns]

in_combined_list = gwas_data['variant_id'].isin(combined_snp_list['variant_id'])
snp_positions_combined = gwas_data.loc[in_combined_list, position_columns]


In [None]:
# Comparing SNP p-values


def _chromosome_sort_key(label):
    """Sort key: numeric chromosome labels in numeric order, all other labels after them."""
    try:
        return (0, float(label), '')
    except (TypeError, ValueError):
        # Labels that are not numbers are ordered alphabetically after the numeric ones
        return (1, 0.0, str(label))


# Drop rows with null values in the 'chromosome' column
gwas_data = gwas_data.dropna(subset=['chromosome'])

# Convert chromosome to an ordered categorical and sort by it.
# The categories are exactly the labels present in the data (so no category
# without rows is kept), and they are ordered explicitly so that the ordering
# does not depend on the row order of the input file.
chromosome_order = sorted(gwas_data['chromosome'].unique(), key=_chromosome_sort_key)
gwas_data['chromosome'] = (
    gwas_data['chromosome']
    .astype('category')
    .cat.set_categories(chromosome_order, ordered=True)
)
gwas_data = gwas_data.sort_values('chromosome')

# Calculate -log10(p-value)
# NOTE(review): a p_value of exactly 0 would become +inf here - confirm none occur.
gwas_data['minuslog10pvalue'] = -np.log10(gwas_data['p_value'])

# Label every SNP by the list it belongs to. The second assignment overwrites the
# first, so a SNP present in both lists is labelled 'Full-PRS SNPs' only.
# NOTE(review): confirm that excluding shared SNPs from the 'MTAG SNPs' group is intended.
gwas_data['category'] = 'Other'
gwas_data.loc[gwas_data['variant_id'].isin(craig_snp_list['variant_id']), 'category'] = 'MTAG SNPs'
gwas_data.loc[gwas_data['variant_id'].isin(combined_snp_list['variant_id']), 'category'] = 'Full-PRS SNPs'

# Keep only the SNPs that belong to one of the two lists
filtered_gwas_data = gwas_data[gwas_data['category'].isin(['MTAG SNPs', 'Full-PRS SNPs'])]

# Custom color palette, keyed by category label
custom_palette = {
    'MTAG SNPs': '#87CEEB',        # sky blue
    'Full-PRS SNPs': 'darkorange',
}

# Violin plot of -log10(p) per chromosome, one half-violin per category
fig, ax = plt.subplots(figsize=(12, 8))
sns.violinplot(x='chromosome', y='minuslog10pvalue', hue='category', data=filtered_gwas_data,
               split=True, cut=0, inner=None, palette=custom_palette, alpha=0.9, ax=ax)

# Dashed reference line at -log10(0.001) = 3, labelled near the right-hand edge
ax.axhline(y=3, color='gray', linestyle='--')
ax.text(len(chromosome_order) - 1.5, 3, 'p-value 0.001',
        color='black', ha='right', va='bottom', fontsize=12)

# Restrict the y-axis to [0, 4]; adjust as needed
ax.set_ylim([0, 4])

# Axis labels and legend title
ax.set_xlabel('Chromosome', fontsize=12)
ax.set_ylabel('-log10(p-value)', fontsize=12)
ax.legend(title='Category')

fig.tight_layout()

# Save the figure, creating the output folder first if it does not exist
folder_path = '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/Graph_Images/facet_plots/'
file_path = os.path.join(folder_path, 'pval_comparison_mtagpoag.pdf')
os.makedirs(folder_path, exist_ok=True)
fig.savefig(file_path)

# Close the figure once it has been written to disk
plt.close(fig)


In [None]:


# Add a new column to differentiate Craig and POAG SNPs in gwas_data.
# The second assignment overwrites the first, so a SNP present in both lists
# is labelled 'POAG SNPs'.
gwas_data['source'] = 'Other'
gwas_data.loc[gwas_data['variant_id'].isin(craig_snp_list['variant_id']), 'source'] = 'Craig SNPs'
gwas_data.loc[gwas_data['variant_id'].isin(POAG_clumped_snplist['variant_id']), 'source'] = 'POAG SNPs'

# Filter data to only include Craig and POAG SNPs (explicit copy so the column
# assignments below act on an independent frame)
filtered_gwas_data = gwas_data[gwas_data['source'].isin(['Craig SNPs', 'POAG SNPs'])].copy()

# Convert chromosome to an ordered categorical and sort by it.
# Plain column assignment is used instead of `.loc[:, 'chromosome'] = ...`:
# the `.loc[:, col]` form writes into the existing column in place and can keep
# its old dtype, which would discard the categorical conversion.
# Category order follows the order in which labels appear in filtered_gwas_data.
filtered_gwas_data['chromosome'] = filtered_gwas_data['chromosome'].astype('category')
filtered_gwas_data['chromosome'] = filtered_gwas_data['chromosome'].cat.set_categories(
    filtered_gwas_data['chromosome'].unique(), ordered=True)
filtered_gwas_data = filtered_gwas_data.sort_values('chromosome')

# Create a violin plot split by chromosome and source
fig, ax = plt.subplots(figsize=(14, 8))
sns.violinplot(x='chromosome', y='base_pair_location', hue='source', data=filtered_gwas_data,
               split=True, cut=0, inner=None, ax=ax)

# Axis labels
ax.set_xlabel('Chromosome', fontsize=16)
ax.set_ylabel('Base Pair Location', fontsize=16)

# Show the graph
plt.show()


In [None]:
# Number of rows (SNPs) in the filtered table that was just plotted
total_snps = filtered_gwas_data.shape[0]
print(f"Total number of SNPs shown in the dot plot: {total_snps}")

In [None]:

# Comparing base-pair locations

# Add a new column to differentiate Craig and Full-PRS SNPs in gwas_data.
# The second assignment overwrites the first, so a SNP present in both lists
# is labelled 'Full-PRS SNPs'.
# NOTE(review): 'Full-PRS SNPs' is built from POAG_clumped_snplist here, whereas the
# p-value comparison cell builds its 'Full-PRS SNPs' group from combined_snp_list -
# confirm which list is intended.
gwas_data['source'] = 'Other'
gwas_data.loc[gwas_data['variant_id'].isin(craig_snp_list['variant_id']), 'source'] = 'Craig SNPs'
gwas_data.loc[gwas_data['variant_id'].isin(POAG_clumped_snplist['variant_id']), 'source'] = 'Full-PRS SNPs'

# Filter data to only include Craig and Full-PRS SNPs. Take an explicit copy so the
# column assignments below modify an independent frame rather than a slice of
# gwas_data (which would raise SettingWithCopyWarning).
filtered_gwas_data = gwas_data[gwas_data['source'].isin(['Craig SNPs', 'Full-PRS SNPs'])].copy()

# Convert chromosome to an ordered categorical and sort by it.
# Category order follows the order in which labels appear in filtered_gwas_data.
filtered_gwas_data['chromosome'] = filtered_gwas_data['chromosome'].astype('category')
filtered_gwas_data['chromosome'] = filtered_gwas_data['chromosome'].cat.set_categories(
    filtered_gwas_data['chromosome'].unique(), ordered=True)
filtered_gwas_data = filtered_gwas_data.sort_values('chromosome')

# Colors keyed by the values of the 'source' column
custom_palette = {'Craig SNPs': '#87CEEB',     # light blue
                  'Full-PRS SNPs': '#FFA500'}  # orange

# Create the plot: one dodged, jittered strip of points per source and chromosome
fig, ax = plt.subplots(figsize=(12, 8))
sns.stripplot(x='chromosome', y='base_pair_location', hue='source', data=filtered_gwas_data,
              dodge=True, jitter=True, palette=custom_palette, marker='o', size=3, alpha=0.7, ax=ax)

# Custom legend markers, larger than the plotted points. Colors are read from the
# palette so legend and points cannot drift apart; the Craig list is shown under
# the label 'MTAG SNPs'.
mtag_legend = mlines.Line2D([], [], color=custom_palette['Craig SNPs'], marker='o',
                            linestyle='None', markersize=5, label='MTAG SNPs')
full_prs_legend = mlines.Line2D([], [], color=custom_palette['Full-PRS SNPs'], marker='o',
                                linestyle='None', markersize=5, label='Full-PRS SNPs')

# Replace the automatic legend with the custom one (no title)
ax.legend(handles=[mtag_legend, full_prs_legend], title='')

# Axis labels
ax.set_xlabel('Chromosome', fontsize=12)
ax.set_ylabel('Base Pair Location', fontsize=12)

fig.tight_layout()

# Save the figure, creating the output folder first if it does not exist
folder_path = '/mnt/shared_folders/eResearch_glaucoma_project/Sirithi/Graph_Images/facet_plots/'
file_path = os.path.join(folder_path, 'BP_location_comparison_mtagpoag.pdf')
os.makedirs(folder_path, exist_ok=True)
fig.savefig(file_path)

# Close the figure once it has been written to disk
plt.close(fig)
