# SPTSSB Haplotype Analyses
* Pipeline adapted from Mary Makarious: https://github.com/GP2code/India-metaGWAS/blob/main/analyses/09_mapt_haplotype_analysis.ipynb

Please run this Jupyter notebook using R as a Kernel.

Variants in filtered phased file: 1,447 variants within SPTSSB locus (+-500kb from GP2 GWAS lead SNP), with MAF > 0.01 and GP2 PD GWAS p < 0.05
- This number is too high for this pipeline. Instead, from this list use only the variants with PD GWAS p < 5x10^-8 and +-5kb of SPTSSB. -> 24 variants
- UPDATE 1: This pipeline is not working for 24 variants. See alternative pipeline named "_test2".
- UPDATE 2: From visually inspecting the LDhap map, there are some potential "minimum" haplotypes (3 variants) that could be tested. Also, I can test 8 variants prioritized by Jeff or ~10 variants in highest LD with lead SNP.

In [None]:
# One-time setup. The two commented installs below are kept for reference
# (NOTE(review): presumably pinned dependencies needed on some R versions --
# confirm before enabling).
#install.packages("polspline")
#install.packages("https://cran.r-project.org/src/contrib/Archive/rms/rms_6.7-0.tar.gz", repos = NULL, type = "source")
# Install haplo.stats only when it is not already available, so re-running
# the notebook does not trigger a reinstall every time.
if (!requireNamespace("haplo.stats", quietly = TRUE)) {
  install.packages("haplo.stats")
}

In [None]:
## Libraries
library(haplo.stats)
library(ggplot2)
library(tidyverse)
# Reproducibility: the 12-element vector is kept (and displayed) for reference.
# set.seed() expects a single integer, so pass the first element explicitly
# rather than handing it the whole vector.
seed <- c(17, 53, 1, 40, 37, 0, 62, 56, 5, 52, 12, 1)
set.seed(seed[1])
seed

In [None]:
# Record the installed haplo.stats version alongside the results
packageVersion("haplo.stats")

In [None]:
# Record the R version used for this run
R.version.string

In [None]:
## Input directory
data_dir <- "/home/jupyter/SPTSSB_haplotype_analysis/edit/"

## Input file names (looked up inside data_dir)
ped_file <- "sptssb_phased_filtered_4.ped"
map_file <- "sptssb_phased_filtered_4.map"

# .ped -- genotype file. Its first 6 columns are:
#   Family ID, Individual ID, Paternal ID, Maternal ID, Sex, Phenotype
# How to produce it:
#   1) Extract individuals and SNPs from your master .bed/.bim/.fam PLINK files.
#   2) Keep only SNPs located within the SPTSSB gene region, or your region of
#      interest around it.

# .map -- SNP metadata file. Its columns are:
#   Chromosome, SNP ID, Genetic distance (set to 0 if unknown), Base-pair position
# Requirements:
#   1) SNPs must match those in the .ped file, extracted from your main GWAS
#      dataset (likely via PLINK).
#   2) They must correspond to SNPs located in the SPTSSB region.

## Data

## Data

In [None]:
### DATA SPTSSB .ped & .map files
## MAP file (SNP metadata; the SNP ID is read from column V2 further below)
map_data <- read.table(paste0(data_dir, map_file), header = FALSE)
## PED file (genotype data). Every column is read as character: with the
## default type conversion, an allele column that contains only "T" (or only
## "F") would be converted to logical TRUE/FALSE and lose its allele label.
ped_data <- read.table(paste0(data_dir, ped_file), header = FALSE,
                       colClasses = "character")
## PCA file
#

# The phenotype is in the 6th column. The recoding follows the PLINK
# convention 1 = control, 2 = case; the result is 0 = control, 1 = case.
# Any other value (e.g. the missing codes 0 / -9, or NA) becomes NA instead of
# being silently counted as a case.
pheno_raw <- as.integer(ped_data[, 6])
phenotype <- ifelse(pheno_raw == 1, 0, ifelse(pheno_raw == 2, 1, NA))

# Individuals without a usable case/control status cannot contribute to the
# association analysis, so drop them here (with a warning) to keep the
# phenotype, covariate and genotype rows aligned for all downstream steps.
missing_pheno <- is.na(phenotype)
if (any(missing_pheno)) {
  warning(sum(missing_pheno),
          " individual(s) with missing/unknown phenotype excluded",
          call. = FALSE)
  ped_data <- ped_data[!missing_pheno, , drop = FALSE]
  phenotype <- phenotype[!missing_pheno]
}

# Extract the genotype data (from the 7th column onwards)
geno_data <- ped_data[, 7:ncol(ped_data)]

# The sex (covariate) is in the 5th column; converted back to integer because
# the file was read as character.
# NOTE(review): an unknown-sex code (0 in PLINK files) is not handled here --
# confirm none are present.
sex <- as.integer(ped_data[, 5])



In [None]:
# First rows of the SNP metadata, then of the raw PED table
head(x = map_data)
head(x = ped_data)

In [None]:
# First two individuals of the allele table
head(geno_data, n = 2)

In [None]:
# Print the first 20 outcome values and the first 20 covariate (sex) values
first_twenty <- seq_len(20)
cat("outcome", phenotype[first_twenty], "\n")
cat("covariate", sex[first_twenty])

# Exploratory analysis

## Create a Genotype Matrix

In [None]:
## Assign labels to geno data
# Each SNP occupies two adjacent allele columns, labelled "<snp>.a1" and
# "<snp>.a2".

## SNP IDs come from the 2nd column of the .map file; as.character() keeps the
## labels readable even if the column was read as a factor.
snp_names <- as.character(map_data$V2)
cat('labels :', snp_names)

# Guard against a .map/.ped mismatch: without it, a too-short name vector
# would silently leave trailing columns unnamed.
stopifnot(ncol(geno_data) == 2 * length(snp_names))

# Vectorised construction (no vector grown inside a loop): repeat every SNP
# twice and let the two suffixes recycle across the pairs.
new_column_names <- paste0(rep(snp_names, each = 2), c(".a1", ".a2"))

# Assign the new column names to the data frame
colnames(geno_data) <- new_column_names



In [None]:
# Check the new column labels on the first four individuals
head(geno_data, n = 4)

## 2. Estimate Haplotype Frequency with `haplo.em`

In [None]:
## WARNING: THIS STEP DOES NOT RUN WITH ~20 VARIANTS!
# Estimate haplotype frequencies over all individuals; allele codes 0 and NA
# are declared as missing values.
save.em <- haplo.em(
  geno = geno_data,
  miss.val = c(0, NA),
  locus.label = snp_names
)
# List the components available in the fitted object
names(save.em)

In [None]:
# Print the haplo.em fit
print(x = save.em)

In [None]:
# save.em is a list-like fit object (see names(save.em)), so dim() on it
# returns NULL. Report the size of its haplotype matrix instead:
# rows = unique haplotypes, columns = loci.
dim(save.em$haplotype)

## Remark

* The print method shows the haplotypes and their estimated frequencies, followed by the final log-likelihood
statistic and the `lr stat for no LD`. The latter is the likelihood ratio test statistic that contrasts the lnlike for the
estimated haplotype frequencies with the lnlike under the null hypothesis that alleles from all loci are in
linkage equilibrium.

### Summary method

In [None]:
# Full summary of the EM fit (add nlines = 7 to truncate the output)
summary(object = save.em)

### Remark
* The first part of the summary output lists the subject id (row number of input geno matrix), the codes for the haplotypes of each pair, and the posterior probabilities of the haplotype pairs.
* The second part gives a table of the maximum number of pairs of haplotypes per subject, versus the number of pairs used
in the final posterior probabilities. 
*  The haplotype codes remove the clutter of illustrating all the alleles of the haplotypes, but may not be as informative as the actual haplotypes themselves.
* To see the actual haplotypes, use the show.haplo=TRUE option, as in the following example.

In [None]:
# Same summary, but print the alleles of each haplotype rather than the
# haplotype codes; limited to 7 lines
summary(save.em, nlines = 7, show.haplo = TRUE)

## Haplotype Frequencies by Group Subsets using `haplo.group`

In [None]:
#help(haplo.group)

In [None]:
## Haplotype frequencies overall and per phenotype group
# phenotype coding: group 0 is control, group 1 is PD
group.bin <- haplo.group(
  phenotype,
  geno_data,
  miss.val = 0,
  locus.label = snp_names
)
# Show only the first 15 lines of the (potentially long) table
print(group.bin, nlines = 15)

### Remark
* The group.bin object can be very large, depending on the number of possible haplotypes, so only a portion of the output is illustrated above (limited again by nlines). The first section gives a short summary of how many subjects appear in each of the groups. The second section is a table with the following columns:
    *  The first column gives row numbers.
    *  Total are the estimated haplotype frequencies for the entire data set.
    *  The last columns are the estimated haplotype frequencies for the subjects in each level of the group variable (`phenotype=0` and `phenotype=1`).
    *  Note that some haplotype frequencies have an NA, which appears when the haplotypes do not occur in the subgroups

In [None]:
# Class of the object returned by haplo.group
class(x = group.bin)

In [None]:
# First three rows of the per-group frequency table
head(group.bin[["group.df"]], n = 3)

In [None]:
# Group sizes and number of loci, as stored in the haplo.group result
cat('Number of controls and cases', group.bin$group.count,'\n')
cat('Number of SPTSSB loci: ', group.bin$n.loci, '\n' )
# Keep the frequency table for the merge with the regression results later on
freq_haplotypes <- group.bin$group.df
# One row per haplotype: report the row count only. dim() would also print
# the number of columns under this label.
cat('Total number of SPTSSB haplotypes', nrow(freq_haplotypes), '\n')

In [None]:
# Number of subjects per phenotype group
group.bin[["group.count"]]

**Note:**
* A total of 16 haplotypes created from the 8 SPTSSB variants

In [None]:
# Distribution of the overall (all-subject) haplotype frequencies
summary(freq_haplotypes[["Total"]])

In [None]:
# Haplotypes whose overall frequency exceeds 1%
with(freq_haplotypes, freq_haplotypes[Total > 0.01, ])

In [None]:
# Rows and columns of the frequency table
dim(x = freq_haplotypes)

## Regression Models: `haplo.glm`

### Preparing the data.frame for haplo.glm
* A data.frame must be defined, and this object must contain the trait and other optional covariates, plus a special kind of genotype matrix (`geno.glm`).

* Below we prepare a genotype matrix, `geno.glm`, and create a data.frame object, `glm.data`, for use in haplo.glm.

In [None]:
# Set up data for haplo.glm, include geno.glm,
# covariate sex; the response is phenotype (PD)

In [None]:
# Build the genotype object that haplo.glm expects (geno.glm) from the
# labelled allele table; allele codes 0 and NA are treated as missing.
geno.glm <- setupGeno(
  geno_data,
  locus.label = snp_names,
  miss.val = c(0, NA)
)
# Inspect what setupGeno attached to the result
attributes(geno.glm)

In [None]:
# Assemble the modelling table for haplo.glm: genotype object, the covariate
# (sex) and the response (pd_case)
glm.data <- data.frame(
  geno.glm,
  sex = sex,
  pd_case = phenotype
)
# Number of phenotype values, then the attributes of the assembled table
length(x = phenotype)
attributes(glm.data)

In [None]:
# Recode the alleles with setupGeno (default missing-value codes) and preview
# the first five rows of the recoded table
recoded_geno <- setupGeno(geno = geno_data, locus.label = snp_names)
head(recoded_geno, n = 5)

In [None]:
# Original (un-recoded) alleles for comparison
head(geno_data, n = 2)

# Haplotype association analysis with PD (logistic regression)
* Model (conceptually): `fit.pd <- glm(pd_case ~ sex + hapN, data = data, family = "binomial")`, fitted below with `haplo.glm`
* Covariates: `sex`

In [None]:
# Size and first three rows of the genotype object
print(dim(x = geno.glm))
head(geno.glm, n = 3)

In [None]:
# Size and first two rows of the modelling table. The original printed
# dim(geno.glm) here, repeating the previous cell, although the object being
# previewed is glm.data.
print(dim(glm.data))
head(glm.data, n = 2)

### i) Haplotype analysis: _PD ~ sex + haplotype_

In [None]:
# Haplotype logistic regression: PD status on sex plus haplotype effects.
# Conceptually equivalent to:
#fm <- glm(case ~ age + sex + hap1, data = data, family =”binomial”)
fit.pd <- haplo.glm(
  formula = pd_case ~ sex + geno.glm,
  family = binomial,
  data = glm.data,
  na.action = "na.geno.keep",
  locus.label = snp_names
)

In [None]:
# Coefficient table and haplotype listing of the fitted model
summary(object = fit.pd)

In [None]:
#help(haplo.glm)

In [None]:
# Index of the haplotype used as the reference (base) level in the model
cat("base haplotype index: ", fit.pd[["haplo.base"]], "\n")

In [None]:
# How many haplotypes the model treats as common, and which ones
cat("Number of common haplotypes", length(fit.pd[["haplo.common"]]), "\n")
cat("Common haplotypes", fit.pd[["haplo.common"]])

In [None]:
# Number of estimated model coefficients
length(fit.pd[["coefficients"]])

### Get Haplotype Frequencies for Cases and Controls and Compute Confidence Intervals

In [None]:
# Keep the model summary; its coefficient table is used for the OR/CI table
summary_fit <- summary(fit.pd)

In [None]:
# Size and first three rows of the unique-haplotype matrix stored in the fit
print(dim(fit.pd[["haplo.unique"]]))
head(fit.pd[["haplo.unique"]], n = 3)

In [None]:
# Merge UNIQUE haplotypes with haplotype frequencies
haplo_unique <- fit.pd$haplo.unique   # Matrix of unique haplotypes
haplo_freq <- fit.pd$haplo.freq       # Numeric vector of haplotype frequencies

# Convert haplo_unique to a data frame and assign SNP names as column names
haplo_unique_df <- as.data.frame(haplo_unique, stringsAsFactors = FALSE)
colnames(haplo_unique_df) <- colnames(haplo_unique)

# Add haplotype frequencies as a new column
haplo_unique_df$frequency <- haplo_freq

# View final data frame
#head(haplo_unique_df,3)
print(dim(haplo_unique_df))

# Record each haplotype's row position in haplo.unique BEFORE merging.
# merge() re-sorts its result by the key columns, so row names read after the
# merge are not guaranteed to equal the original row positions. The numeric
# suffix of the coefficient names (joined on below) is assumed to be that
# original position, as in the original code.
haplo_unique_df$haplo_index <- as.character(seq_len(nrow(haplo_unique_df)))

## Add haplotype frequencies for cases and controls (left join on the alleles)
merged_haplo_data <- merge(
  haplo_unique_df, freq_haplotypes,
  by = colnames(haplo_unique), all.x = TRUE
)
cat('Total number of unique haplotypes: ', dim(merged_haplo_data)[1], '\n')

merged_haplo_data <- merged_haplo_data %>%
  rename(freq_controls = `phenotype=0`, freq_cases = `phenotype=1`)

### Compute Confidence Intervals
## First format summary stats: move the row names (term names) into a column
sum_stats <- summary_fit$coefficients
sum_stats <- data.frame(
  Variable = rownames(sum_stats),
  sum_stats,
  row.names = NULL
)

### Calculate the Odds Ratio and Confidence Intervals
# - The coef values are log-odds, so they are exponentiated to obtain odds
#   ratios; the 95% CI is computed on the log scale (coef +/- 1.96 * se) and
#   exponentiated as well.
sum_stats <- sum_stats %>%
  mutate(
    OR = exp(coef),                      # Odds ratio
    CI_lower = exp(coef - 1.96 * se),    # Lower 95% CI
    CI_upper = exp(coef + 1.96 * se)     # Upper 95% CI
  )

### Merge the haplotype frequency data with summary stats
## Derive the join key from the term name: haplotype terms keep their trailing
## number; the non-haplotype terms get sentinel keys (1001-1003) so that they
## are not matched to a haplotype row in the join below.
sum_stats <- sum_stats %>%
  mutate(
    haplo_index = case_when(
      Variable == "(Intercept)" ~ "1001",
      Variable == "sex" ~ "1002",
      Variable == "geno.glm.rare" ~ "1003",
      TRUE ~ as.character(sub(".*\\.(\\d+)$", "\\1", Variable))
    )
  )

## Merge haplotype frequencies and summary stats (inner join: only haplotypes
## that have their own coefficient are kept)
merged_haplo_stats <- merge(merged_haplo_data, sum_stats, by = 'haplo_index')
merged_haplo_stats$OR_95_CI <- paste0(
  round(merged_haplo_stats$OR, 2), " (",
  round(merged_haplo_stats$CI_lower, 2), "-",
  round(merged_haplo_stats$CI_upper, 2), ")"
)

## Apply Bonferroni correction to p-values
ntests <- nrow(merged_haplo_stats) # Number of tests (only the haplotypes kept by the join)
merged_haplo_stats$pval_corrected <- pmin(merged_haplo_stats$pval * ntests, 1)
Bonferoni_pval <- 0.05 / ntests

# Count significant haplotypes by summing the logical test. The original
# indexed the data frame without a comma, which selects columns, so the
# reported number was always the total row count.
n_sig_nominal <- sum(merged_haplo_stats$pval < 0.05, na.rm = TRUE)
n_sig_bonferroni <- sum(merged_haplo_stats$pval < Bonferoni_pval, na.rm = TRUE)

cat('Number of haplotypes with frequency >= 1%: ', ntests, '\n')
cat('Number to association test performed: ', ntests, '\n')
cat('Bonferroni corrected pvalue Threshold: ', round(Bonferoni_pval, 4),  '\n')
cat('Number of haplotypes significant at p < 0.05: ' , n_sig_nominal,  '\n')
cat('Number of haplotypes significant at p < ', Bonferoni_pval, ': ', n_sig_bonferroni,  '\n')

##BASE HAPLOTYPE
# NOTE(review): the line below looks like output pasted from an earlier run
# with a different variant set -- confirm against summary(fit.pd).
# ===== haplo.base            A        G         A         C         A      A  0.17716 ===========

In [None]:
# Rows with nominal significance. which() gives row positions, so the count
# and the displayed table agree. The original indexed the data frame without
# a comma, which selects columns, so it always printed the total row count.
nominal_rows <- which(merged_haplo_stats$pval < 0.05)
cat('Number of haplotypes significant at p < 0.05: ' , length(nominal_rows),  '\n')
merged_haplo_stats[nominal_rows, ]

In [None]:
# Rows passing the Bonferroni threshold. which() gives row positions, so the
# count and the displayed table agree. The original indexed the data frame
# without a comma, which selects columns, so it always printed the total row
# count.
bonferroni_rows <- which(merged_haplo_stats$pval < Bonferoni_pval)
cat('Number of haplotypes significant at p < ', Bonferoni_pval, ': ', length(bonferroni_rows),  '\n')
merged_haplo_stats[bonferroni_rows, ]


In [None]:
# Tag haplotypes by the allele carried at 'chr3:161374774:C:T' (upstream variant
# prioritized in African and Ryan Corces analysis): "C" -> H11, "T" -> H4.
# A temporary H11c tag marks haplotypes carrying "C" at chr3:161344843:T:C
# together with "A" at chr3:161359842:A:G.
# Step 1: add the tag_h11, tag_h4 and tag_h11c_temp columns
merged_haplo_stats <- mutate(
  merged_haplo_stats,
  tag_h11 = ifelse(`chr3:161374774:C:T` == "C", "H11", NA),
  tag_h4 = ifelse(`chr3:161374774:C:T` == "T", "H4", NA),
  tag_h11c_temp = ifelse(
    `chr3:161344843:T:C` == "C" & `chr3:161359842:A:G` == "A",
    "H11c",
    NA
  )
)

# Step 2: Assign H11c to the row with the maximum freq_cases among those where tag_h11c_temp is H11c
# merged_haplo_stats <- merged_haplo_stats %>%
#   mutate(
#     tag_h11c = ifelse(tag_h11c_temp == "H11c" & 
#                      freq_cases == max(freq_cases[tag_h11c_temp == "H11c"], na.rm = TRUE), "H11c", NA)
#   ) %>%
#   select(-tag_h11c_temp) # Drop the temporary column

# # Define the labels for tag_h1sub
# labels_h11sub <- c('H11b', 'H11d', 'H11e', 'H11f', 'H11g', 'H11i', 'H11j', 'H11k', 'H11m', 'H11n', 'H11o', 'H11p', 'H11q', 'H11r', 'H11x', 'H11y', 'H11z')

# # Create the 'tag_h1sub' column based on specified conditions
# merged_haplo_stats <- merged_haplo_stats %>%
#   # Filter for relevant rows and sort them
#   arrange(desc(freq_cases)) %>%
#   mutate(tag_h11sub = case_when(
#     !is.na(tag_h11) & is.na(tag_h4) & is.na(tag_h11c) ~ labels_h11sub[row_number()],
#     TRUE ~ NA_character_
#   ))

# # Create the 'haplotype_tag' column based on the first non-NA value from tag_h4, tag_h11c, and tag_h11sub
# merged_haplo_stats <- merged_haplo_stats %>%
#   mutate(haplotype_tag = coalesce(tag_h4, tag_h11c, tag_h11sub))

# # Sort the data frame by haplotype_tag
# merged_haplo_stats <- merged_haplo_stats %>%
#   arrange(haplotype_tag)

### Save SPTSSB haplotype analysis
# Write the complete merged table (all columns) as a quoted, tab-delimited
# file without row names
write.table(
  merged_haplo_stats,
  file = "table_SPTSSB_haplotype_association_with_pd_case_control_Jeff_vars.tsv",
  sep = "\t",
  quote = TRUE,
  row.names = FALSE
)
## Save selected columns for paper

#### Save Haplotype Analysis Table
* Table: Association between SPTSSB haplotype and PD risk

In [None]:
# Columns reported in the paper table, in output order: the eight variant
# allele columns followed by frequencies and association statistics
paper_columns <- c(
  # "haplotype_tag",
  "chr3:161344843:T:C",
  "chr3:161359842:A:G",
  "chr3:161366258:A:C",
  "chr3:161372828:C:T",
  "chr3:161374617:T:C",
  "chr3:161374774:C:T",
  "chr3:161375544:G:A",
  "chr3:161376002:CA:C",
  "freq_cases",
  "freq_controls",
  "OR_95_CI",
  "pval",
  "haplo_index",
  "pval_corrected",
  "frequency",
  "se",
  "coef",
  "CI_lower",
  "CI_upper"
)
output_sptssb_analysis <- merged_haplo_stats %>%
  select(all_of(paper_columns))

# Write the reduced table as a quoted, tab-delimited file without row names
write.table(
  output_sptssb_analysis,
  file = "table_SPTSSB_haplotype_association_with_pd_case_control_paper_Jeff_vars.tsv",
  sep = "\t",
  quote = TRUE,
  row.names = FALSE
)