## Data analysis
https://www.r-graph-gallery.com/101_Manhattan_plot.html

In [None]:
[global]
# Working directory: the merged summary-statistics file is written into this
# directory, and the same path is handed to setwd() in the R cell further down
parameter: out_dir = path('/Users/dianacornejo/pleiotropy_UKB')
# File name of the merged summary stats output (created inside out_dir)
parameter: name = 'summary_stats.txt.gz'

Merge summary stats files using python

In [None]:
import glob
import gzip

# Merge every per-chromosome summary-statistics file found in the current
# directory into a single gzip-compressed file `{out_dir}/{name}`.
# `out_dir` and `name` come from the [global] parameters.
#
# The inputs are named *.gz and the output ends in .gz, so both sides go
# through gzip in text mode instead of plain open(), which would try to
# decode compressed bytes on read and write uncompressed text on output.

# Sorted so that the order of the merged rows is reproducible between runs.
file_list = sorted(glob.glob("*.snp_stats.bgen.gz"))
if not file_list:
    # Fail before the output file is created/truncated.
    raise FileNotFoundError(
        "No files matching '*.snp_stats.bgen.gz' in the current directory"
    )

with gzip.open(f'{out_dir}/{name}', "wt") as outfile:
    for index, stats_path in enumerate(file_list):
        with gzip.open(stats_path, "rt") as infile:
            for line in infile:
                # The first file is copied verbatim (this keeps its header);
                # in every other file the lines starting with "SNP" (the
                # repeated header) are skipped.
                if index == 0 or not line.startswith("SNP"):
                    outfile.write(line)

First of all set the working directory. This is where the BOLT-LMM association results are located.

In [None]:
# setwd() needs a character string, so the interpolated path has to be quoted;
# unquoted, R would receive a bare path, which is not valid R syntax.
# NOTE(review): assumes ${out_dir} is expanded by SoS before R sees this cell - confirm
setwd("${out_dir}")

In [None]:
# Sub-directory holding the per-trait result files (used by the list.files()
# example further down)
mydir <- "BMI"

Install and load required libraries

In [None]:
#install.packages("readr")
#install.packages("plyr")
#install.packages("ggplot2")
#library(readr)
#library(plyr)
#library(ggplot2)

Then create a list of the files to be combined in R (in this case the summary statistics from the association analysis) and build the dataset by row-binding the files for chr{1:22}. The drawback is that R runs out of memory when doing this.

In [None]:
#file_list = Sys.glob("*.snp_stats.bgen.gz") #one way of creating lists
#file_list = list.files(path=mydir, pattern="*.snp_stats.bgen.gz", full.names=TRUE) #another way of creating lists
#data.list = lapply(file_list, function(x){read.table(file = x,header = TRUE, sep = "\t")})
#data.merged = do.call("rbind", data.list)

In [None]:
Create the manhattan plot 

In [None]:
#manhattan(gwasResults, chr="CHR", bp="BP", snp="SNP", p="P" )

# Let's highlight them, with a bit of customization on the plot
#manhattan(gwasResults, highlight = snpsOfInterest)
#Annotate the SNPs of interest
#manhattan(gwasResults, annotatePval = 0.01)
#Make a qqplot
#qq(gwasResults$P)

### Distribution of quantitative traits, QQ plots, standardization and inverse rank normalization

For this step I made a copy of the file `UKB_caucasians_BMIwaisthip_AsthmaAndT2D_withagesex_033120` to my local directory `/home/dc2325/phenotypes_UKB`

Then start R, install and load the required packages

In [None]:
# Packages needed for the plots (ggplot2, ggpubr) and for the rank-based
# inverse normal transformation (RNOmni)
required_packages <- c("ggplot2", "RNOmni", "ggpubr")

# Install only what is missing instead of re-downloading and re-installing
# all three packages every time the cell is run
is_installed <- vapply(
  required_packages, requireNamespace, logical(1), quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}

library(ggpubr)
library(RNOmni)
library(ggplot2)

Evaluate the distribution of quantitative traits via histograms and QQ plots

In [None]:
# Load the phenotype file and take a first look at its contents.

# Clean any existing variables for safety
rm(list = ls())

# Set the working directory (the input file is read from here and every
# relative output path below is resolved against it)
setwd("~/phenotypes_UKB")

# Read in the data, in this case space delimited
df <- read.table(
  "UKB_caucasians_BMIwaisthip_AsthmaAndT2D_withagesex_033120",
  header = TRUE,
  sep = " ",
  dec = "."
)

# List the first 5 and the last 5 rows of the data
head(df, 5)
tail(df, 5)

# Summary of contents
attributes(df)

# Another way to view the data. glimpse() comes from dplyr, which this
# notebook never attaches, so call it through its namespace and skip it when
# dplyr is not installed (str() below shows the same information).
if (requireNamespace("dplyr", quietly = TRUE)) {
  dplyr::glimpse(df)
}

# Display some summary statistics of the data
summary(df)

# Display the structure of the data frame
str(df)

# Make sure your data is a data frame
class(df)

# Check for missing values in BMI and in every column of the dataset.
# Counts are reported instead of printing one TRUE/FALSE per row.
sum(is.na(df$BMI))
colSums(is.na(df))

# Make density histograms using ggplot2 for the quantitative traits in UKB:
# BMI, waist circumference and hip circumference. Each plot is saved as a
# 6 x 4 inch PNG at 300 dpi in the working directory.
#
# after_stat(density) replaces the deprecated `..density..` notation, and the
# png() argument is spelled out as `units` instead of relying on partial
# matching of `unit`.

bmi_plot_density <- ggplot(df, aes(x = BMI)) +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 1, color = "black", fill = "white"
  ) +
  scale_color_grey() +
  scale_fill_grey() +
  scale_x_continuous(breaks = seq(10, 70, 10)) +
  labs(title = "Histogram for BMI", x = "BMI") +
  theme_classic()
png("Density_plot_BMI_UKB_Caucasian.png",
    width = 6, height = 4, units = "in", res = 300)
print(bmi_plot_density)
dev.off()

waist_plot_density <- ggplot(df, aes(x = WAIST)) +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 2, color = "black", fill = "white"
  ) +
  scale_color_grey() +
  scale_fill_grey() +
  scale_x_continuous(breaks = seq(50, 200, 50)) +
  labs(title = "Histogram for waist circumference", x = "waist circumference") +
  theme_classic()
png("Density_plot_waist_circumference_UKB_Caucasian.png",
    width = 6, height = 4, units = "in", res = 300)
print(waist_plot_density)
dev.off()

hip_plot_density <- ggplot(df, aes(x = HIP)) +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 2, color = "black", fill = "white"
  ) +
  scale_color_grey() +
  scale_fill_grey() +
  scale_x_continuous(breaks = seq(50, 200, 50)) +
  scale_y_continuous(breaks = seq(0, 0.05, 0.01)) +
  labs(title = "Histogram for hip circumference", x = "hip circumference") +
  theme_classic()
png("Density_plot_hip_circumference_UKB_Caucasian.png",
    width = 6, height = 4, units = "in", res = 300)
print(hip_plot_density)
dev.off()

# Calculate waist to hip ratio and append to data frame.
# The standalone WHR vector is kept as well, as in the original workflow.

df$WHR <- df$WAIST / df$HIP
WHR <- df$WHR

# Plot density histogram for waist-to-hip ratio and save it as a 6 x 4 inch
# PNG at 300 dpi. after_stat(density) replaces the deprecated `..density..`
# notation; `units` is spelled out rather than partially matched.

whr_plot_density <- ggplot(df, aes(x = WHR)) +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 0.05, color = "black", fill = "white"
  ) +
  scale_color_grey() +
  scale_fill_grey() +
  labs(title = "Histogram for waist-to-hip ratio", x = "WHR") +
  theme_classic()
png("Density_plot_whr_UKB_Caucasian.png",
    width = 6, height = 4, units = "in", res = 300)
print(whr_plot_density)
dev.off()

# Make the QQ plots with ggpubr for BMI, waist circumference, hip
# circumference and waist-to-hip ratio. Each plot is saved as a 6 x 4 inch
# PNG at 300 dpi in the working directory.
# The png() argument is spelled out as `units` instead of relying on partial
# matching of `unit`.

bmi_qqplot <- ggqqplot(df$BMI, title = "QQ plot BMI UKB Caucasian")
png("QQplot_BMI_UKB_Caucasian.png",
    width = 6, height = 4, units = "in", res = 300)
print(bmi_qqplot)
dev.off()

waist_circum_qqplot <- ggqqplot(
  df$WAIST,
  title = "QQ plot Waist circumference UKB Caucasian"
)
png("QQplot_WAIST_UKB_Caucasian.png",
    width = 6, height = 4, units = "in", res = 300)
print(waist_circum_qqplot)
dev.off()

hip_circum_qqplot <- ggqqplot(
  df$HIP,
  title = "QQ plot Hip circumference UKB Caucasian"
)
png("QQplot_HIP_UKB_Caucasian.png",
    width = 6, height = 4, units = "in", res = 300)
print(hip_circum_qqplot)
dev.off()

whr_qqplot <- ggqqplot(
  df$WHR,
  title = "QQ plot Waist to hip ratio UKB Caucasian"
)
png("QQplot_WHR_UKB_Caucasian.png",
    width = 6, height = 4, units = "in", res = 300)
print(whr_qqplot)
dev.off()

# Perform inverse rank normalization for traits with a non-normal
# distribution (e.g. BMI).
# The RNOmni transform requires the removal of missing observations.

# Create a data frame with complete cases for BMI
df_complete_BMI <- df[!is.na(df$BMI), ]

# Make sure you did the right thing (the BMI count should be 0)
colSums(is.na(df_complete_BMI))

# Now apply the rank-based inverse normal transformation (INT) from RNOmni
# and append the new variable.
# The transform is exported as `RankNorm` in current RNOmni releases and as
# `rankNorm` in older ones, so use whichever the installed version provides.
rank_norm_name <- intersect(
  c("RankNorm", "rankNorm"),
  getNamespaceExports("RNOmni")
)
if (length(rank_norm_name) == 0) {
  stop("The installed RNOmni exports neither RankNorm() nor rankNorm()",
       call. = FALSE)
}
rank_norm <- getExportedValue("RNOmni", rank_norm_name[1])

df_complete_BMI$rankNorm_BMI <- rank_norm(df_complete_BMI$BMI)

# Make the corresponding graphs to visualize the data.
# after_stat(density) replaces the deprecated `..density..` notation, and
# bins = 30 states the geom_histogram() default explicitly so that no
# "pick better value" message is emitted.

INT_BMI_density_plot <- ggplot(df_complete_BMI, aes(x = rankNorm_BMI)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    color = "black",
    fill = "white",
    alpha = .2
  ) +
  labs(title = "Density plot for INT-BMI", x = "rankNorm_BMI") +
  theme_classic()
png("Density_plot_INT-BMI_UKB_Caucasian.png",
    width = 6, height = 4, units = "in", res = 300)
print(INT_BMI_density_plot)
dev.off()

INT_BMI_qqplot <- ggplot(df_complete_BMI, aes(sample = rankNorm_BMI)) +
  stat_qq() +
  labs(title = "QQ plot for INT-BMI UKB-EUR") +
  theme_classic()
png("QQplot_INT-BMI_UKB_Caucasian.png",
    width = 6, height = 4, units = "in", res = 300)
print(INT_BMI_qqplot)
dev.off()

# Write new dataframe to a space delimited file

write.table(
  df_complete_BMI,
  file = "UKB_caucasians_BMIwaisthip_AsthmaAndT2D_INT-BMI_withagesex_041720",
  append = FALSE,
  sep = " ",
  dec = ".",
  row.names = FALSE,
  col.names = TRUE,
  quote = FALSE
)
