forked from dmgatti/MUGA_reference_data
/
MegaMUGA_Reference_QC.Rmd
executable file
·1729 lines (1487 loc) · 91.3 KB
/
MegaMUGA_Reference_QC.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "Reference Data QC: MegaMUGA"
site: workflowr::wflow_site
output:
workflowr::wflow_html:
toc: true
toc_float: true
toc_collapsed: true
toc_depth: 3
editor_options:
chunk_output_type: console
---
# MegaMUGA Annotations
```{r setup, echo=FALSE, include=FALSE}
knitr::opts_chunk$set(message = FALSE)
# library(tidyverse)
# ^ this if running locally
require(dplyr)
require(tidyr)
require(stringr)
# ^ these if running on sumner
require(data.table)
require(purrr)
require(furrr)
require(qtl2)
require(magrittr)
require(DT)
require(plotly)
require(progress)
require(ape)
require(RColorBrewer)
require(furrr)
future::plan(multicore)
today <- format(Sys.Date(), format="%Y%m%d")
QCtheme <- theme_bw() +
theme(panel.grid = element_blank(),
axis.text = element_text(colour = "black"),
axis.title = element_text(colour = "black"))
###########################################
# Function designed to recode genotype calls from letter format (i.e. G1, HET, or G2) to numeric format (i.e. 0, 1, 2)
# Inputs:
# x = Column of genotype values
# Outputs:
# Numeric vector of recoded genotypes
###########################################
recodeCalls <- function(x){
y <- factor(c(as.matrix(x)))
levels(y)[which(levels(y) == "H")] <- 1
levels(y)[which(levels(y) != 1)] <- c(0,2)
return(as.numeric(as.character(y)))
}
###########################################
# Function to form F1 genotypes from founder consensus genotypes
# Inputs:
# x = Row of genotype values
# Outputs:
# Numeric vector of single letter genotypes
###########################################
callGeno <- function(x){
# Compare genotypes from both strains
if(c(x[[2]] == x[[3]])){
#If they are the same, keep the first value
predicted.geno <- x[[2]]
} else {
#If they are different, code as a het
predicted.geno <- "H"
}
return(predicted.geno)
}
###########################################
# Function used in the loop which forms founder consensus genotypes prior to background QC
# Inputs:
# mk = marker
# data = genotype data for each founder sample
# f = founder strain
# Outputs:
# data frame with 1 row with columns: marker; consensus genotype for founder strain f
###########################################
removeExtremeInts <- function(mk, data, f){
# Calculate summary statistics for probe intensities
sd.x.int <- sd(data$x_int)
sd.y.int <- sd(data$y_int)
mean.x.int <- mean(data$x_int)
mean.y.int <- mean(data$y_int)
data %>%
# With a prior expectation provided by an "N" call, determine whether probe intensities are unusual and
# flag samples that meet both criteria
dplyr::mutate(x.ex = dplyr::if_else(x_int > (mean.x.int + sd.x.int) | x_int < (mean.x.int - sd.x.int), true = "EX", false = ""),
y.ex = dplyr::if_else(y_int > (mean.y.int + sd.y.int) | y_int < (mean.y.int - sd.y.int), true = "EX", false = ""),
flag = dplyr::if_else(genotype == "N" & (x.ex == "EX" | y.ex == "EX"), true = "FLAG", false = "")) %>%
dplyr::filter(flag != "FLAG") %>%
dplyr::mutate(marker = mk) %>%
dplyr::distinct(marker,genotype) %>%
dplyr::select(marker,genotype) %>%
`colnames<-`(c("marker",f))
}
###########################################
# Function to form F1 genotypes from founder consensus genotypes
# Inputs:
# x = Row of genotype values
# Outputs:
# Numeric vector of single letter genotypes
###########################################
callHemiGeno <- function(x){
# Mitochondria comes from dam
predicted.geno <- x[[2]]
return(predicted.geno)
}
###########################################
# Function to call expected Chr X genotypes for F1 samples based on sample sex and calculate concordance between expected and observed F1 genotypes
# Inputs:
# sp = Sample name
# isex = Sample sex
# data = Sample genotype data
# cross = F1 genotype predictions
# chrX_markers = data frame of chromosome X marker names
# Outputs:
# geno_comp_Xrecoded = data frame of sample genotypes and concordance with expected F1 genotypes
###########################################
callXGeno <- function(sp, isex, data, cross, chrX_markers){
if(isex == "m"){
# Filter expected genotypes to just X chromosome markers
male_F1 <- cross %>%
dplyr::filter(marker %in% chrX_markers$marker) %>%
dplyr::ungroup()
# Use callHemiGeno function to assign maternal genotypes as expected X genotypes in males
male_F1$predicted_genotypes <- apply(male_F1, 1, callHemiGeno)
male_recoded_cross <- cross %>%
dplyr::filter(!marker %in% chrX_markers$marker) %>%
dplyr::bind_rows(.,male_F1)
# Calculate the concordance between genotypes using hemizygous X calls
geno_comp_Xrecoded <- data %>%
dplyr::inner_join(.,male_recoded_cross %>%
dplyr::ungroup() %>%
dplyr::select(marker,predicted_genotypes)) %>%
dplyr::mutate(matching_genos = dplyr::if_else(genotype == predicted_genotypes,
true = "MATCH",
false = "NO MATCH"),
alt_chr = dplyr::case_when(chr == "M" ~ "M",
chr == "X" ~ "X",
chr == "Y" ~ "Y",
is.na(chr) ~ "Other",
TRUE ~ "Autosome"),
alt_chr = as.factor(alt_chr),
sample = sp,
inferred.sex = isex)
} else {
# Calculate the concordance between genotypes using diploid X calls
geno_comp_Xrecoded <- data %>%
dplyr::inner_join(.,cross %>%
dplyr::ungroup() %>%
dplyr::select(marker,predicted_genotypes)) %>%
dplyr::mutate(matching_genos = dplyr::if_else(genotype == predicted_genotypes,
true = "MATCH",
false = "NO MATCH"),
alt_chr = dplyr::case_when(chr == "M" ~ "M",
chr == "X" ~ "X",
chr == "Y" ~ "Y",
is.na(chr) ~ "Other",
TRUE ~ "Autosome"),
alt_chr = as.factor(alt_chr),
sample = sp,
inferred.sex = isex)
}
return(geno_comp_Xrecoded)
}
###########################################
# Function to form reference sample genotypes for comparison
# Inputs:
# Consensus genotypes for dam strain
# Consensus genotypes for sire strain
# Outputs:
# Numeric vector of recoded genotypes
###########################################
founder_background_QC <- function(dam, sire){
# Extract strain names from genotype objects\
dam.df <- data.frame(dam)
sire.df <- data.frame(sire)
dam_strain <- gsub(colnames(dam.df)[2], pattern = "X", replacement = "")
sire_strain <- gsub(colnames(sire.df)[2], pattern = "X", replacement = "")
if(dam_strain == sire_strain){
print(paste0("Running QC: ", dam_strain))
} else {
print(paste0("Running QC: (", dam_strain, "x", sire_strain, ")F1"))
}
# Identify samples from supplied strains
maternalF1s <- all_founder_samples %>%
dplyr::filter(dam == dam_strain,
sire == sire_strain) %>%
dplyr::mutate(sire = as.factor(sire),
dam = as.factor(dam))
if(nrow(maternalF1s) == 0){
# Some crosses don't exist in reference data, can't be QC'd
return("No samples from this cross; skipping")
} else {
# Remove consensus genotypes for each strains that are N's
mom <- dam.df[which(!dam.df[,2] %in% c("H","N")),]
dad <- sire.df[which(!sire.df[,2] %in% c("H","N")),]
# Form a hypothetical F1 hybrid by combining genotypes from markers that exist in both strains
if(dam_strain == sire_strain){
cross <- dplyr::inner_join(mom[complete.cases(mom),],dad[complete.cases(dad),], "marker")
} else {
cross <- dplyr::inner_join(mom[complete.cases(mom),],dad[complete.cases(dad),])
}
# Form a list of mitochondrial markers
alt_chr_M <- genos.flagged %>%
dplyr::filter(chr %in% c("M")) %>%
dplyr::distinct(marker)
# Form a list of Chr X markers
alt_chr_X <- genos.flagged %>%
dplyr::filter(chr %in% c("X")) %>%
dplyr::distinct(marker)
# Predict F1 genotypes from consensus genotypes for each strain
predicted.genotypes <- apply(cross, 1, FUN = callGeno)
cross$predicted_genotypes <- predicted.genotypes
# Filter the artificial F1 hybrid genotype calls down to mitochondrial markers
mito_cross <- alt_chr_M %>%
dplyr::left_join(., cross)
# Assign maternal mitochondrial genotype to predicted cross instead of hets where strains have different mitotypes
mito_cross$predicted_genotypes <- apply(mito_cross, 1, FUN = callHemiGeno)
cross_mitorecoded <- cross %>%
dplyr::filter(!marker %in% alt_chr_M$marker) %>%
dplyr::bind_rows(.,mito_cross)
# Gather genotypes for F1 samples and remove bad markers
sample_genos <- maternalF1s %>%
dplyr::right_join(genos.flagged,.,by = "sample") %>%
dplyr::filter(marker %in% mom$marker,
genotype != "N") %>%
dplyr::distinct(marker, sample, genotype, .keep_all = TRUE) %>%
dplyr::filter(marker_flag != "FLAG")
# Nest genotypes by sample and sex to prepare for calling Chr X genotypes
nested_samples <- sample_genos %>%
dplyr::group_by(sample, inferred.sex) %>%
tidyr::nest()
# Create a list of mitochondrial and X genotypes to be supplied to the callXGeno function
cross_mitorecoded_list <- list()
for(i in 1:length(nested_samples$data)){
cross_mitorecoded_list[[i]] <- cross_mitorecoded
}
alt_chr_X_list <- list()
for(i in 1:length(nested_samples$data)){
alt_chr_X_list[[i]] <- alt_chr_X
}
# Recode X genotypes to match expectations for male and female samples and form a dataframe of sample genotypes (and concordance status)
final_geno_comp <- purrr::pmap(.l = list(nested_samples$sample,
nested_samples$inferred.sex,
nested_samples$data,
cross_mitorecoded_list,
alt_chr_X_list),
.f = callXGeno) %>%
Reduce(rbind,.)
# Tabulate the percentage of concordant genotypes between theoretical and actual F1 samples
geno_comp_summary <- final_geno_comp %>%
dplyr::group_by(sample, inferred.sex, alt_chr, matching_genos) %>%
dplyr::count() %>%
tidyr::pivot_wider(names_from = matching_genos, values_from = n) %>%
dplyr::mutate(dam = dam_strain,
sire = sire_strain)
# Return genotypes and their summary statistics
return(list(final_geno_comp,geno_comp_summary))
}
}
###########################################
# Function to quickly pull strain names from non-founder samples
# Inputs:
# Sample IDs
# Outputs:
# Character vector of strains
###########################################
findStrain <- function(x){
# Decompose strain from mouse ID for males
m_strains <- strsplit(x, split = "m")[[1]]
# If the string was actually split using "m" the sample was male
if(TRUE %in% str_detect(m_strains, pattern = "X12")){
# If there was a 129 or similar mouse with an m in the strain name, some collapsing needs to occur
X129_strains <- paste(strsplit(x, split = "m")[[1]][[1]], sep = "m", collapse = "m")
return(X129_strains)
} else if(length(m_strains) == 1){
f_strains <- strsplit(x, split = "f")[[1]][[1]]
return(f_strains)
} else {
return(m_strains[[1]][[1]])
}
}
###########################################
# Function to re-derive founder consensus genotypes for founders that are missing consensus calls due to one or a few bad calls
# Inputs:
# mk = marker name
# data = data frame with founder strain samples, intensities, individual sample genotype calls , and existing consensus genotype calls
# Outputs:
# 1) Consensus genotype calls for each founder strain
# 2) Wide genotype table for all founder samples
# 3) Data frame with recoded consensus genotypes for each samples with intensity data and strain names, as well as a flag for whether the sample contributed to a recoded consensus genotype
###########################################
findConsensusGenotypes <- function(mk, data){
# Examples use mk = "UNC13666424"
# Annotate sample genotype data with summed intensities as a heuristic and filter to sample genotypes assigned a real genotype
call_filtered_data <- data %>%
dplyr::mutate(sum_int = x_int+y_int) %>%
dplyr::filter(sample_genotype %in% c("A","C","T","G","H"))
# Example:
# Note: note NOD samples missing a consensus genotype, see below for expected results
# strain consensus_genotype sample x_int y_int sample_genotype sum_int
# <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl>
# 1 129S1/SvImJ A X129S1.SvImJf0827.1 0.716 0.083 A 0.799
# 2 129S1/SvImJ A X129S1.SvImJm 0.618 0.079 A 0.697
# 3 129S1/SvImJ A X129S1.SvImJm.1 0.599 0.084 A 0.683
# 4 129S1/SvImJ A X129S1.SvImJm1314 0.69 0.097 A 0.787
# 5 NOD/ShiLtJ NA NOD.ShiLtJf0713.1 0.567 0.07 A 0.637
# 6 NOD/ShiLtJ NA NOD.ShiLtJf0713.2 0.6 0.095 A 0.695
# 7 NOD/ShiLtJ NA NOD.ShiLtJf0713.3 0.883 0.091 A 0.974
# 8 NOD/ShiLtJ NA NOD.ShiLtJm0150 0.646 0.088 A 0.734
# 9 NOD/ShiLtJ NA NOD.ShiLtJm1214 0.491 0.057 A 0.548
# 10 NOD/ShiLtJ NA NOD.ShiLtJm35324 0.538 0.077 A 0.615
# 11 NOD/ShiLtJ NA NOD.ShiLtJm39173 0.649 0.087 A 0.736
# Derive a lower bound by which to identify true N calls using the intensity data
real_N_cutoff <- quantile(call_filtered_data$sum_int, probs = seq(0,1,0.05))[[2]]
# Use this threshold to identify potential miscalls and remove them in order to recode samples without consensus calls
filtered_data <- call_filtered_data %>%
dplyr::mutate(test = if_else(sum_int < real_N_cutoff & is.na(consensus_genotype), true = "miscall", false = "")) %>%
dplyr::filter(test != "miscall")
# Determine the number of clusters to use for k-means clustering of intensity values;
# If the previous steps have succeeded, there should be one or two real genotypes segregating and samples can be re-called by k-means
n_genos <- length(unique(filtered_data$sample_genotype[which(filtered_data$sample_genotype %in% c("A","C","T","G","H"))]))
ints_for_kmeans <- filtered_data %>%
dplyr::select(x_int, y_int)
consensus_geno_clusters <- kmeans(ints_for_kmeans, centers = n_genos)$cluster
# Joining each sample's cluster assignment to the sample intensity metrics
genos.k.means <- filtered_data %>%
dplyr::mutate(clust = as.factor(consensus_geno_clusters))
# Create a join table to pair up clusters with genotypes
gens_by_cluster_tab <- genos.k.means %>%
dplyr::group_by(sample_genotype, consensus_genotype, clust) %>%
dplyr::count() %>%
arrange(-n)
# Example:
# sample_genotype consensus_genotype clust n
# 1 G G 2 32
# 2 A A 1 23
# 3 A NA 1 6
# Filter the join table to hopefully have a 1:1 relationship between consensus genotypes and clusters
top_clusters <- gens_by_cluster_tab %>%
dplyr::filter(!is.na(consensus_genotype)) %>%
dplyr::ungroup() %>%
dplyr::distinct(sample_genotype, clust) %>%
dplyr::rename(consensus_genotype = sample_genotype)
# Example:
# consensus_genotype clust
# 1 G 2
# 2 A 1
# Samples are then recoded according to the k-means assigned genotypes and noted whether a consensus was re-assigned
unknown_before <- filtered_data %>%
dplyr::filter(is.na(consensus_genotype))
recoded_consensus_genotypes <- genos.k.means %>%
dplyr::select(-consensus_genotype) %>%
dplyr::left_join(.,top_clusters, by = "clust") %>%
dplyr::mutate(marker = mk) %>%
dplyr::mutate(recoded = dplyr::if_else(sample %in% unknown_before$sample, true = "RECODED", false = "")) %>%
dplyr::select(-clust)
# Example: recoded_consensus_genotypes[20:30,]
# Note NOD samples are now recoded with the consensus genotype predicted from k-means across all founder samples
# strain sample x_int y_int sample_genotype sum_int test consensus_genotype marker recoded
# <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr> <chr> <chr> <chr>
# 1 129S1/SvImJ X129S1.SvImJf0827.1 0.716 0.083 A 0.799 "" A UNC13666424 ""
# 2 129S1/SvImJ X129S1.SvImJm 0.618 0.079 A 0.697 "" A UNC13666424 ""
# 3 129S1/SvImJ X129S1.SvImJm.1 0.599 0.084 A 0.683 "" A UNC13666424 ""
# 4 129S1/SvImJ X129S1.SvImJm1314 0.69 0.097 A 0.787 "" A UNC13666424 ""
# 5 NOD/ShiLtJ NOD.ShiLtJf0713.1 0.567 0.07 A 0.637 "" A UNC13666424 "RECODED"
# 6 NOD/ShiLtJ NOD.ShiLtJf0713.2 0.6 0.095 A 0.695 "" A UNC13666424 "RECODED"
# 7 NOD/ShiLtJ NOD.ShiLtJf0713.3 0.883 0.091 A 0.974 "" A UNC13666424 "RECODED"
# 8 NOD/ShiLtJ NOD.ShiLtJm0150 0.646 0.088 A 0.734 "" A UNC13666424 "RECODED"
# 9 NOD/ShiLtJ NOD.ShiLtJm35324 0.538 0.077 A 0.615 "" A UNC13666424 "RECODED"
# 10 NOD/ShiLtJ NOD.ShiLtJm39173 0.649 0.087 A 0.736 "" A UNC13666424 "RECODED"
# 11 NZO/HILtJ NZO.HILtJf0588 0.041 0.728 G 0.769 "" G UNC13666424 ""
# Generate a wide table of consensus as expected for output file
recoded <- recoded_consensus_genotypes %>%
dplyr::distinct(marker, strain, consensus_genotype) %>%
tidyr::pivot_wider(names_from = strain, values_from = consensus_genotype)
# marker `A/J` `C57BL/6J` `129S1/SvImJ` `NOD/ShiLtJ` `NZO/HILtJ` `CAST/EiJ` `PWK/PhJ` `WSB/EiJ`
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 UNC13666424 G G A A G A A G
# Check for any founders that still segregate a genotype within the marker
still_consensus <- recoded_consensus_genotypes %>%
dplyr::group_by(strain, consensus_genotype) %>%
dplyr::count() %>%
dplyr::group_by(strain) %>%
dplyr::count()
# If there are still discrepancies (i.e. non-N genotypes segregating among samples), return the equivalent of the input data for downstream work
if(2 %in% still_consensus$n){
original_consensus <- suppressWarnings(data %>%
dplyr::mutate(marker = mk) %>%
dplyr::distinct(marker, strain, consensus_genotype) %>%
tidyr::pivot_wider(names_from = strain, values_from = consensus_genotype))
original_sample_calls <- suppressWarnings(data %>%
dplyr::mutate(marker = mk) %>%
dplyr::distinct(marker, sample, sample_genotype) %>%
tidyr::pivot_wider(names_from = sample,
values_from = sample_genotype))
original_data <- data %>%
dplyr::mutate(marker = mk,
recoded = "")
return(list(original_consensus, original_sample_calls, original_data))
} else{
sample_new_consensus_calls <- recoded_consensus_genotypes %>%
dplyr::distinct(marker, sample, consensus_genotype) %>%
tidyr::pivot_wider(names_from = sample,
values_from = consensus_genotype)
return(list(recoded, sample_new_consensus_calls, recoded_consensus_genotypes))
}
}
```
## Reading in reference genotypes and metadata
First I read in the reference sample genotypes, as well as marker annotations from an [analysis](https://github.com/kbroman/MUGAarrays) previously conducted by Karl Broman, Dan Gatti, and Belinda Cornes.
```{r Reading in reference genotypes and metadata}
# Reading in reference sample genotype data
control_genotypes <- suppressWarnings(data.table::fread(cmd = "unzip -cq data/MegaMUGA/control.genotypes.csv.zip",
check.names = T))
colnames(control_genotypes)[1] <- "marker"
# Later in analysis, lowercase "i" in strain ID eliminates a founder sample from background QC
colnames(control_genotypes)[which(str_detect(colnames(control_genotypes), pattern = "NZO.HiLtJm36511"))] <- "NZO.HILtJm36511"
## Reading in marker annotations fro Broman, Gatti, & Cornes analysis
mm_metadata <- data.table::fread("data/MegaMUGA/mm_uwisc_v2.csv")
```
## Marker QC: Searching for missing genotype calls
We searched for probes where many mice are missing genotype calls.
```{r Markers with high "N" counts among reference samples}
## Calculating allele frequencies for each marker
control_allele_freqs <- control_genotypes %>%
tidyr::pivot_longer(-marker, names_to = "sample", values_to = "genotype") %>%
dplyr::group_by(marker, genotype) %>%
dplyr::count() %>%
# Result: number of genotype calls for each marker across all samples
# i.e.
# marker genotype n
# B6_01-033811444-S A 8
# B6_01-033811444-S H 2
# B6_01-033811444-S N 1
dplyr::ungroup() %>%
dplyr::group_by(marker) %>%
dplyr::mutate(freq = round(n/sum(n), 3),
genotype = as.factor(genotype)) %>%
# Result: allele frequency calls for each marker across all samples
# i.e.
# marker genotype n freq
# B6_01-033811444-S A 8 0.022
# B6_01-033811444-S H 2 0.005
# B6_01-033811444-S N 1 0.003
# B6_01-033811444-S T 353 0.970
dplyr::left_join(., mm_metadata)
## Filtering to markers with missing genotypes
no.calls <- control_allele_freqs %>%
dplyr::ungroup() %>%
dplyr::filter(genotype == "N") %>%
tidyr::pivot_wider(names_from = genotype,
values_from = n) %>%
dplyr::select(marker, chr, bp_grcm39, freq) %>%
dplyr::mutate(chr = as.factor(chr))
## Identifying markers with missing genotypes at a frequency higher than the 95th percentile of "N" frequencies across all markers
cutoff <- quantile(no.calls$freq, probs = seq(0,1,0.05))[[20]]
above.cutoff <- no.calls %>%
dplyr::filter(freq > cutoff)
```
Of `r length(unique(as.factor(control_allele_freqs$marker)))` markers, `r nrow(no.calls)` failed to genotype at least one sample, and `r nrow(above.cutoff)` markers failed to genotype at least `r cutoff*100`% of samples.
```{r Plotting no calls, echo=FALSE}
# Returns a table of bad markers with all available metadata
above.cutoff %>%
dplyr::left_join(.,mm_metadata) %>%
dplyr::mutate(chr = as.factor(chr)) %>%
dplyr::arrange(marker) %>%
DT::datatable(., filter = "top",
escape = FALSE,
options = list(columnDefs = list(list(width = '20%', targets = c(8)))))
# Distribution of N frequencies across all markers. Dotted line indicates the 95th percentile of N frequencies (cutoff).
ggplot(no.calls, mapping = aes(x = freq)) +
geom_histogram(bins = 100) +
scale_x_continuous(breaks = seq(0,1,0.1)) +
geom_vline(xintercept = cutoff, linetype = 2) +
QCtheme +
labs(x = "Fraction of mice with missing genotypes",
y = "Number of markers")
```
## Sample QC
### Searching for samples with poor marker representation
In a similar fashion, we calculated the number of reference samples with missing genotypes. Repeated observations of samples/strains with identical names meant that genotype counts for each marker among them couldn't be grouped and tallied, so determining no-call frequency occurred column-wise. Mouse over individual samples to see the number of markers with missing genotypes for each sample.
```{r Reference samples with high missingness}
## Calculating the number of missing markers for each sample
n.calls.strains <- apply(X = control_genotypes[,2:ncol(control_genotypes)],
MARGIN = 2,
function(x) table(x)[5])
n.calls.strains.df <- data.frame(n.calls.strains)
n.calls.strains.df$sample <- names(n.calls.strains)
n.calls.strains.df %<>%
dplyr::rename(n.no.calls = n.calls.strains)
# n.no.calls sample
# 2355 (129S1/SvImJxA/J)F1f0056
# 2204 (129S1/SvImJxA/J)F1f0056
# 2171 (129S1/SvImJxC57BL/6J)F1f15916
# 2348 (129S1/SvImJxC57BL/6J)F1m15914
# 2232 (129S1/SvImJxCAST/EiJ)F1f005
# 2328 (129S1/SvImJxCAST/EiJ)F1m002
# 2291 (129S1/SvImJxNOD/ShiLtJ)F1f0063
# 2197 (129S1/SvImJxNZO/HILtJ)F1f0005
# 2316 (129S1/SvImJxNZO/HILtJ)F1m0004
## Interactive plot of the number of missing genotypes for each sample.
bad_sample_cutoff <- quantile(n.calls.strains.df$n.no.calls, probs = seq(0,1,0.05))[20]
high.n.samples <- n.calls.strains.df %>%
dplyr::filter(n.no.calls > bad_sample_cutoff)
sampleQC <- ggplot(n.calls.strains.df,
mapping = aes(x = reorder(sample,n.no.calls),
y = n.no.calls,
text = paste("Sample:", sample))) +
geom_point() +
QCtheme +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
geom_hline(yintercept = bad_sample_cutoff, linetype = 2) +
labs(x = "Number of mice with missing genotypes",
y = "Number of markers")
ggplotly(sampleQC, tooltip = c("text","y"))
```
### Validating sex of reference samples
We next validated the sexes of each sample using sex chromosome probe intensities. We paired up probe intensities, joined available metadata, and filtered down to only markers covering the X and Y chromosomes.
```{r Filtering to Chr X Markers}
## Reading in genotype intensities
x_intensities <- suppressWarnings(data.table::fread(cmd = "unzip -cq data/MegaMUGA/control.X.csv.zip",
check.names = T))
colnames(x_intensities)[1] <- "marker"
colnames(x_intensities)[which(str_detect(colnames(x_intensities), pattern = "NZO.HiLtJm36511"))] <- "NZO.HILtJm36511"
y_intensities <- suppressWarnings(data.table::fread(cmd = "unzip -cq data/MegaMUGA/control.Y.csv.zip",
check.names = T))
colnames(y_intensities)[1] <- "marker"
colnames(y_intensities)[which(str_detect(colnames(y_intensities), pattern = "NZO.HiLtJm36511"))] <- "NZO.HILtJm36511"
## Check to see if dimensions of intensity tables are identical, marker orders identical, and sample orders identical
if((unique(dim(x_intensities) == dim(y_intensities)) &&
unique(colnames(x_intensities) == colnames(y_intensities)) &&
unique(x_intensities$marker == y_intensities$marker)) == TRUE){
## Pivoting the data longer
x_int_long <- x_intensities %>%
tidyr::pivot_longer(cols = -marker,
names_to = "sample",
values_to = "x_int")
y_int_long <- y_intensities %>%
tidyr::pivot_longer(cols = -marker,
names_to = "sample",
values_to = "y_int")
long_intensities <- cbind(x_int_long, y_int_long)
} else {
print("Source intensity data frames have non-identical structure; exiting")
}
## Joining slimmer intensity files with marker metadata and reducing to markers on sex chromosomes
long_XY_intensities <- long_intensities[,c(1,2,3,6)] %>%
dplyr::left_join(., mm_metadata) %>%
dplyr::filter(chr %in% c("X","Y"))
# Expected output
# marker ample x_int y_int chr bp_mm10 bp_grcm39 cM_cox strand snp unique
# XiD1 X.129S1.SvImJxA.J.F1f0056 1.161 0.094 X 102827921 101871527 44.17434 plus TG TRUE
# XiD1 X.129S1.SvImJxA.J.F1f0056.1 1.034 0.054 X 102827921 101871527 44.17434 plus TG TRUE
# XiD1 X.129S1.SvImJxC57BL.6J.F1f15916 0.805 0.068 X 102827921 101871527 44.17434 plus TG TRUE
# XiD1 X.129S1.SvImJxC57BL.6J.F1m15914 0.371 0.035 X 102827921 101871527 44.17434 plus TG TRUE
# XiD1 X.129S1.SvImJxCAST.EiJ.F1f005 0.696 0.040 X 102827921 101871527 44.17434 plus TG TRUE
# XiD1 X.129S1.SvImJxCAST.EiJ.F1m002 0.591 0.041 X 102827921 101871527 44.17434 plus TG TRUE
# unmapped probe strand_flipped
# FALSE CTGCCTTCAAAAGTGCTGGGATTAAAATGATGAGCGAGCAATGCCCAGCC FALSE
# FALSE CTGCCTTCAAAAGTGCTGGGATTAAAATGATGAGCGAGCAATGCCCAGCC FALSE
# FALSE CTGCCTTCAAAAGTGCTGGGATTAAAATGATGAGCGAGCAATGCCCAGCC FALSE
# FALSE CTGCCTTCAAAAGTGCTGGGATTAAAATGATGAGCGAGCAATGCCCAGCC FALSE
# FALSE CTGCCTTCAAAAGTGCTGGGATTAAAATGATGAGCGAGCAATGCCCAGCC FALSE
# FALSE CTGCCTTCAAAAGTGCTGGGATTAAAATGATGAGCGAGCAATGCCCAGCC FALSE
```
Then we flagged markers with high missingness across all samples, as well as samples with high missingness among all markers.
```{r Flagging "low-quality" markers and samples}
## Flagging markers and samples based on previous QC steps
flagged_XY_intensities <- long_XY_intensities %>%
dplyr::mutate(marker_flag = dplyr::if_else(condition = marker %in% above.cutoff$marker,
true = "FLAG",
false = "")) %>%
dplyr::mutate(high_missing_sample = dplyr::if_else(condition = sample %in% high.n.samples$sample,
true = "FLAG",
false = ""))
```
The first round of inferring predicted sexes used a rough search of the sample name for expected nomenclature convention, which includes a sex denotation.
```{r Preliminary sex prediction}
## First round of predicted sex inference
## Input: flagged XY intensities
prelim.predicted.sexes <- flagged_XY_intensities %>%
dplyr::mutate(bg = dplyr::case_when(stringr::str_detect(string = sample,
pattern = "F1") == TRUE ~ "F1",
TRUE ~ "unknown"),
predicted.sex = dplyr::case_when(stringr::str_detect(string = sample,
pattern = "F1f") == TRUE ~ "f",
stringr::str_detect(string = sample,
pattern = "F1m") == TRUE ~ "m",
TRUE ~ "unknown"))
## Output: flagged intensities with preliminary sex predictions and background assignments among F1 hybrids
# prelim.predicted.sexes[764:789,] %>%
# dplyr::select(marker, sample, marker_flag, high_missing_sample, predicted.sex)
# marker sample marker_flag high_missing_sample predicted.sex
# 764 XiE2 X.C57BL.6JxNOD.ShiLtJ.F1f0018 FLAG f
# 765 XiE2 X.C57BL.6JxNZO.HILtJ.F1f0016 FLAG f
# 766 XiE2 X.C57BL.6JxNZO.HILtJ.F1m15853 FLAG m
# 767 XiE2 X.C57BL.6JxPWK.PhJ.F1f002 FLAG f
# 768 XiE2 X.C57BL.6JxPWK.PhJ.F1m005 FLAG m
# 769 XiE2 X.C57BL.6JxPWK.PhJ.F1m005.1 FLAG m
# 770 XiE2 X.C57BL.6JxSJL.J.F1m35973 FLAG FLAG m
# 771 XiE2 X.C57BL.6JxWSB.EiJ.F1f0300 FLAG f
# 772 XiE2 X.C57BL.6JxWSB.EiJ.F1m15714 FLAG m
# 773 XiE2 X.CAST.EiJx129S1.SvImJ.F1f012 FLAG f
# 774 XiE2 X.CAST.EiJx129S1.SvImJ.F1m001 FLAG m
# 775 XiE2 X.CAST.EiJxA.J.F1f002 FLAG f
# 776 XiE2 X.CAST.EiJxA.J.F1f002.1 FLAG f
# 777 XiE2 X.CAST.EiJxA.J.F1m005 FLAG m
# 778 XiE2 X.CAST.EiJxC57BL.6J.F1m FLAG m
# 779 XiE2 X.CAST.EiJxC57BL.6J.F1m.1 FLAG m
# 780 XiE2 X.CAST.EiJxNOD.ShiLtJ.F1f007 FLAG f
# 781 XiE2 X.CAST.EiJxNOD.ShiLtJ.F1f007.1 FLAG f
# 782 XiE2 X.CAST.EiJxNZO.HILtJ.F1f FLAG f
# 783 XiE2 X.CAST.EiJxNZO.HILtJ.F1f.1 FLAG f
# 784 XiE2 X.CAST.EiJxNZO.HILtJ.F1m FLAG m
# 785 XiE2 X.CAST.EiJxNZO.HILtJ.F1m.1 FLAG m
# 786 XiE2 X.CAST.EiJxPWK.PhJ.F10123 FLAG unknown
# 787 XiE2 X.CAST.EiJxPWK.PhJ.F1f0163 FLAG f
# 788 XiE2 X.CAST.EiJxWSB.EiJ.F1f0113 FLAG f
# 789 XiE2 X.CAST.EiJxWSB.EiJ.F1m0096 FLAG m
## Filtering down to samples without preliminary sex predictions
unknown <- prelim.predicted.sexes %>%
dplyr::filter(predicted.sex == "unknown")
## Using regex searching of sample IDs to deduce the sex of each sample
###########################################################
## Key processes and expected outputs at each iteration: ##
###########################################################
#####################################################################
## 1) Extracting the a substring of X digits into the sample name. ##
#####################################################################
# mouse.id.X = stringr::str_sub(sample, -X)
# i.e.)
# unknown %>%
# dplyr::mutate(mouse.id.3 = stringr::str_sub(sample, -3)) %>%
# dplyr::select(sample, mouse.id.3) %>%
# head(10)
# sample mouse.id.3
# 1 X.CAST.EiJxPWK.PhJ.F10123 123
# 2 X017.FH.F1 .F1
# 3 X124S4.SvJaeJm39510 510
# 4 X129P1.ReJm35858 858
# 5 X129P2.OlaHsdm sdm
# 6 X129P2.OlaHsdm.1 m.1
# 7 X129P3.Jm37959 959
# 8 X129S1.SvImJf mJf
# 9 X129S1.SvImJf.1 f.1
# 10 X129S1.SvImJf0827 827
#####################################################################################
## 2) Assigning the predicted sex based on expected mouse nomenclature convention. ##
#####################################################################################
# predicted.sex.X = dplyr::case_when(stringr::str_sub(mouse.id.X, 1, 1) %in% c("m","M") ~ "m", stringr::str_sub(mouse.id.X, 1, 1) %in% c("f","F") ~ "f", TRUE ~ "unknown")
# i.e.)
# unknown %>%
# dplyr::mutate(mouse.id.5= stringr::str_sub(sample, -5),
# predicted.sex.5 =
# dplyr::case_when(stringr::str_sub(mouse.id.5, 1, 1) %in% c("m","M") ~ "m",
# stringr::str_sub(mouse.id.5, 1, 1) %in% c("f","F") ~ "f",
# ` TRUE ~ "unknown")) %>%
# dplyr::select(sample, mouse.id.5, predicted.sex.5) %>% head(14)
# sample mouse.id.5 predicted.sex.5
# 1 X.CAST.EiJxPWK.PhJ.F10123 10123 unknown
# 2 X017.FH.F1 FH.F1 f
# 3 X124S4.SvJaeJm39510 39510 unknown
# 4 X129P1.ReJm35858 35858 unknown
# 5 X129P2.OlaHsdm aHsdm unknown
# 6 X129P2.OlaHsdm.1 sdm.1 unknown
# 7 X129P3.Jm37959 37959 unknown
# 8 X129S1.SvImJf vImJf unknown
# 9 X129S1.SvImJf.1 mJf.1 m
# 10 X129S1.SvImJf0827 f0827 f
# 11 X129S1.SvImJf0827.1 827.1 unknown
# 12 X129S1.SvImJm vImJm unknown
# 13 X129S1.SvImJm.1 mJm.1 m
# 14 X129S1.SvImJm1314 m1314 m
##############################################################################################
# 3) Inferring the strain background by removing the mouse id from the sample name when a sex is predicted. In certain cases, symbols had to be extracted prior to sex and background inference.
##############################################################################################
# bg = if_else(condition = (predicted.sex.X == "m" | predicted.sex.X == "f"),
# true = str_replace(string = bg,
# pattern = mouse.id.X,
# replacement = ""),
# false = bg)
# i.e.)
# unknown %>%
# dplyr::mutate(mouse.id.5 = stringr::str_sub(sample, -5),
# mouse.id.5 = stringr::str_replace(string = mouse.id.5,
# pattern = "[:symbol:]",
# replacement = ""),
# predicted.sex.5 = dplyr::case_when(stringr::str_sub(mouse.id.5, 1, 1) %in% c("m","M") ~ "m",
# stringr::str_sub(mouse.id.5, 1, 1) %in% c("f","F") ~ "f",
# TRUE ~ "unknown"),
# bg = if_else(condition = (predicted.sex.5 == "m" | predicted.sex.5 == "f"),
# true = str_replace(string = bg,
# pattern = mouse.id.5,
# replacement = ""),
# false = bg)) %>%
# dplyr::select(sample, mouse.id.5, predicted.sex.5, bg) %>% head(14)
# sample mouse.id.5 predicted.sex.5 bg
# 1 X.CAST.EiJxPWK.PhJ.F10123 10123 unknown F1
# 2 X017.FH.F1 FH.F1 f F1
# 3 X124S4.SvJaeJm39510 39510 unknown X124S4.SvJaeJm39510
# 4 X129P1.ReJm35858 35858 unknown X129P1.ReJm35858
# 5 X129P2.OlaHsdm aHsdm unknown X129P2.OlaHsdm
# 6 X129P2.OlaHsdm.1 sdm.1 unknown X129P2.OlaHsdm.1
# 7 X129P3.Jm37959 37959 unknown X129P3.Jm37959
# 8 X129S1.SvImJf vImJf unknown X129S1.SvImJf
# 9 X129S1.SvImJf.1 mJf.1 m X129S1.SvI
# 10 X129S1.SvImJf0827 f0827 f X129S1.SvImJ
# 11 X129S1.SvImJf0827.1 827.1 unknown X129S1.SvImJf0827.1
# 12 X129S1.SvImJm vImJm unknown X129S1.SvImJm
# 13 X129S1.SvImJm.1 mJm.1 m X129S1.SvI
# 14 X129S1.SvImJm1314 m1314 m X129S1.SvImJ
digit.trim <- unknown %>%
# One character
dplyr::mutate(mouse.id.1 = stringr::str_sub(sample, -1),
predicted.sex.1 = dplyr::case_when(stringr::str_sub(mouse.id.1, 1, 1) %in% c("m","M") ~ "m",
stringr::str_sub(mouse.id.1, 1, 1) %in% c("f","F") ~ "f",
TRUE ~ "unknown"),
# Three characters
mouse.id.3= stringr::str_sub(sample, -3),
predicted.sex.3 = dplyr::case_when(stringr::str_sub(mouse.id.3, 1, 1) %in% c("m","M") ~ "m",
stringr::str_sub(mouse.id.3, 1, 1) %in% c("f","F") ~ "f",
TRUE ~ "unknown"),
# Four characters
mouse.id.4 = stringr::str_sub(sample, -4),
predicted.sex.4 = dplyr::case_when(stringr::str_sub(mouse.id.4, 1, 1) %in% c("m","M") ~ "m",
stringr::str_sub(mouse.id.4, 1, 1) %in% c("f","F") ~ "f",
TRUE ~ "unknown"),
# Five characters
mouse.id.5 = stringr::str_sub(sample, -5),
mouse.id.5 = stringr::str_replace(string = mouse.id.5, ## a couple symbols in these ids mess up the regex search
pattern = "[:symbol:]",
replacement = ""),
predicted.sex.5 = dplyr::case_when(stringr::str_sub(mouse.id.5, 1, 1) %in% c("m","M") ~ "m",
stringr::str_sub(mouse.id.5, 1, 1) %in% c("f","F") ~ "f",
TRUE ~ "unknown"),
# Six characters
mouse.id.6 = stringr::str_sub(sample, -6),
mouse.id.6 = stringr::str_replace(string = mouse.id.6, ## a couple symbols in these ids mess up the regex search
pattern = "[:punct:]",
replacement = ""),
mouse.id.6 = stringr::str_replace(string = mouse.id.6, ## a couple symbols in these ids mess up the regex search
pattern = "[:symbol:]",
replacement = ""),
predicted.sex.6 = dplyr::case_when(stringr::str_sub(mouse.id.6, 1, 1) %in% c("m","M") ~ "m",
stringr::str_sub(mouse.id.6, 1, 1) %in% c("f","F") ~ "f",
TRUE ~ "unknown"),
# Seven characters
mouse.id.7 = stringr::str_sub(sample, -7),
mouse.id.7 = stringr::str_replace(string = mouse.id.7, ## a couple symbols in these ids mess up the regex search
pattern = "[:punct:]",
replacement = ""),
mouse.id.7 = stringr::str_replace(string = mouse.id.7, ## a couple symbols in these ids mess up the regex search
pattern = "[:symbol:]",
replacement = ""),
predicted.sex.7 = dplyr::case_when(stringr::str_sub(mouse.id.7, 1, 1) %in% c("m","M") ~ "m",
stringr::str_sub(mouse.id.7, 1, 1) %in% c("f","F") ~ "f",
TRUE ~ "unknown"),
# Eight characters
mouse.id.8 = stringr::str_sub(sample, -8),
mouse.id.8 = stringr::str_replace(string = mouse.id.8, ## a couple symbols in these ids mess up the regex search
pattern = "[:punct:]",
replacement = ""),
mouse.id.8 = stringr::str_replace(string = mouse.id.8, ## a couple symbols in these ids mess up the regex search
pattern = "[:symbol:]",
replacement = ""),
predicted.sex.8 = dplyr::case_when(stringr::str_sub(mouse.id.8, 1, 1) %in% c("m","M") ~ "m",
stringr::str_sub(mouse.id.8, 1, 1) %in% c("f","F") ~ "f",
TRUE ~ "unknown")) %>%
dplyr::mutate(predicted.sex = dplyr::case_when(predicted.sex.1 == "m" ~ "m",
predicted.sex.3 == "m" ~ "m",
predicted.sex.4 == "m" ~ "m",
predicted.sex.5 == "m" ~ "m",
predicted.sex.6 == "m" ~ "m",
predicted.sex.7 == "m" ~ "m",
predicted.sex.8 == "m" ~ "m",
predicted.sex.1 == "f" ~ "f",
predicted.sex.3 == "f" ~ "f",
predicted.sex.4 == "f" ~ "f",
predicted.sex.5 == "f" ~ "f",
predicted.sex.6 == "f" ~ "f",
predicted.sex.7 == "f" ~ "f",
predicted.sex.8 == "f" ~ "f",
TRUE ~ "unknown"))
# Removing previously "unknown" samples from initial results and binding newly inferred samples
predicted.sexes.strings <- prelim.predicted.sexes %>%
dplyr::filter(predicted.sex != "unknown") %>%
dplyr::bind_rows(.,digit.trim)
## Taking the first marker as a sample and tabulating the number of samples for each predicted sex
predicted.sex.table <- predicted.sexes.strings %>%
dplyr::filter(marker %in% unique(prelim.predicted.sexes$marker)[1]) %>%
dplyr::select(sample, predicted.sex, bg) %>%
dplyr::group_by(predicted.sex) %>%
dplyr::count()
```
This captured `r predicted.sex.table[which(predicted.sex.table$predicted.sex == "f"),2]$n` female samples, `r predicted.sex.table[which(predicted.sex.table$predicted.sex == "m"),2]$n` male samples, leaving `r predicted.sex.table[which(predicted.sex.table$predicted.sex == "unknown"),2]$n` samples of unknown predicted sex from nomenclature alone.
```{r Predicted sex table}
# Table of samples for which sex could not be predicted from sample name alone. Using one marker is fine as an example as the sample info for each marker is identical.
predicted.sexes.strings %>%
dplyr::filter(predicted.sex == "unknown",
marker == predicted.sexes.strings$marker[[1]]) %>%
dplyr::select(sample, bg)
```
After predicting the sexes of the vast majority of reference samples, we visualized the average probe intensity among X Chromosome markers for each sample, labeling them by predicted sex. Samples colored black were unabled to have their sex inferred by the sample name, but cluster well with mice for which sex could be inferred. Conversely, some samples' predicted sex is discordant with X and Y Chromosome marker intensities (i.e. blue samples that cluster with mostly orange samples, and vice versa). Mouse over individual dots to view the sample, as well as whether it was flagged for having many markers with missing genotype information. In many cases, pulling substrings of sample names as the sex of the sample was too sensitive and misclassified samples.
```{r Predicted sex visualization}
# Input: Sex chromosome probe intensities for each marker with 1) marker metdata, 2) marker and sample flags, 3) background and sex predictions
Xchr.int <- predicted.sexes.strings %>%
dplyr::ungroup() %>%
dplyr::filter(marker_flag != "FLAG",
chr == "X") %>%
dplyr::mutate(x.chr.int = x_int + y_int) %>%
dplyr::group_by(sample, predicted.sex, high_missing_sample) %>%
dplyr::summarise(mean.x.chr.int = mean(x.chr.int))
# Expected output: Sample-averaged summed x- and y-channel probe intensities for all chromosome X markers. Note: replicated sample information collapses at this step. This is tolerable under the assumption that the samples with identical names are in fact duplicates of the same individual.
# sample predicted.sex high_missing_sample mean.x.chr.int
# <chr> <chr> <chr> <dbl>
# 1 A.Jf f "" 1.06
# 2 A.Jf0374 f "" 1.03
# 3 A.Jf0374.1 f "" 0.974
# 4 A.Jf0374.2 f "" 1.01
# 5 A.Jm0111 m "" 0.799
# 6 A.Jm0417 m "" 0.786
Ychr.int <- predicted.sexes.strings %>%
dplyr::ungroup() %>%
dplyr::filter(marker_flag != "FLAG",
chr == "Y") %>%
dplyr::group_by(sample, predicted.sex, high_missing_sample) %>%
dplyr::summarise(mean.y.int = mean(y_int))
# Expected output: Sample-averaged y-channel probe intensities for all chromosome Y markers. Note: replicated sample information collapses at this step. This is tolerable under the assumption that the samples with identical names are in fact duplicates of the same individual.
# sample predicted.sex high_missing_sample mean.y.int
# <chr> <chr> <chr> <dbl>
# 1 A.Jf f "" 0.046
# 2 A.Jf0374 f "" 0.0391
# 3 A.Jf0374.1 f "" 0.0571
# 4 A.Jf0374.2 f "" 0.0526
# 5 A.Jm0111 m "" 0.395
# 6 A.Jm0417 m "" 0.412
# Column binding the two intensities if the sample information matches
if(unique(Xchr.int$sample == Ychr.int$sample) == TRUE){
sex.chr.intensities <- cbind(Xchr.int,Ychr.int$mean.y.int)
colnames(sex.chr.intensities) <- c("sample","predicted.sex","bad_sample","sumxy_int","y_int")
}
# Interactive visualization of sex prediction results. Sample are colored according to predicted sex. "Unknown" samples are plotted black, and flagged/bad samples are triangles.
predicted.sex.plot.palettes <- sex.chr.intensities %>%
dplyr::ungroup() %>%
dplyr::distinct(sample, predicted.sex, bad_sample) %>%
dplyr::mutate(predicted.sex.palette = dplyr::case_when(predicted.sex == "f" ~ "#5856b7",
predicted.sex == "m" ~ "#eeb868",
predicted.sex == "unknown" ~ "black"))
predicted.sex.palette <- predicted.sex.plot.palettes$predicted.sex.palette
names(predicted.sex.palette) <- predicted.sex.plot.palettes$predicted.sex
mean.x.intensities.by.sex.plot <- ggplot(sex.chr.intensities,
mapping = aes(x = sumxy_int,
y = y_int,
colour = predicted.sex,
shape = bad_sample,
text = sample,
label = bad_sample)) +
geom_point() +
scale_colour_manual(values = predicted.sex.palette) +
# facet_grid(.~chr) +
QCtheme
ggplotly(mean.x.intensities.by.sex.plot,
tooltip = c("text","label"))
```
Because the split between inferred sexes of samples was so distinct, we used k-means clustering to quickly match the clusters to sexed samples and assign or re-assign sexes to samples with unknown or apparently incorrect sex information, respectively. Samples highlighted above were also re-evaluated using strain-specific marker information.