-
Notifications
You must be signed in to change notification settings - Fork 1
/
200_fwsw_notebook.Rmd
11874 lines (9584 loc) · 519 KB
/
200_fwsw_notebook.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "FWSW lab Notebook"
author: "Sarah P. Flanagan"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output: html_notebook
editor_options:
chunk_output_type: console
---
```{r setup}
# Evaluate all chunks relative to the results directory. root.dir is a
# package (opts_knit) option; fig.pos is a *chunk* option controlling
# LaTeX figure placement, so it must be set via opts_chunk$set — setting
# it through opts_knit$set (as before) had no effect.
knitr::opts_knit$set(root.dir='../fwsw_results/')
knitr::opts_chunk$set(fig.pos='H')
```
```{r source}
# Load the gwscaR helper scripts (population-genomics utilities) and the
# dadi analysis code for this project, then attach knitr for kable().
gwscar_scripts <- c("gwscaR.R", "gwscaR_plot.R", "gwscaR_utility.R",
                    "gwscaR_fsts.R", "gwscaR_popgen.R", "vcf2dadi.R")
invisible(lapply(file.path("../../gwscaR/R", gwscar_scripts), source))
source("../R/250_dadi_analysis.R")
library(knitr)
# Population codes used throughout: state prefix (AL/FL/LA/TX) plus
# habitat suffix (FW = freshwater, ST/CC/LG = saltwater sites).
pop.list <- c("ALFW", "ALST", "FLCC", "FLLG", "LAFW", "TXCC", "TXFW")
```
# 30 October 2020
A few runs are still going but I'm moving the ones that are done to move forward a bit. Here is what is remaining on abacus:
- ALFW_ALST_SC2mG 1-10 (9 is still running)
- LAFW_ALST_SC2mG 11-21 (15 is still running)
- LAFW_ALST_SC2NG 11-21 (17 is still running)
- FLLG_FLFW_IM2m 11-20 (20 is still running)
- FLLG_FLFW_IM2mG 11-20 (20 is still running)
- FLLG_FLFW_IM2NG 11-20 (16 is still running)
- TXFW_TXCC_IM2mG 1-20 (18 is still running)
- TXFW_TXCC_IM2NG 1-20 (12 is still running)
But for the TX ones I scp'ed everything anyway so that I'll have at least reps 1-10.
Now I've updated dadi doc. Intriguingly, the AL and LA pops are best modeled by SC (with 2mG or 2N2mG). TX and FL best are both IM2m.
# 3 September 2020
A number of models had finished running so I moved them to C001KR and re-analyzed the results. I've found that ALFW-ALST simple comparisons for both IM and SC have dAIC <10, so I'm going to run the SC complex models as well (#1-20).
IM is the best for the TX populations so I'll start those (#1-20).
# 27 August 2020
For LAFW_ALFW, FLLG_FLFW, and ALFW_ALST, it took 2-3 days to run 1 round of each of the more complex models, so I'll start rounds 2-11 now.
# 25 August 2020
The TX simple models were finished running but I realized that some of them had not ever completed (didn't have all the way through 40 reps) and some of them hadn't run. So I'm re-running some of each of those models on abacus. The one extra LA run that I started yesterday is complete, so after these runs from today are finished I'll have 20 reps of AM, IM, SC, and SI for each pairwise comparison. From there I'll choose which complex models to run for TX.
The complex runs from yesterday are still running but are almost done.
# 24 August 2020
The AL and LA comparisons are done running (just that SC, SI, AM, and IM models) for all 20 reps. Taking a look at the updated 250_fwsw_dadi doc, I first observed that one of the LA AM runs was apparently incomplete -- it's run #1, and I'm going to re-start that on abacus.
Other things to note:
- the FLFW-FLCC simple models are all quite similar to each other, but the best one is the IM model -- it has both the highest median log likelihood and the highest log likelihood recorded.
- IM is also the best of the four simple models for ALFW-ALST.
- SC is the best of the four simple models for LAFW-ALST.
- TX is still running but it looks like IM will be the winner there.
So now I need to run the more complex models. I've started one run of each of the complex models for the FL, AL, and LA combinations mainly to get a sense for how long this might take. I've got lots of time -- the deadline for resubmission has been pushed to 30 Nov (of course I would prefer to be done sooner than that).
# 13 August 2020
Rounds 1-10 are done for the simple models (SI, SC, AM, and IM) for all four comparisons, and I've copied them to C001KR from abacus. I deleted the FLLG_FLCC ones but not the others.
I started rounds 10-21 for the simple models for the four pairwise comparisons on abacus.
The runs on rccuser are still going (which was all models for all comparisons).
Re-knit the document and it's looking like there was clearly migration in all population pairs, but the SI model is pretty high up there for the FL comparisons (not the best though).
# 3 August 2020
I installed stacks v1.40 on C001KR and copied Stacks data to BigData/ so that I can re-filter data for dadi.
Then ran populations with the following command:
```
populations -b 2 -P ./stacks -M ./stacks/fwsw_sub_strat.txt -t 5 --min_maf 0 --vcf
```
I also downloaded all the results from abacus and rccuser, stopped the runs on rccuser, and archived all of the files (saved in tar.gz and deleted the un-zipped ones).
I downloaded the code from Rougemont for the folded models.
TODO: update code to make it a bit easier to run (maybe)
Once populations is done, re-filter for dadi.
# 30 June 2020
I've done a final update of the dadi runs, scp-ing all of the ones from abacus and rccuser so that I've got as complete a set as possible.
# 29 June 2020
I moved some sets of results from rccuser and abacus even though they aren't totally finished, mainly because I want to check in on the dadi results.
I also realized that maybe I should be using the very final round/replicate from each run, and that maybe that will solve some of my issues with the dadi plots...this ended up changing some of the results (FLFW and FLCC are now "SI" model!) and we'll see what the output looks like in the end.
UGH No, now it's FLFW_ALST and FLFW_TXCC with problems (rather than FLFW_TXFW and FLFW_TXCC). but the model running on C001KR is done! So I can install updates and restart, FINALLY.
Ok, so I'm an idiot, and I hadn't actually been using the median model for any other than FLFW_FLCC. So I'll do that and go back to not using the final replicate and see how that goes. - that solved the problem!
# 25 June 2020
FLCC_ALFW #13 finally done on abacus, so I moved it.
The permutations are also finished so I can look into that.
```{r}
# Load the saved permutation Fsts (one data frame per comparison) and
# keep only the rows significant after multiple-testing correction
# (adjusted P < 0.05).
perms <- readRDS("permuted_fsts_22062020.RDS")
sigPerm <- lapply(perms, function(d) d[which(d$adjP < 0.05), ])
```
I think one of the issues with the permutations is that there are many/several loci with Fst=0 with the correct labels but when the labels are permuted they end up with Fst >0 all the time, so those ones end up being outliers. I could re-vamp the permutations again to only go for loci with actual Fst > the distribution of permutations, but I'm not sure it's worth the effort.
Are any other loci in the dataset in that 6-bp region? Yes, >200.
# 24 June 2020
One of the papers I read says that LRRCC1 is tightly linked to CA2 in zebrafish, and CA2 is one of the putative freshwater genes
```{r}
# Check whether LRRCC1 and CA2 (a putative freshwater gene) are near each
# other on LG6, using the no-Florida SNP info and the genome annotation.
fw_SNPinfo<-readRDS("fw_SNPinfo_noFL.RDS")
# Outlier SNP IDs per analysis: top 1% XtX / salinity Bayes factors,
# permutation outliers shared by TX+AL+LA, pcadapt q < 0.01, and
# per-state (plus shared) Stacks Fst outliers at P < 0.05.
outliers<-list(xtx=fw_SNPinfo$ID[which(fw_SNPinfo$XtX_noFL >= quantile(fw_SNPinfo$XtX_noFL,0.99,na.rm = TRUE))],
salBF=fw_SNPinfo$ID[which(fw_SNPinfo$logSalBF_noFL>=
quantile(fw_SNPinfo$logSalBF_noFL,0.99,na.rm = TRUE))],
permutations=fw_SNPinfo$ID[
rowSums(fw_SNPinfo[,c("perm_TX","perm_AL","perm_LA")])==3],
pcadapt=fw_SNPinfo$ID[which(fw_SNPinfo$pcadaptQ<0.01)],
Alabama=fw_SNPinfo$ID[which(fw_SNPinfo$stacks_AL_P < 0.05)],
Louisiana=fw_SNPinfo$ID[which(fw_SNPinfo$stacks_LA_P < 0.05)],
Texas=fw_SNPinfo$ID[which(fw_SNPinfo$stacks_TX_P < 0.05)],
Florida=fw_SNPinfo$ID[which(fw_SNPinfo$stacks_FL_P < 0.05)],
sharedStacks=fw_SNPinfo$ID[which(fw_SNPinfo$stacks_AL_P < 0.05 &
fw_SNPinfo$stacks_LA_P < 0.05 &
fw_SNPinfo$stacks_TX_P < 0.05)])
# LG6 SNPs that are shared-Stacks or salinity-BF outliers.
lg6<-fw_SNPinfo[fw_SNPinfo$Chrom=="LG6",]
outdat<-data.frame(rbind(lg6[which(lg6$ID %in% outliers$sharedStacks),],
lg6[which(lg6$ID %in% outliers$salBF),]),
stringsAsFactors = FALSE)
# Load the genome annotation (gzipped or plain gff) and subset to LG6.
gff.name<-"ssc_2016_12_20_chromlevel.gff.gz"
if(length(grep("gz",gff.name))>0){
gff<-read.delim(gzfile(paste("../../scovelli_genome/",gff.name,sep="")),header=F)
} else{
gff<-read.delim(paste("../../scovelli_genome/",gff.name,sep=""),header=F)
}
colnames(gff)<-c("seqname","source","feature","start","end","score",
"strand","frame","attribute")
gff6<-gff[gff$seqname=="LG6",]
# Pull the gff records for the carbonic anhydrase (CA) gene IDs listed in
# the putative freshwater genes table, then restrict to LG6.
put_genes<-read.delim("putative_genes.txt",stringsAsFactors = FALSE)
cas<-unlist(strsplit(put_genes$Scovelli_geneID[put_genes$Gene=="CA"],","))
cas_gff<-gff[unlist(lapply(cas,grep,x=gff$attribute)),]
cas_gff6<-cas_gff[cas_gff$seqname == "LG6",]
```
Ok, they're not near each other here, even if they're on the same LG -- they're `r min(cas_gff6$start)-778986` (6448813) bp apart.
What about other genes nearby?
```{r}
# Find genes whose start falls within 100 kb of the focal SNP position
# (778986 bp on LG6), strip the gff attribute down to the gene ID, and
# look those IDs up in the genome-wide blast annotation.
focal_bp <- 778986
window <- 100000
in_window <- gff6$feature == "gene" &
  gff6$start > (focal_bp - window) &
  gff6$start < (focal_bp + window)
nearby_genes <- gsub("ID=(.*);Name.*", "\\1", gff6$attribute[in_window])
genome.blast <- read.csv("../../scovelli_genome/ssc_2016_12_20_cds_nr_blast_results.csv",
                         skip = 1, header = TRUE) # saved as a csv
genome.blast[genome.blast$sscv4_gene_ID %in% nearby_genes, ]
```
None of those are putative fw genes, so I'll just move on.
In the supplement there are inconsistencies around how many SNPs are outliers in the stacks analysis, so i need to fix that.
# 23 June 2020
The permutations are still running this morning but hopefully they'll be done by the afternoon so I can dig into it. They seem to have finished, whoot...but I screwed up and they didn't save properly. facepalm.
```{r}
# Reload the permutation Fsts after the re-run (the first attempt on
# 23 June did not save properly).
perms<-readRDS("permuted_fsts_22062020.RDS")
```
So I'll start it again and look at them tomorrow.
Right now I want to get to the bottom of the PCAdapt weirdness. Ok, so I've discovered that with min.maf=0.05, only 781 SNPs pass PCAdapt's pruning. With min.maf=0.01 only 2358 SNPs pass the pruning. With min.maf=0, 12094 pass. With an alpha level of 0.01, there are still 3330 outliers, and most of these are ones with really small minor allele frequencies (median=0.014). But many of these have negative z-scores -- perhaps I can focus my attention only on those with positive z-scores. No, that probably won't work. What I can do is restrict attention to those loci with statistics in the top 99% quantile and then only keep those with qvalues <= alpha (0.01)
This sort of worked (code below for posterity) but I decided to revert to the old way with lots being NA because otherwise the overall analysis seems a bit whack.
```{r}
# Kept for posterity (not used downstream): restrict pcadapt outliers to
# loci whose test statistic is in the top 1% before keeping q < alpha.
# NOTE(review): statOut holds row indices (from which()), while
# `outliers` elsewhere in this notebook holds SNP IDs -- confirm both
# are on the same scale before reusing this snippet.
statOut<-which(res$stat >= quantile(res$stat,0.99,na.rm=TRUE))
outliers<-statOut[statOut %in% outliers]
```
# 22 June 2020
A few things:
1. I see what Adam was saying about the permutation outliers being small Fst values -- it's on the Manhattan plot. Perhaps there needs to be a different way of identifying these as outliers.
2. PCAdapt doesn't share many of the same outliers because those loci are removed by the minor allele frequency cutoff. If I remove this cutoff, then thousands of loci have really small q-values, which also doesn't seem quite right.
3. I need to look into the set of outliers that shows up on LG6.
Working on 2:
I'll try a min.maf=0.01 and see if that helps include more outliers without creating chaos.
Working on 3:
I need to identify which outliers are coming up on LG6, how far apart they are, and whether they're annotated. Then I could visualize it somehow -- some packages exist, but they seem to be bioconductor related. This package might be useful: https://bioconductor.org/packages/release/bioc/vignettes/genomation/inst/doc/GenomationManual.html
```{r}
# Build per-analysis outlier SNP ID lists from the combined SNP info:
# top 1% XtX and salinity Bayes factors, permutation outliers shared by
# TX+AL+LA, pcadapt q < 0.01, and per-state / shared Stacks Fst outliers.
# NOTE(review): xtx and salBF use bare logical indexing, so NA statistics
# propagate NA entries into the ID vectors (unlike the which()-wrapped
# entries below) -- confirm that is intended.
fw_SNPinfo<-readRDS("fw_SNPinfo.RDS")
outliers<-list(xtx=fw_SNPinfo$ID[fw_SNPinfo$XtX >= quantile(fw_SNPinfo$XtX,0.99,na.rm = TRUE)],
salBF=fw_SNPinfo$ID[fw_SNPinfo$logSalBF>=
quantile(fw_SNPinfo$logSalBF,0.99,na.rm = TRUE)],
permutations=fw_SNPinfo$ID[
rowSums(fw_SNPinfo[,c("perm_TX","perm_AL","perm_LA")])==3],
pcadapt=fw_SNPinfo$ID[which(fw_SNPinfo$pcadaptQ<0.01)],
Alabama=fw_SNPinfo$ID[which(fw_SNPinfo$stacks_AL_P < 0.05)],
Louisiana=fw_SNPinfo$ID[which(fw_SNPinfo$stacks_LA_P < 0.05)],
Texas=fw_SNPinfo$ID[which(fw_SNPinfo$stacks_TX_P < 0.05)],
Florida=fw_SNPinfo$ID[which(fw_SNPinfo$stacks_FL_P < 0.05)],
sharedStacks=fw_SNPinfo$ID[which(fw_SNPinfo$stacks_AL_P < 0.05 &
fw_SNPinfo$stacks_LA_P < 0.05 &
fw_SNPinfo$stacks_TX_P < 0.05)])
```
There are `r nrow(fw_SNPinfo[fw_SNPinfo$Chrom=="LG6",])` SNPs on LG6 and `r nrow(fw_SNPinfo[fw_SNPinfo$ID %in% unlist(outliers) & fw_SNPinfo$Chrom=="LG6",])` are outliers in one or more analyses.
```{r}
# Plot average Fst along LG6 and overlay outliers from each analysis.
# Semi-transparent colors, one per analysis type.
cols<-c(perm=alpha('#e41a1c',0.75),sal=alpha('#377eb8',0.75),pc=alpha('#a65628',0.75),
stacks=alpha('#f781bf',0.75),xtx=alpha('#ff7f00',0.75))
lg6<-fw_SNPinfo[fw_SNPinfo$Chrom=="LG6",]
# Average Fst across the three non-Florida pairwise Stacks comparisons.
lg6$avgFst<-rowMeans(lg6[,c("stacks_AL","stacks_LA","stacks_TX")],na.rm = TRUE)
plot_dat<-fst.plot(lg6,scaffs.to.plot = "LG6",fst.name = "avgFst",
chrom.name = "Chrom",bp.name = "Pos",axis.size = 1,pch=19,pt.cex = 1.5)
# Black: SNPs mapped to an annotated gene (have an SSC gene ID).
points(plot_dat$plot.pos[grep("SSC",plot_dat$SSCID)],
plot_dat$avgFst[grep("SSC",plot_dat$SSCID)],
col="black",cex=2,pch=19,lwd=2)
# Red: SNPs located in UTRs.
points(plot_dat$plot.pos[grep("UTR",plot_dat$region)],
plot_dat$avgFst[grep("UTR",plot_dat$region)],
col="red",cex=2,pch=19,lwd=2)
# Stars: Stacks Fst outliers shared by AL, LA, and TX.
points(plot_dat$plot.pos[which(plot_dat$stacks_AL_P < 0.05 &
plot_dat$stacks_LA_P < 0.05 &
plot_dat$stacks_TX_P < 0.05 )],
plot_dat$avgFst[which(plot_dat$stacks_AL_P < 0.05 &
plot_dat$stacks_LA_P < 0.05 &
plot_dat$stacks_TX_P < 0.05)],
col=cols["stacks"],cex=2,pch=8,lwd=3)
# Triangles: salinity Bayes factor outliers (top 1% within this subset).
points(plot_dat$plot.pos[plot_dat$logSalBF>=quantile(plot_dat$logSalBF,0.99)],
plot_dat$avgFst[plot_dat$logSalBF>=quantile(plot_dat$logSalBF,0.99)],
col=cols["sal"],cex=2,pch=2,lwd=2)
```
Ok, that's a start for visualizing. But I wonder if I can show the genomic information below too
```{r}
# Load the S. scovelli genome annotation (gzipped or plain gff), name the
# standard nine gff columns, and subset to LG6. Uses grepl()/paste0()
# instead of the non-idiomatic length(grep(...))>0 / paste(sep="").
gff.name<-"ssc_2016_12_20_chromlevel.gff.gz"
gff.path <- paste0("../../scovelli_genome/", gff.name)
if (grepl("gz", gff.name, fixed = TRUE)) {
  gff <- read.delim(gzfile(gff.path), header = FALSE)
} else {
  gff <- read.delim(gff.path, header = FALSE)
}
colnames(gff)<-c("seqname","source","feature","start","end","score",
"strand","frame","attribute")
gff6<-gff[gff$seqname=="LG6",]
```
I should identify what the genes are in these regions.
```{r}
# Inspect gene descriptions for the shared-Stacks and salinity-BF
# outliers on LG6, then attach short gene-name abbreviations by hand.
lg6$description[which(lg6$stacks_AL_P < 0.05 &
lg6$stacks_LA_P < 0.05 &
lg6$stacks_TX_P < 0.05 )]
lg6$description[lg6$logSalBF>=quantile(lg6$logSalBF,0.99,na.rm = TRUE) &
!is.na(lg6$description)]
lg6$abbr<-NA
# NOTE(review): these abbreviations are assigned positionally from the
# printed descriptions above -- if the data or filters change, the
# pairing silently breaks; re-check against the descriptions.
lg6$abbr[which(lg6$stacks_AL_P < 0.05 &
lg6$stacks_LA_P < 0.05 &
lg6$stacks_TX_P < 0.05 )]<-c("WRNIP1",
"uncharacterised",NA,
"PTPRF6",
"LRRCC1",NA)
lg6$abbr[lg6$logSalBF>=quantile(lg6$logSalBF,0.99,na.rm = TRUE) &
!is.na(lg6$description)]<-c("RP1","WRNIP1",NA,"LRRCC1")
```
```{r}
# Draw a simple gene-model track for LG6: grey rectangles are annotated
# genes on a horizontal backbone, vertical lines mark outlier SNPs
# (solid = shared Stacks, dashed = salinity BF), with gene abbreviations
# as rotated labels above the track.
plot(x=c(min(gff6$start),max(gff6$end)),
y=c(0,1),type='n', axes=FALSE,
xlab="",ylab="")#,xlim=c(200,2000000)
abline(h=0.5)
rect(gff6$start[gff6$feature=="gene"],0.25,
gff6$end[gff6$feature=="gene"],0.75,col = "grey",border="grey")
stacksout<-lg6$Pos[which(lg6$stacks_AL_P < 0.05 &
lg6$stacks_LA_P < 0.05 &
lg6$stacks_TX_P < 0.05 )]
abline(v=stacksout,col=cols["stacks"],lwd=2)
abline(v=lg6$Pos[lg6$logSalBF>=quantile(lg6$logSalBF,0.99,na.rm = TRUE)],
col=cols["sal"],lty=2,lwd=2)
text(x =lg6$Pos[!is.na(lg6$abbr)],y=rep(1.1,nrow(lg6[!is.na(lg6$abbr),])),
lg6$abbr[!is.na(lg6$abbr)],xpd=TRUE,srt=90, pos=3)
```
```{r}
# Same gene-model track as above, but zoomed to the first ~2 Mb of LG6
# (xlim) where the outlier cluster sits.
plot(x=c(min(gff6$start),max(gff6$end)),
y=c(0,1),type='n', axes=FALSE,
xlab="",ylab="",xlim=c(200,2000000))
abline(h=0.5)
rect(gff6$start[gff6$feature=="gene"],0.25,
gff6$end[gff6$feature=="gene"],0.75,col = "grey",border="grey")
stacksout<-lg6$Pos[which(lg6$stacks_AL_P < 0.05 &
lg6$stacks_LA_P < 0.05 &
lg6$stacks_TX_P < 0.05 )]
abline(v=stacksout,col=cols["stacks"],lwd=2)
abline(v=lg6$Pos[lg6$logSalBF>=quantile(lg6$logSalBF,0.99,na.rm = TRUE)],
col=cols["sal"],lty=2,lwd=2)
text(x =lg6$Pos[!is.na(lg6$abbr)],y=rep(1.1,nrow(lg6[!is.na(lg6$abbr),])),
lg6$abbr[!is.na(lg6$abbr)],xpd=TRUE,srt=90, pos=3)
```
Are the ones in gene regions in UTRs or in coding regions?
```{r}
# Check whether the outlier SNPs fall in UTRs or coding regions, for the
# shared-Stacks outliers and the annotated salinity-BF outliers.
lg6$region[which(lg6$stacks_AL_P < 0.05 &
lg6$stacks_LA_P < 0.05 &
lg6$stacks_TX_P < 0.05 )]
lg6$region[lg6$logSalBF>=quantile(lg6$logSalBF,0.99,na.rm = TRUE) &
!is.na(lg6$description)]
```
They are all in coding regions. Weirdly, one of the SNPs is apparently in a gene if we look at the gff but it's got NAs instead. I'll fix that and then re-plot the above and create a table.
```{r}
# Combine the shared-Stacks and salinity-BF outliers on LG6 into one
# de-duplicated table, patch the SNP with missing annotation by hand,
# and redraw the zoomed gene track with the corrected labels.
outdat<-data.frame(rbind(lg6[which(lg6$stacks_AL_P < 0.05&
lg6$stacks_LA_P < 0.05 &
lg6$stacks_TX_P < 0.05 ),],
lg6[lg6$logSalBF>=quantile(lg6$logSalBF,0.99,na.rm = TRUE) &
!is.na(lg6$description),]),stringsAsFactors = FALSE)
outdat<-unique(outdat)
genome.blast<-read.csv("../../scovelli_genome/ssc_2016_12_20_cds_nr_blast_results.csv",
skip=1,header=T)#I saved it as a csv
# Manual fix: SNP 9871 sits in gene SSCG00000016415 (GPC5) per the gff,
# but its annotation columns were NA; fill them from the blast table.
outdat$SSCID[outdat$ID==9871]<-"SSCG00000016415"
outdat$description[outdat$ID==9871]<-genome.blast$blastp_hit_description[genome.blast$sscv4_gene_ID=="SSCG00000016415"]
outdat$abbr[outdat$ID==9871]<-"GPC5"
# replot
plot(x=c(min(gff6$start),max(gff6$end)),
y=c(0,1),type='n', axes=FALSE,
xlab="",ylab="",xlim=c(200,2000000))
abline(h=0.5)
rect(gff6$start[gff6$feature=="gene"],0.25,
gff6$end[gff6$feature=="gene"],0.75,col = "grey",border="grey")
stacksout<-lg6$Pos[which(lg6$stacks_AL_P < 0.05 &
lg6$stacks_LA_P < 0.05 &
lg6$stacks_TX_P < 0.05 )]
abline(v=stacksout,col=cols["stacks"],lwd=2)
abline(v=lg6$Pos[lg6$logSalBF>=quantile(lg6$logSalBF,0.99,na.rm = TRUE)],
col=cols["sal"],lty=2,lwd=2)
text(x =outdat$Pos[!is.na(outdat$abbr)],y=rep(1.1,nrow(outdat[!is.na(outdat$abbr),])),
outdat$abbr[!is.na(outdat$abbr)],xpd=TRUE,srt=90, pos=3)
```
I realize I should use the outlier thing instead of calling quartile every time for salinity loci because the quartile from LG6 will be different from overall...fixing that in the supplement.
I'm a bit concerned that the annotation wasn't there for the one locus, so let me try the function again but just for LG6 outliers. ...but wait, maybe I was just dumb and confused? because looking at it now I don't think that it is in a gene.
```{r}
# Re-annotate just the LG6 outliers with the gwscaR annotate_snps()
# helper, to double-check the suspect locus against the gff + blast hits.
lg6_annotate<-annotate_snps(outdat,gff,genome.blast,ID="ID",
chrom="Chrom",bp="BP",pos = "Pos")
```
Also, what if this isn't the best way to do it? I could look for the element that has the closest start/end points and see if it falls between them. But I don't actually think this is necessary.
Working on #1, the permutation analysis:
I could also return the SEM and do one-sample t-tests, apply a bonferroni correction, and then look at loci that are significant. Ok, I'm running this now, we'll see how it goes.
# 19 June 2020
Goals:
1. Update text with Adam's comments
2. Look into Bayenv results without Florida pops.
dadi updates:
I was able to transfer some of the dadi results from abacus to C001KR. Also started ALFW_TXFW_13 (run 660) and FLCC_TXCC_14 (run 661, all models except SC2N2mG, which is still running with # 13 in run 619). I noticed that FLLG_TXFW nothing's been updated since Jun 15 and the only log remaining is SC2N2mG.log.txt, which was updated on 10 June. So I'm thinking of stopping that one (job 626) and re-starting just SC2N2mG. - yep, done.
Analyzing Bayenv no Florida results:
XTX and salinity are correlated with a correlation coefficient ~0.5. They share ~30-50 outliers and the analysis without Florida has a lot less noise. There's a peak in LG6 that looks like it might be shared by the stacks Fst outliers -- this is the next thing to check.
```{r}
# Compare Bayenv results with and without the Florida populations:
# find SNPs that are top-1% outliers in BOTH runs (XtX and salinity BF),
# then cross-reference them against the shared Stacks / permutation /
# pcadapt outlier sets and inspect individual hits.
fw_SNPinfo<-readRDS("fw_SNPinfo.RDS")
bayenv_noFL<-read.delim("bayenv_output_noFL.txt",header=TRUE)
bothXtX<-bayenv_noFL$ID[bayenv_noFL$XtX_FL>=quantile(bayenv_noFL$XtX_FL,0.99) &
bayenv_noFL$XtX_noFL>=quantile(bayenv_noFL$XtX_noFL,0.99)]
bothSal<-bayenv_noFL$ID[bayenv_noFL$logSalBF_FL>=quantile(bayenv_noFL$logSalBF_FL,0.99) &
bayenv_noFL$logSalBF_noFL>=quantile(bayenv_noFL$logSalBF_noFL,0.99)]
sharedStacks=fw_SNPinfo$ID[which(fw_SNPinfo$stacks_AL_P < 0.05 &
fw_SNPinfo$stacks_LA_P < 0.05 &
fw_SNPinfo$stacks_TX_P < 0.05)]
sharedPerm=fw_SNPinfo$ID[
rowSums(fw_SNPinfo[,c("perm_TX","perm_AL","perm_LA")])==3]
pcout<-fw_SNPinfo$ID[which(fw_SNPinfo$pcadaptQ<0.01)]
# Overlap of the robust XtX outliers with the other analyses.
bothXtX[bothXtX %in% sharedPerm]
bothXtX[bothXtX %in% sharedStacks]
bothXtX[bothXtX %in% pcout]
# Inspect specific SNPs of interest by ID.
fw_SNPinfo[fw_SNPinfo$ID==11220,]
fw_SNPinfo[fw_SNPinfo$ID==50576,]
fw_SNPinfo[fw_SNPinfo$ID==50892,]
fw_SNPinfo[fw_SNPinfo$ID==46960,]
fw_SNPinfo[fw_SNPinfo$ID==7507,]
fw_SNPinfo[fw_SNPinfo$ID==7925,]
```
So it looks like some of the ones that are shared among some analyses are not in others (e.g., PCAdapt) because they were removed from that analysis for one reason or another.
# 18 June 2020
Goals:
1. figure out Bayenv matrix issues
2. annotate shared stacks/permutation outliers
3. update supplement 3 (include histograms of Stacks outliers and manhattan plots of permutation outliers)
4. email Adam
Starting with # 1:
This command:
`../../scripts/run_bayenv2_matrix_general.sh MATRIX noFL ~/Programs/bayenv/ 5`
is not properly creating the matrices, so let's dig into it a bit. It runs bayenv in 'matrix estimation mode', which requires input of the SNPSFILE and the number of populations. So, is the SNPSFILE generated correctly? It seems to be, it's got 24206 lines (12103 * 2) and 5 columns, and they appear to be tab-separated. BUT! it says there should only be polymorphic sites, and looking at the SNPSFILE there are some loci that had another allele in the FL pops but is not in these pops, so those should be removed. I'm adding that to the `SNPSFILEfromPLINKfrq.R` script, and removing monomorphic sites results in a dataset of 10743 SNPs and a SNPSFILE with 21486 rows. I'm also saving the SNP names to a file when creating the SNPSFILE.
Now it seems to be working!
I've also updated supplement 3 to show more graphs of stacks and permutation outliers, and this should also update the annotations of the outliers (in the minimal sense).
So let's think about the Fst outliers and their annotations.
```{r}
# Rebuild the per-analysis outlier ID lists (same construction as the
# 22 June chunk): top 1% XtX / salinity BF, shared TX+AL+LA permutation
# outliers, pcadapt q < 0.01, and per-state / shared Stacks outliers.
# NOTE(review): xtx and salBF use bare logical indexing, so NA statistics
# yield NA entries in the ID vectors, unlike the which()-wrapped entries.
fw_SNPinfo<-readRDS("fw_SNPinfo.RDS")
outliers<-list(xtx=fw_SNPinfo$ID[fw_SNPinfo$XtX >= quantile(fw_SNPinfo$XtX,0.99,na.rm = TRUE)],
salBF=fw_SNPinfo$ID[fw_SNPinfo$logSalBF>=
quantile(fw_SNPinfo$logSalBF,0.99,na.rm = TRUE)],
permutations=fw_SNPinfo$ID[
rowSums(fw_SNPinfo[,c("perm_TX","perm_AL","perm_LA")])==3],
pcadapt=fw_SNPinfo$ID[which(fw_SNPinfo$pcadaptQ<0.01)],
Alabama=fw_SNPinfo$ID[which(fw_SNPinfo$stacks_AL_P < 0.05)],
Louisiana=fw_SNPinfo$ID[which(fw_SNPinfo$stacks_LA_P < 0.05)],
Texas=fw_SNPinfo$ID[which(fw_SNPinfo$stacks_TX_P < 0.05)],
Florida=fw_SNPinfo$ID[which(fw_SNPinfo$stacks_FL_P < 0.05)],
sharedStacks=fw_SNPinfo$ID[which(fw_SNPinfo$stacks_AL_P < 0.05 &
fw_SNPinfo$stacks_LA_P < 0.05 &
fw_SNPinfo$stacks_TX_P < 0.05)])
```
```{r}
# Classify each SNP's genomic context into broad categories and test
# (Fisher's exact test) whether shared Stacks outliers are enriched in
# particular region types or among putative salinity genes.
annInfo<-fw_SNPinfo[,c("ID","region")]
annInfo$region<-as.character(annInfo$region)
annInfo$region[grep("UTR",annInfo$region)]<-"regulatory"
annInfo$region[grep("gene",annInfo$region)]<-"coding"
annInfo$region[annInfo$region %in% "contig"]<-"non-coding"
annInfo$region[annInfo$region %in% "scaffNotFound"]<-"unknown" # fixed typo ("unkown")
annInfo$outlier<-"not-outlier"
annInfo$outlier[annInfo$ID %in% outliers$sharedStacks]<-"outlier"
# "putative" = SNP has a non-missing entry in the putative-gene column.
annInfo$salgene<-"not-putative"
annInfo$salgene[annInfo$ID %in% fw_SNPinfo$ID[!is.na(fw_SNPinfo$Gene)]]<-"putative"
fisher.test(table(annInfo$region,annInfo$outlier))
fisher.test(table(annInfo$outlier,annInfo$salgene))
```
```{r}
# Table of SNPs that are outliers in BOTH the permutation and the shared
# Stacks analyses, with cleaned-up region labels, rendered for LaTeX.
fst_outs<-outliers$permutations[outliers$permutations %in% outliers$sharedStacks]
fst_out_dat<-fw_SNPinfo[fw_SNPinfo$ID %in% fst_outs,
c("ID","SSCID","Chrom","BP","REF","ALT","region","description")]
fst_out_dat$region<-as.character(fst_out_dat$region)
fst_out_dat$region[grep("UTR",fst_out_dat$region)]<-"regulatory"
fst_out_dat$region[grep("gene",fst_out_dat$region)]<-"coding"
fst_out_dat$region[fst_out_dat$region %in% "contig"]<-"non-coding"
fst_out_dat$region[fst_out_dat$region %in% "scaffNotFound"]<-"unknown" # fixed typo ("unkown")
# Insert spaces after semicolons so multi-gene IDs wrap in the table.
fst_out_dat$SSCID<-gsub(";","; ",fst_out_dat$SSCID)
kable(fst_out_dat,"latex",booktabs=TRUE,
caption="SNPs that were Fst outliers in all pairwise Texas, Alabama, and Louisiana Stacks and permutation analyses. Shown are the SNP ID in this dataset, the ID of the gene it is mapped to in the S. scovelli genome, its location and reference and alternative alleles. The final two columns show the type of genomic region the SNP is in and the gene description (if relevant).") %>%
kable_styling(latex_options="HOLD_position")%>%
column_spec(2,width="10em") %>%
column_spec(8,width="20em")
```
This should be good for a preliminary investigation.
# 17 June 2020
I got comments back from Adam, which were useful. He recommended the following:
1. Re-run bayenv without the Florida populations and compare the results to the one that I've already done.
- I started the bayenv run this AM after subsetting the files.
2. Focus on shared outliers from just the TX, AL, and LA Fst analyses
3. Revisit the permutations, as the Fst values are surprisingly small (and use those with the Stacks Fsts)
4. Make Fig 2 focused on Fsts (stacks + permutations) and increase font sizes and possibly change colors.
5. Make Fig 3 focused on Bayenv and increase font sizes and possibly change colors.
6. Have more discussion of shared outliers (possibly) and definitely the lack of genomic islands.
I think that possibly the positional data in the bayenv analyses has been wrong this whole time...oh no, because I merged the locus IDs and then pulled it from fw_SNPinfo, so it's ok. Phew.
Digging into the permutations:
```{r}
# Genome-wide Manhattan-style plots of the actual Fst values from each
# permutation data frame, stacked 4-up.
# NOTE(review): `lgs` (the linkage groups to plot) is defined elsewhere
# in the notebook -- confirm it is in scope when this chunk runs.
perms<-readRDS("permuted_fsts.RDS")
par(mfrow=c(4,1))
plts<-lapply(perms,fst.plot,scaffs.to.plot = lgs,fst.name = "Fst",
chrom.name = "Chrom",bp.name = "Pos",
pch=19,pt.cex = 1,y.lim=c(0,1))
```
I see what Adam means, the Florida comparison's Fst values are really small, and these are the actuals.
```{r}
# Same 4-up Fst plots for the re-run permutations (29 May 2020).
perms_new<-readRDS("permuted_fsts_29052020.RDS")
par(mfrow=c(4,1))
# Bug fix: this chunk previously called lapply(perms, ...) -- plotting
# the ORIGINAL permutations a second time -- so the re-run set loaded
# into perms_new was never actually visualized.
plts<-lapply(perms_new,fst.plot,scaffs.to.plot = lgs,fst.name = "Fst",
chrom.name = "Chrom",bp.name = "Pos",
pch=19,pt.cex = 1,y.lim=c(0,1))
```
It's true for both of these permutations. It's strange...
*Note: population_whitelist includes ones run with the radiator-filtered whitelist on all 16 populations.*
Now, looking into the Fst values, I'm noticing that we have way too many loci included, and somehow we've got multiple SNPs per locus! I have no idea how that happened. It's almost like the fwsw.tx etc haven't had the correct SNPs retained...and they didn't! But it doesn't matter because it's all ok in the fw_SNPinfo container.
I've now saved the subsetted files in the populations_subset75 directory. The 'raw' Fst values from Stacks includes negative values, which is silly, but I can plot histograms and see what's going on there.
```{r}
# Histograms of the raw (uncorrected) Stacks Fst values for each of the
# four pairwise comparisons; raw values can be negative, hence the
# breaks running from -1.
par(mfrow=c(2,2))
hist(fwsw.tx$Fst,xlim=c(-0.2,1),breaks=seq(-1,1,0.1))
hist(fwsw.al$Fst,xlim=c(-0.2,1),breaks=seq(-1,1,0.1))
hist(fwsw.la$Fst,xlim=c(-0.2,1),breaks=seq(-1,1,0.1))
hist(fwsw.fl$Fst,xlim=c(-0.2,1),breaks=seq(-1,1,0.1))
```
```{r}
# Same histograms for the corrected AMOVA Fst values (bounded at 0, so
# the display window starts at 0).
par(mfrow=c(2,2))
hist(fwsw.tx$Corrected.AMOVA.Fst,xlim=c(0,1),breaks=seq(-1,1,0.1))
hist(fwsw.al$Corrected.AMOVA.Fst,xlim=c(0,1),breaks=seq(-1,1,0.1))
hist(fwsw.la$Corrected.AMOVA.Fst,xlim=c(0,1),breaks=seq(-1,1,0.1))
hist(fwsw.fl$Corrected.AMOVA.Fst,xlim=c(0,1),breaks=seq(-1,1,0.1))
```
The reason I'm doing this is to try to see how different the permutations are. They do show substantially zero-skewed distributions, so that's not nothing. But the genome-wide plots of permutations above are much more damning, and more troubling really. I'm not sure this is the correct approach...let's dive into the details of the permutations a bit more.
None of the permutations have lost loci and seem to have approximately the same numbers as the other analyses -- but in different orders!
```{r}
# Sanity check: total loci per permutation data frame, and how many are
# actually polymorphic (more than one allele).
lapply(perms,nrow)
lapply(perms,function(x) nrow(x[x$NumAlleles>1,]))
```
These seem to be in the order:
[[1]] Texas
[[2]] Florida
[[3]] Alabama
[[4]] Louisiana
Oh, this is the order their histograms are plotted too.
```{r}
# Column summaries for each permutation data frame (order: TX, FL, AL, LA).
lapply(perms,summary)
```
Ok, I think I've convinced myself that the permutations are done correctly. But the fact remains that the outliers don't match other things.
What if I ignore FL? -- this increases the number of permutation outliers. I could make an upset plot of the permutation outliers and compare them to the shared Fst outliers
```{r}
# UpSet plot comparing the per-state permutation outliers (and their
# 4-way intersection) against the shared Stacks outliers.
outliers<-list(permutations=fw_SNPinfo$ID[
rowSums(fw_SNPinfo[,c("perm_TX","perm_FL","perm_AL","perm_LA")])==4],
Alabama=fw_SNPinfo$ID[which(fw_SNPinfo$perm_AL==1)],
Louisiana=fw_SNPinfo$ID[which(fw_SNPinfo$perm_LA ==1)],
Texas=fw_SNPinfo$ID[which(fw_SNPinfo$perm_TX==1)],
Florida=fw_SNPinfo$ID[which(fw_SNPinfo$perm_FL==1)],
sharedStacks=fw_SNPinfo$ID[which(fw_SNPinfo$stacks_AL_P < 0.05 &
fw_SNPinfo$stacks_LA_P < 0.05 &
fw_SNPinfo$stacks_TX_P < 0.05)])
# Fixed color per analysis/set, reused across figures.
cols<-c(permutations='#e41a1c',salBF='#377eb8',pcadapt='#a65628',
xtx='#ff7f00',sharedStacks='#f781bf',
Alabama='#af8dc3',Louisiana='#e7d4e8',Texas='#762a83',Florida='#1b7837')
# NOTE(review): margin1scale is not a documented UpSetR::upset()
# argument -- confirm it is supported by the version/function in use.
upset(fromList(outliers),sets=c("Texas","Alabama","Louisiana","Florida","sharedStacks"),
point.size=5,line.size=2,mainbar.y.label = "Number of Shared Outliers",
sets.x.label = "# Outliers",text.scale=rep(2,6),
sets.bar.color =cols[c("Texas","Alabama","Louisiana","Florida","sharedStacks")],
margin1scale = 0.2,
sets.pt.color=cols[c("Texas","Alabama","Louisiana","Florida","sharedStacks")],
keep.order = TRUE)
```
FOR WHATEVER REASON Bayenv is not liking my environ file that I'm giving it. It's standardized, tab-separated, and the populations are the columns and variables the rows. I've made sure it's got proper line endings, I've added an extra tab at the end of each row, and cannot for the life of me figure out what's wrong.
It's possible it's not the environ file that's causing the problem -- maybe there's an issue with the SNPFILEs? no, but the matrix file is empty!
I will need to figure out what's going on there.
Tomorrow:
- figure out Bayenv matrix issues
- annotate shared stacks/permutation outliers
- update supplement 3 (include histograms of Stacks outliers and manhattan plots of permutation outliers)
- email Adam
# 16 June 2020
ALST_LAFW # 14 is done on abacus and so is FLLG_ALFW_SC2N2mG_6 on C001KR (now only # 7 needs to finish running).
# 15 June 2020
Sweet, some runs on abacus are done:
- ALST_TXFW # 13
- ALST_LAFW # 13
- ALST_TXCC # 14
- ALST_ALFW # 14 finished during the day so I moved it & started another one.
I finally quit job 439 ALST_TXFW_12 & made sure all the data had been moved to C001KR.
Need to run:
- ALFW vs ALST (# 15) - wait for 628 to finish (just one more model)
- FLCC vs TXCC (# 14) - wait for 619 to finish (just one more model)
- FLCC vs TXFW (anything > 10) - running # 14 models 238-259 on abacus (job 645)
- FLCC vs LAFW (#14) models 212-233 (job 646)
For now that's what I'll go with, and hopefully some more of these models will finish running on abacus in the next couple days.
On rccuser FLCC_ALFW_SC2NG_1 is running, and ALFW_TXFW_2 IM2N and IM2nG, plus several FLCC_ALST_2 and FLCC_ALST_SC2N2mG_20, plus FLCC_ALFW_2_AM2NG and SC2mG, and FLCC_LAFW_2 (various models).
tail FLCC-ALFW_SC2NG_1_2.log -- Round 4, 24 of 40
tail FLCC_ALST_20.SC2N2mG.log.txt -- Round 3, 24 of 30 (number 2 is on Round 3, 6 of 30) -- hopefully these aren't interfering with each other, but they might be.
Other stuff:
- I need to run a maximum likelihood unrooted tree with the consensus in treemix to generate covariances. [done]
- I need to re-make the summary figure with the treemix FLLG label turned into FLFW. And change the colors of the tip labels. [done]
- re-run the dadi best models plotting script.
- The FLLG_TXFW and FLLG_TXCC jSFS plots look strange -- need to investigate that.
# 12 June 2020
I think I've decided that a summary table is better than trying to come up with a figure for the dadi results, at least at the moment. So I've added code to save that table to the Rmarkdown doc. I've also plotted 2D data vs model figures made by dadi for the best models and their optimized parameters, but some of them look really strange. Others look really nice. So that means, I think, that I haven't found very good runs of some of these models. Comparisons with bad plots:
FLLG_ALFW
FLLG_ALST
FLLG_LAFW
FLLG_TXFW
FLLG_TXCC
FLLG_ALFW
FLCC_ALST
FLCC_LAFW
ALFW_LAFW
ALFW_TXCC
But the regular 2D plots of them look fine, so it's probably something to do with my plotting function, and of course I didn't save a log. So I'll try running it again and this time save the output to a log file.
Looking at the log, it looks like I've passed some strange parameters including NAs, and I'm guessing that's the source of my issues. This arose from the simplification I made of using sumTabs to fill the outTable, but the columns were in different orders. So I think I've fixed that and I'm working on making the new figures.
To have more confidence in my current results, it would be really nice if I could easily know which runs were complete and move those from abacus and rccuser.
This would be done using some fun unix code
```{bash}
find . -type f -iname "$remote_file" -mtime +3
```
# 11 June 2020
My goals for today are:
1. create a generic script that I can launch easily/automatically to create joint SFS images from dadi best models.
2. decide on best way to present the dadi results in the main document.
- I find my current summary somewhat difficult to interpret
- I feel like I should be showing the model and data joint SFS figures.
3. outline/draft discussion.
4. check text about outliers results -- is it accurate/up-to-date?
I adjusted the python script to accept arguments, and I added R code to create the script at the end of the dadi Rmd. I didn't see an easy way of *just* plotting the model summary bit with the dadi code, but I might be able to use the Plotting_Functions.Plot_2D(fs, model_fit, prefix, "sym_mig") from the dadi_pipeline. The only possible issue is that it creates two figures, and I'm not sure how to capture this via command line.
Eventually I'll also need to improve the captions in the dadi document.
# 10 June 2020
Yesterday I finished up the treemix analyses, including the threepop and fourpop ones and re-making the main text figure. So the goals for today are to update the main text and to finish up/make progress on the dadi results.
I'm going to try to fix table output for dadi so that it's like the summary table but for each population. Done! wrote a function to do this, which I think should work well. Hmm for some reason my function to extract dadi information does not return all of the models to put into the table...this has to do with the switch to the median. This is because when there are even numbers the median is in between two so neither is chosen. How should I deal with this? I fixed it by choosing the model with the minimum difference between its log likelihood and the median. This is simply to choose which model is used for model comparisons.
# 9 June 2020
Today's goals:
1. Fix optM stuff and update text around choosing number of migration edges in supplement.
2. Include the maximum likelihood tree with 1 migration edge in the main text figure instead of the old 2 edge one.
3. Update fourpop and threepop analyses.
4. If I have time, add text about dadi to results.
5. check up on dadi results & move any that have finished.
So the Evanno method plot was throwing a fit because of an odd if statement and lack of dev.off() issue. I can modify the function...
```{r}
my_plot_optm<-function (input, method = "Evanno", plot = TRUE, pdf = NULL)
{
if (missing(method)) {
method = "Evanno"
}
else method = method
if (!is.character(method) | length(method) > 1)
stop("The 'method' argument was not set correctly\n")
methods = c("SiZer", "linear", "Evanno")
if (!(method %in% methods))
stop("Could not find the selected 'method'. Please check.\n")
if (is.null(pdf)) {
message("No output file will be saved. To save an output file, run with 'pdf = \"file.pdf\"'\n")
}
else {
if (length(pdf) != 1 | !is.character(pdf))
stop("Output pdf file is incorrectly specified.\n")
pdf = pdf
}
if (!is.logical(plot))
stop("Please set 'plot' as either TRUE or FALSE.\n")
if (!plot & is.null(pdf))
stop(" You want neither a plot opened or an input file saved. No reason to continue. Exiting.\n")
if (method == "Evanno") {
message(paste0("Plotting the treemix results using the ",
method, " method.\n"))
if (is.null(input) | !is.data.frame(input))
stop("Proper input data frame was not detected.\n")
c = ncol(input)
if (c != 17)
warning(paste0("Warning: This function expects 17 columns in the table but detected ",
c, " columns. Proceeding anyways even if this is incorrect!\n"))
m = max(input$m, na.rm = T)
low = min(input$m, na.rm = T)
runs = mean(input$runs[2:length(input$runs)])
if (ceiling(runs) != runs) {
warning(paste0("The mean number of runs detected is ",
runs, ", but this must be a whole number. Rounding up and continuing anyway.\n"))
runs = round(runs)
}
if (!plot) {
pdf(pdf, width = 7, height = 7)
graphics::par(mfrow = c(2, 1), mar = c(4.1, 4.1,
1.1, 5.1), mgp = c(3, 1, 0))
plot(input$m, input$"mean(Lm)", pch = 1, axes = F,
ann = F)
graphics::axis(2, las = 1)
graphics::axis(1)
graphics::box()
graphics::segments(input$m, input$"mean(Lm)" -
input$"sd(Lm)", input$m, input$"mean(Lm)" +
input$"sd(Lm)")
epsilon = 0.1
graphics::segments(input$m - epsilon, input$"mean(Lm)" -
input$"sd(Lm)", input$m + epsilon, input$"mean(Lm)" -
input$"sd(Lm)")
graphics::segments(input$m - epsilon, input$"mean(Lm)" +
input$"sd(Lm)", input$m + epsilon, input$"mean(Lm)" +
input$"sd(Lm)")
graphics::title(ylab = "Mean L(m) +/- SD")
f.means = input$"mean(f)"
f.sd = input$"sd(f)"
graphics::par(new = T)
plot(input$m, f.means, pch = 19, col = grDevices::rgb(255/255,
0, 0, 89.25/255), axes = F, ann = F,ylim=c(0,1))
graphics::axis(4, las = 1)
graphics::mtext("Variance Explained", side = 4,
line = 3.5)
y.limits = graphics::par("usr")[3:4]
print(y.limits)
if ((y.limits[1]) < 0.998 && (y.limits[2] > 0.998)) {
graphics::abline(h = 0.998, col = "black",
lty = "dotted")
}
else {
warning("Horizontal line at 99.8% variation cutoff is out of bounds. This is not a big deal and the program is continuing anyway without plotting the line.\n",
immediate. = T)
}
graphics::segments(input$m, f.means - f.sd, input$m,
f.means + f.sd, col = "red")
epsilon = 0.1
graphics::segments(input$m - epsilon, f.means - f.sd,
input$m + epsilon, f.means - f.sd, col = "red")
graphics::segments(input$m - epsilon, f.means + f.sd,
input$m + epsilon, f.means + f.sd, col = "red")
graphics::legend("bottomright", legend = c("likelihoods +/- SD",
"% variance", "99.8% threshold"),
col = c("black", grDevices::rgb(255/255,
0, 0, 89.25/255), "black"), bty = "n",
pch = c(1, 19, NA), lty = c(NA, NA, "dotted"))
plot(input$m, input$Deltam, col = "blue", pch = 19,
xlab = "m", ylab = expression(italic(paste(symbol(Delta),
"m"))))
graphics::points(input$m, input$Deltam, col = "blue",
type = "l")
grDevices::dev.off()
}
else {
grDevices::dev.new(width = 7, height = 7)
graphics::par(mfrow = c(2, 1), mar = c(4.1, 4.1,
1.1, 5.1), mgp = c(3, 1, 0))
plot(input$m, input$"mean(Lm)", pch = 1, axes = F,
ann = F)
graphics::axis(2, las = 1)
graphics::axis(1)
graphics::box()
graphics::segments(input$m, input$"mean(Lm)" -
input$"sd(Lm)", input$m, input$"mean(Lm)" +
input$"sd(Lm)")
epsilon = 0.1
graphics::segments(input$m - epsilon, input$"mean(Lm)" -
input$"sd(Lm)", input$m + epsilon, input$"mean(Lm)" -
input$"sd(Lm)")
graphics::segments(input$m - epsilon, input$"mean(Lm)" +
input$"sd(Lm)", input$m + epsilon, input$"mean(Lm)" +
input$"sd(Lm)")
graphics::title(ylab = "Mean L(m) +/- SD")
f.means = input$"mean(f)"
f.sd = input$"sd(f)"
graphics::par(new = T)
plot(input$m, f.means, pch = 19, col = grDevices::rgb(255/255,
0, 0, 89.25/255), axes = F, ann = F,ylim=c(0,1))
graphics::axis(4, las = 1)
graphics::mtext("Variance Explained", side = 4,
line = 3.5)
y.limits = graphics::par("usr")[3:4]
print(y.limits)
if ((y.limits[1]) < 0.998 && (y.limits[2] > 0.998)) {
graphics::abline(h = 0.998, col = "black",
lty = "dotted")
}
else {
warning("Horizontal line at 99.8% variation cutoff is out of bounds. This is not a big deal and the program is continuing anyway without plotting the line.\n",
immediate. = T)
}
graphics::segments(input$m, f.means - f.sd, input$m,
f.means + f.sd, col = "red")
epsilon = 0.1
graphics::segments(input$m - epsilon, f.means - f.sd,
input$m + epsilon, f.means - f.sd, col = "red")
graphics::segments(input$m - epsilon, f.means + f.sd,
input$m + epsilon, f.means + f.sd, col = "red")
graphics::legend("bottomright", legend = c("likelihoods +/- SD",
"% variance", "99.8% threshold"),
col = c("black", grDevices::rgb(255/255,
0, 0, 89.25/255), "black"), bty = "n",
pch = c(1, 19, NA), lty = c(NA, NA, "dotted"))
plot(input$m, input$Deltam, col = "blue", pch = 19,
xlab = "m (migration edges)", ylab = expression(italic(paste(symbol(Delta),
"m"))))
graphics::points(input$m, input$Deltam, col = "blue",
type = "l")
if (!is.null(pdf)) {
grDevices::dev.copy(grDevices::pdf, file = pdf)
grDevices::dev.off()
message(paste0("Plot saved to file ", pdf,
".\n"))
}
else if (is.null(pdf))
message("No plot file has been saved.\n")
}
}
else if (method == "linear") {
message("Plotting the treemix results using various linear models.\n")
if (is.null(input) | !is.list(input) | length(input) !=
5)
stop("Proper input list was not detected.\n")
if (plot) {
x = input$PiecewiseLinear$model$model[, 2]
y = input$PiecewiseLinear$model$model[, 1]
pl = input$PiecewiseLinear
bc = input$BentCable
sim.exp = input$SimpleExponential
nl_ls = input$NonLinearLeastSquares
plot(x, y, ylab = "Log Likelihood", xlab = "m (migration edges)",
axes = F)
graphics::box()
graphics::axis(1)
graphics::axis(2, las = 1)
x.grid <- seq(min(x), max(x), length = 200)
graphics::lines(x.grid, stats::predict(pl, x.grid),
col = "darkgreen", lwd = 2)
graphics::lines(x.grid, stats::predict(bc, x.grid),
col = "orange", lwd = 2)
z = max(y) + 1
y2 = log(-y + z)
graphics::lines(x.grid, z - exp(stats::predict(sim.exp,
newdata = data.frame(x = x.grid))), col = "red",
lwd = 2)
graphics::lines(x.grid, z - exp(stats::predict(nl_ls,
newdata = data.frame(x = x.grid))), col = "blue",
lwd = 2)
graphics::legend("bottomright", legend = c("Observed data",
"Piecewise Linear", "Bent Cable",
"Simple Exponential", "Non-linear Least Squares",
"change points"), col = c("black",
"darkgreen", "orange", "red",
"blue", "black"), bty = "y",
pch = c(1, NA, NA, NA, NA, 8), lty = c(NA, 1,
1, 1, 1, NA))
cp.pl = input$out[which(rownames(input$out) == "PiecewiseLinear"),
4]
lnPD.pl = stats::predict(pl, cp.pl)
cp.bc = input$out[which(rownames(input$out) == "BentCable"),
4]
lnPD.bc = stats::predict(bc, cp.bc)
cp.simexp = input$out[which(rownames(input$out) ==
"SimpleExponential"), 4]
lnPD.simexp = z - exp(stats::predict(sim.exp, newdata = data.frame(x = cp.simexp)))
cp.nlls = input$out[which(rownames(input$out) ==
"NonLinearLeastSquares"), 4]
lnPD.nlls = z - exp(stats::predict(nl_ls, newdata = data.frame(x = cp.nlls)))
graphics::points(x = c(cp.pl, cp.bc, cp.simexp, cp.nlls),
y = c(lnPD.pl, lnPD.bc, lnPD.simexp, lnPD.nlls),
pch = 8, col = c("darkgreen", "orange",
"red", "blue"), cex = 1.5)
if (!is.null(pdf)) {
grDevices::dev.copy(grDevices::pdf, file = pdf)
grDevices::dev.off()
message(paste0("Plot saved to file ", pdf,
".\n"))
}
else if (is.null(pdf))
message("No plot file has been saved.\n")
}
else if (!plot) {
x = input$PiecewiseLinear$model$model[, 2]
y = input$PiecewiseLinear$model$model[, 1]
pl = input$PiecewiseLinear
bc = input$BentCable
sim.exp = input$SimpleExponential
nl_ls = input$NonLinearLeastSquares
pdf(pdf, width = 7, height = 7)
plot(x, y, ylab = "Log Likelihood", xlab = "m (migration edges)",
axes = F)
graphics::box()
graphics::axis(1)
graphics::axis(2, las = 1)
x.grid <- seq(min(x), max(x), length = 200)
graphics::lines(x.grid, stats::predict(pl, x.grid),
col = "darkgreen", lwd = 2)
graphics::lines(x.grid, stats::predict(bc, x.grid),
col = "orange", lwd = 2)
z = max(y) + 1
y2 = log(-y + z)
graphics::lines(x.grid, z - exp(stats::predict(sim.exp,
newdata = data.frame(x = x.grid))), col = "red",
lwd = 2)
graphics::lines(x.grid, z - exp(stats::predict(nl_ls,
newdata = data.frame(x = x.grid))), col = "blue",
lwd = 2)
graphics::legend("bottomright", legend = c("Observed data",
"Piecewise Linear", "Bent Cable",
"Simple Exponential", "Non-linear Least Squares",
"change points"), col = c("black",
"darkgreen", "orange", "red",
"blue", "black"), bty = "y",
pch = c(1, NA, NA, NA, NA, "X"), lty = c(NA,
1, 1, 1, 1, NA))
cp.pl = input$out[which(rownames(input$out) == "PiecewiseLinear"),
4]
lnPD.pl = stats::predict(pl, cp.pl)