In [162]:
# Load the cluster's R 3.5.1 environment module.
# NOTE(review): IPython runs each `!` command in its own subshell, so this
# presumably does not change the environment of the running kernel (or of the
# R session that rpy2 starts) -- confirm that R 3.5.1 is loaded before Jupyter
# is launched.
!module load R/3.5.1

In [163]:
## One-time installations for the Python 2.7 conda environment used with delimitR
## (kept commented out; run them in a terminal when setting up the environment):
#conda install notebook -c conda-forge
#conda install mpi4py -c conda-forge
#conda install -c r rpy2
## ------------------------------------------------------------
## The rpy2 IPython extension must be loaded before any %%R cell can be used
## to run R from this Python notebook.
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [164]:
%%R
## Packages attached for the delimitR workflow. Keep this order: a package
## attached later masks same-named functions of packages attached earlier.
library(abcrf)
library(devtools)
library(delimitR)
library(randomForest)
library(neuralnet)

## Parallel tools: foreach supplies foreach()/%dopar% and doParallel supplies
## registerDoParallel(), both used for the parallel simulation step below.
## These two packages still need to be installed in the environment.
library(foreach)
library(doParallel)

In [165]:
%%R
## Work from the dataset6 analysis directory; the file names used in the cells
## below are given relative to it.
## NOTE: this is a hard-coded, machine-specific absolute path.
setwd("/mnt/lfs2/ruff6699/Tsugaheterophylla_AllDataAnalysis/delimitR/dataset6")

In [134]:
%%R
getwd()

[1] "/mnt/lfs2/ruff6699/Tsugaheterophylla_AllDataAnalysis/delimitR/dataset6"


In [166]:
!ls

Binned_Processed_Tsuga_10_MSFS.obs    Tsuga_11.tpl
Binned_Processed_Tsuga_11_MSFS.obs    Tsuga_1.est
Binned_Processed_Tsuga_1_MSFS.obs     Tsuga_1_MSFS.obs
Binned_Processed_Tsuga_2_MSFS.obs     Tsuga_1.params
Binned_Processed_Tsuga_3_MSFS.obs     Tsuga_1.tpl
Binned_Processed_Tsuga_4_MSFS.obs     Tsuga_2
Binned_Processed_Tsuga_5_MSFS.obs     Tsuga_2.est
Binned_Processed_Tsuga_6_MSFS.obs     Tsuga_2_MSFS.obs
Binned_Processed_Tsuga_7_MSFS.obs     Tsuga_2.params
Binned_Processed_Tsuga_8_MSFS.obs     Tsuga_2.tpl
Binned_Processed_Tsuga_9_MSFS.obs     Tsuga_3
Dataset6_RFconfusionMatrix.csv	      Tsuga_3.est
Dataset6_RFconfusionMatrix_GROUP.csv  Tsuga_3_MSFS.obs
jSFS_dataset6_binned.obs	      Tsuga_3.params
jSFS_dataset6_forplotting.obs	      Tsuga_3.tpl
jSFS_dataset6_noMono4plot.obs	      Tsuga_4
jSFS_dataset6_noMono_binned.obs       Tsuga_4.est
jSFS_dataset6_noMono.obs	      Tsuga_4_MSFS.obs
jSFS_dataset6_noMono_processed.obs    Tsuga_4.params
jSFS_dataset6.obs		      Tsug

In [139]:
%%R
## Input files and model settings for the delimitR analysis

## Observed data: the observed SFS file and the traits file that designates
## which sample belongs to which population.
observedSFS <- "jSFS_dataset6_noMono"
traitsfile <- "traits_d6.txt"

## Prefix for all files of this analysis
obsprefix <- "Tsuga"

## Two populations ("species") joined in a two-tip tree
obsspecies <- 2
observedtree <- "(0,1);"

## Number of alleles from Inland and coast
obssamplesize <- c(16, 27)

## Roughly the number of SNPs; to be changed to a range of SNP counts in
## future / manually
obssnps <- 1195

## Migration is allowed between the two populations (off-diagonal entries)
migmatrix <- rbind(c(FALSE, TRUE),
                   c(TRUE, FALSE))

## Maximum number of migration events (can only be 1 with two populations)
maxedges <- 1

## Allow both of these scenarios: divergence with gene flow and secondary
## contact
divwgeneflow <- TRUE
seccontact <- TRUE

In [137]:
%%R
## This is also a good moment to clean the working directory of erroneous files.
## DO NOT run this until all .params files have been saved, so that you keep a
## record of the parameters you simulated under.
clean_working(prefix=obsprefix)

In [None]:
%%R
## This cell rarely needs to be re-run, because most of the models were set up
## manually. There are 11 models in total.

## This function generates the .tpl and .est files for some of the models.
## NOTE(review): obspopsizeprior, obsdivtimeprior and obsmigrateprior are not
## defined in any of the cells visible here; they must be assigned before this
## call can run.
setup_fsc2(tree=observedtree,
           nspec=obsspecies,
           samplesizes=obssamplesize,
           nsnps=obssnps,
           prefix=obsprefix,
           migmatrix=migmatrix,
           popsizeprior=obspopsizeprior,
           divtimeprior=obsdivtimeprior,
           migrateprior=obsmigrateprior,
           secondarycontact= seccontact,
           divwgeneflow= divwgeneflow,
           maxmigrations = maxedges)

## For the record: this creates 4 .tpl and 4 .est files.
## (These notes originally said "Thuja_"; the prefix used here is 'Tsuga'.)
## Tsuga_1: 2 population model of immediate coalescence by populations. Panmixia
## Tsuga_2: 2 population model with ancient coalescence event. Ancient Vicariance

## These models needed to be adjusted to have asymmetric gene flow, so an
##        additional migration parameter with a different name was added.
## Tsuga_3: 2 population model with secondary contact: Secondary contact
## Tsuga_4: 2 population model of divergence with gene flow: Divergence with gene flow


In [140]:
%%R
## Parallel version of the function fastsimcoalsims: runs fastsimcoal2 once per
## model, distributing the models over the workers of the registered foreach
## back end (see registerDoParallel()).
##
## Arguments:
##   prefix     Prefix of the model files. Model j is read from
##              <prefix>_<j>.tpl and <prefix>_<j>.est in the working directory.
##   pathtofsc  Path to the fastsimcoal2 executable.
##   nreps      Value appended to fastsimcoal2's -E option.
##
## Returns a list with one element per model holding the value returned by
## system() for that model's run (its exit status). When no <prefix>*.tpl file
## exists, a warning is raised and an empty list is returned.
fastsimcoalsims_Par <- function(prefix, pathtofsc, nreps) {
  ## One simulation per template file. Sys.glob() replaces shelling out to
  ## `ls`, so "no match" is simply character(0).
  tpl_files <- Sys.glob(paste0(prefix, "*.tpl"))
  if (length(tpl_files) == 0L) {
    ## Without this guard, 1:length(...) would iterate over c(1, 0) and launch
    ## fastsimcoal2 on files that do not exist.
    warning("No template files matching '", prefix, "*.tpl' in ", getwd(),
            "; nothing to simulate.", call. = FALSE)
    return(list())
  }

  ## Models are addressed by index rather than by the globbed names, so the
  ## files are assumed to be numbered consecutively from 1.
  foreach(j = seq_along(tpl_files)) %dopar% {
    fsc_call <- paste0(pathtofsc, " -t ", prefix, "_", j, ".tpl",
                       " -e ", prefix, "_", j, ".est",
                       " -n 1 --msfs -q --multiSFS -x -E", nreps)
    print(fsc_call)
    system(fsc_call, ignore.stdout = TRUE)
  }
}
## doParallel must be told how many cores the %dopar% loops may use.
ncores <- 11
registerDoParallel(cores = ncores)



In [141]:
%%R
## Simulate the data for every model using fastsimcoal2.
## The list printed below holds one system() return value per model; 0 means
## the fastsimcoal2 command exited successfully.
fastsimcoalsims_Par(prefix=obsprefix,
                pathtofsc='../fsc26',
                nreps=10000)


[[1]]
[1] 0

[[2]]
[1] 0

[[3]]
[1] 0

[[4]]
[1] 0

[[5]]
[1] 0

[[6]]
[1] 0

[[7]]
[1] 0

[[8]]
[1] 0

[[9]]
[1] 0

[[10]]
[1] 0

[[11]]
[1] 0



In [142]:
%%R
## Define the number of classes used to bin the SFS, then assemble the prior
## from the simulated data.
nclasses <- 5


## NOTE(review): getwd() is passed positionally among named arguments; it
## presumably fills makeprior()'s working-directory argument -- confirm against
## the delimitR documentation.
## NOTE(review): ncores = 40 is set independently of the 11 cores registered
## with registerDoParallel() above -- confirm the machine provides 40 cores.
FullPrior <- makeprior(prefix=obsprefix,
                       nspec=obsspecies,
                       nclasses=nclasses,
                       getwd(),
                       traitsfile = traitsfile,
                       threshold=100, 
                       thefolder = 'Prior',
                       ncores = 40)

In [143]:
%%R
## Have a look at the prior
FullPrior

         V1  V2  V3  V4 V5  V6  V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18
1       760  93  10   0  0  63  98 61  1   0  10  50  39   0   0   0   6   4
2       760  71  17   0  0  68 100 67  0   0  11  39  57   0   0   0   4   1
3       769  87  13   1  0  64 114 42  1   0   7  35  52   0   0   0   4   5
4       757  76  14   0  0  76 112 40  1   0   6  46  54   0   0   0   9   4
5       777  78   7   0  0  68 108 58  0   0   4  45  42   0   0   0   5   3
6       776  84  11   0  0  59 112 50  0   0   4  37  52   0   0   0   8   2
7       746  93  11   1  0  56 127 62  0   0   5  46  37   0   0   3   7   1
8       776  73  11   0  0  60 126 52  1   0   5  36  45   0   0   0   7   3
9       789  67   7   0  0  59 101 56  0   0   6  47  53   0   0   0   7   3
10      789  90  11   0  0  59 107 36  1   0   8  36  44   0   0   1  13   0
11      773  77  18   0  0  75 112 43  0   0   4  42  44   0   0   0   4   3
12      766  97   8   0  0  78  88 55  2   0   4  41  45   0   0   0   9   2

In [103]:
%%R
save(FullPrior, file="FullPrior_dataset6.Rdata")
#load(file="FullPrior_dataset4.Rdata")

In [144]:
%%R
## Remove the bins that have zero variance, because these bins add nothing to
## the analysis. Bins are the V* columns (not rows): e.g. V5 and V10 are
## present in FullPrior above but absent from the output below.
ReducedPrior <- Prior_reduced(FullPrior)
ReducedPrior 

         V1  V2  V3  V4  V6  V7 V8 V9 V11 V12 V13 V16 V17 V18  V21 V22
1       760  93  10   0  63  98 61  1  10  50  39   0   6   4    0   0
2       760  71  17   0  68 100 67  0  11  39  57   0   4   1    0   0
3       769  87  13   1  64 114 42  1   7  35  52   0   4   5    0   1
4       757  76  14   0  76 112 40  1   6  46  54   0   9   4    0   0
5       777  78   7   0  68 108 58  0   4  45  42   0   5   3    0   0
6       776  84  11   0  59 112 50  0   4  37  52   0   8   2    0   0
7       746  93  11   1  56 127 62  0   5  46  37   3   7   1    0   0
8       776  73  11   0  60 126 52  1   5  36  45   0   7   3    0   0
9       789  67   7   0  59 101 56  0   6  47  53   0   7   3    0   0
10      789  90  11   0  59 107 36  1   8  36  44   1  13   0    0   0
11      773  77  18   0  75 112 43  0   4  42  44   0   4   3    0   0
12      766  97   8   0  78  88 55  2   4  41  45   0   9   2    0   0
13      760  80   9   0  58 110 57  1   7  44  61   0   6   1    0   1
14    

In [145]:
%%R
## Construct a Random Forest classifier using reduced prior and look at error rates
myRF <- RF_build_abcrf(ReducedPrior,FullPrior,1000)
myRF

Growing trees.. Progress: 52%. Estimated remaining time: 28 seconds.

Call:
 abcrf(formula = Models ~ ., data = Trainingdata, ntree = ntrees, paral = TRUE) 
includes the axes of a preliminary LDA

Number of simulations: 110000
Out-of-bag prior error rate: 34.7791%

Confusion matrix:
              Tsuga_1_MSFS Tsuga_2_MSFS Tsuga_3_MSFS Tsuga_4_MSFS Tsuga_5_MSFS
Tsuga_1_MSFS          7458            0           18            0            0
Tsuga_2_MSFS             0         6490           62         2862            0
Tsuga_3_MSFS            29            3         7850          187            0
Tsuga_4_MSFS             0         1823          277         7381            0
Tsuga_5_MSFS             0            0            0            0         9011
Tsuga_6_MSFS             0          385            0          192          924
Tsuga_7_MSFS             0           28          313          385            0
Tsuga_8_MSFS          2616            0          777            0            0
Tsuga

In [74]:
%%R
#myRF$model.rf$confusion.matrix
#save(myRF, file="myRFobject_dataset.Rdat")
#load(file="myRFobject_dataset4.Rdat")

NULL


In [147]:
%%R
myRF


Call:
 abcrf(formula = Models ~ ., data = Trainingdata, ntree = ntrees, paral = TRUE) 
includes the axes of a preliminary LDA

Number of simulations: 110000
Out-of-bag prior error rate: 34.7791%

Confusion matrix:
              Tsuga_1_MSFS Tsuga_2_MSFS Tsuga_3_MSFS Tsuga_4_MSFS Tsuga_5_MSFS
Tsuga_1_MSFS          7458            0           18            0            0
Tsuga_2_MSFS             0         6490           62         2862            0
Tsuga_3_MSFS            29            3         7850          187            0
Tsuga_4_MSFS             0         1823          277         7381            0
Tsuga_5_MSFS             0            0            0            0         9011
Tsuga_6_MSFS             0          385            0          192          924
Tsuga_7_MSFS             0           28          313          385            0
Tsuga_8_MSFS          2616            0          777            0            0
Tsuga_9_MSFS          3087            0          596            0         

In [148]:
%%R
write.table(myRF$model.rf$confusion.matrix, file="Dataset6_RFconfusionMatrix.csv", sep="\t")

In [149]:
%%R
## Build the training table (model label plus the reduced prior) and fit a
## grouped random forest: models 1-7 each form their own group, while models
## 8-11 are pooled into a single group.
Models <- as.factor(FullPrior[, "Model"])
Trainingdata <- data.frame(Models, ReducedPrior)
myRF.group <- abcrf(
  Models ~ .,
  data = Trainingdata,
  ntree = 1000,
  paral = TRUE,
  group = list(
    "Tsuga_1_MSFS",
    "Tsuga_2_MSFS",
    "Tsuga_3_MSFS",
    "Tsuga_4_MSFS",
    "Tsuga_5_MSFS",
    "Tsuga_6_MSFS",
    "Tsuga_7_MSFS",
    c("Tsuga_8_MSFS", "Tsuga_9_MSFS", "Tsuga_10_MSFS", "Tsuga_11_MSFS")
  )
)

In [150]:
%%R
myRF.group 


Call:
 abcrf(formula = Models ~ ., data = Trainingdata, group = list("Tsuga_1_MSFS", "Tsuga_2_MSFS", "Tsuga_3_MSFS", "Tsuga_4_MSFS", "Tsuga_5_MSFS", "Tsuga_6_MSFS", "Tsuga_7_MSFS", c("Tsuga_8_MSFS", "Tsuga_9_MSFS", "Tsuga_10_MSFS", "Tsuga_11_MSFS")), ntree = 1000, paral = TRUE) 
includes the axes of a preliminary LDA

Number of simulations: 110000
Out-of-bag prior error rate: 19.8755%

Confusion matrix:
     g1   g2   g3   g4   g5   g6   g7    g8 class.error
g1 5269    0    2    0    0    0    0  4729    0.473100
g2    0 6533   60 2821    0  297  289     0    0.346700
g3    3    2 6954  195    0    3  345  2498    0.304600
g4    0 1824  270 7393    0   59  444    10    0.260700
g5    0    0    0    0 9022  978    0     0    0.097800
g6    0  385    0  186  914 8456   59     0    0.154400
g7    0   35  307  374    0   39 9235    10    0.076500
g8 2992    1 1684   15    0    0   33 35275    0.118125


In [152]:
%%R
write.table(myRF.group$model.rf$confusion.matrix, file="Dataset6_RFconfusionMatrix_GROUP.csv", sep="\t")

In [120]:
%%R
#myRF.group$model.rf$confusion.matrix
## Save the grouped random forest (myRF.group, not the ungrouped myRF) so it
## can be reloaded without refitting. The file is labelled dataset6 to match
## the dataset analysed in this notebook.
save(myRF.group, file = "myRFgroupobject_dataset6.Rdat")
#load(file = "myRFgroupobject_dataset6.Rdat")

In [167]:
%%R
myRF.group


Call:
 abcrf(formula = Models ~ ., data = Trainingdata, group = list("Tsuga_1_MSFS", "Tsuga_2_MSFS", "Tsuga_3_MSFS", "Tsuga_4_MSFS", "Tsuga_5_MSFS", "Tsuga_6_MSFS", "Tsuga_7_MSFS", c("Tsuga_8_MSFS", "Tsuga_9_MSFS", "Tsuga_10_MSFS", "Tsuga_11_MSFS")), ntree = 1000, paral = TRUE) 
includes the axes of a preliminary LDA

Number of simulations: 110000
Out-of-bag prior error rate: 19.8755%

Confusion matrix:
     g1   g2   g3   g4   g5   g6   g7    g8 class.error
g1 5269    0    2    0    0    0    0  4729    0.473100
g2    0 6533   60 2821    0  297  289     0    0.346700
g3    3    2 6954  195    0    3  345  2498    0.304600
g4    0 1824  270 7393    0   59  444    10    0.260700
g5    0    0    0    0 9022  978    0     0    0.097800
g6    0  385    0  186  914 8456   59     0    0.154400
g7    0   35  307  374    0   39 9235    10    0.076500
g8 2992    1 1684   15    0    0   33 35275    0.118125


In [156]:
%%R
## Prerequisite: binSFS.py must have been run first.
## Prepare the observed .obs file for prediction. The three settings below
## repeat the values assigned earlier in the notebook; nclasses and threshold
## match the values passed to makeprior().
nclasses <- 5
observedSFS <- 'jSFS_dataset6_noMono'
traitsfile <- 'traits_d6.txt'

myobserved <- prepobserved(
  observedSFS,
  FullPrior,
  ReducedPrior,
  nclasses,
  obsspecies,
  traitsfile=traitsfile,
  threshold = 100)

In [157]:
%%R
myobserved

          V1       V2        V3        V4       V6       V7       V8        V9
1   938.3114 71.99542 11.549646 2.9496222 55.05784 36.95833 20.35965 0.4827386
2   939.1025 72.71593 15.198279 1.0050245 54.42085 34.36286 21.34412 0.5533557
3   947.7157 70.05472  9.120228 1.4596510 51.88699 38.97412 19.87204 0.1366559
4   956.0065 70.28340 10.568827 1.8702985 43.33592 30.29948 23.44893 0.4068286
5   947.6454 78.84843 13.290488 2.1885382 51.56210 31.38108 20.72729 0.3259515
6   959.2649 65.63672 13.723865 1.1182794 54.18302 34.38201 16.74719 0.2626421
7   945.1911 71.61207  9.539433 2.5900385 47.72937 36.02498 23.86894 0.3595576
8   947.8787 76.39096 11.184936 2.0282886 56.78397 38.56894 19.71968 0.1320289
9   955.2316 67.34969 11.255971 1.1085501 57.97081 31.24752 19.11904 0.4879104
10  944.9742 68.71992 10.940401 1.6170663 56.29953 32.64478 20.38686 0.2057385
11  961.4201 67.58081  9.872777 2.9198460 50.41097 32.02563 20.20705 0.2596335
12  942.7472 68.74957 12.790712 4.0505323 48.24506 3

In [158]:
%%R
prediction<-c()

In [159]:
%%R
## Make predictions for the observed data with the ungrouped ABC random forest
## (myRF), using delimitR's RF_predict_abcrf().
## Only the first 100 rows of myobserved are classified.
## NOTE(review): the final argument (500) is presumably the number of trees --
## confirm against the delimitR documentation.
prediction <- RF_predict_abcrf(myRF, myobserved[1:100,], ReducedPrior, FullPrior, 500)
prediction


    selected model votes model1 votes model2 votes model3 votes model4
1     Tsuga_3_MSFS            0            0          789            2
2     Tsuga_3_MSFS            0            0          819            1
3     Tsuga_3_MSFS            0            0          736            1
4     Tsuga_3_MSFS            0            0          504            1
5     Tsuga_3_MSFS            0            0          776            4
6     Tsuga_3_MSFS            0            1          496            9
7     Tsuga_3_MSFS            0            0          501            1
8     Tsuga_3_MSFS            0            0          854            4
9     Tsuga_3_MSFS            0            0          747            4
10    Tsuga_3_MSFS            0            0          762            1
11    Tsuga_3_MSFS            0            0          466            5
12   Tsuga_10_MSFS            0            0          338            2
13    Tsuga_3_MSFS            0            0          579            0
14    

In [160]:
%%R
write.table(as.matrix(prediction), sep="\t", file="PredictionOut_Dataset6.csv")

In [168]:
%%R
prediction <- RF_predict_abcrf(myRF.group, myobserved[1:100,], ReducedPrior, FullPrior, 500)
prediction


    selected group votes group1 votes group2 votes group3 votes group4
1               g3            0            0          813            3
2               g3            0            0          825            2
3               g3            0            0          803            1
4               g8            0            0          495            2
5               g3            0            0          846            4
6               g3            0            0          560           30
7               g8            0            0          437            0
8               g3            0            0          885            2
9               g3            0            0          752            6
10              g3            0            0          759            2
11              g8            0            0          458            5
12              g8            0            0          273            4
13              g3            0            0          501            5
14    

In [169]:
%%R
write.table(as.matrix(prediction), sep="\t", file="PredictionOutGroup_Dataset6.csv")