# GWSS Consitutive Promoter Ortholog Search

By: Cassie Ettinger
Email: cassandra.ettinger@ucr.edu

In [1]:
#loads some basic os/ipython functionality
from os import chdir, mkdir
from os.path import join
from IPython.display import FileLinks, FileLink
from Bio import Phylo

## Data processing

Get orthologs from flybase / white fly genomes (bga)

In [None]:
# First - turn multiline fasta into single line fasta file
!awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);}  END {printf("\n");}'
< data/promoter_prot.fasta > data/promoter_prot.fixed.fasta

In [None]:
#split genes into seperate fasta files
for gene in $(cat genes.txt);
do grep -A 1 '>'$gene data/promoter_prot.fixed.fasta | sed '/^--$/d' > $gene'.fasta'; 
done

## Run phmmer to identify ortholog candidates

In [None]:
for gene in $(cat genes.txt);
do phmmer --tblout $gene'phmmer_out_table' -o $gene'_phmmer_out' $gene'.fasta' data/Homalodisca_vitripennis_A6A7A9_masurca_v1_ragtag_v1.proteins.fixed.fa;
done                   

In [None]:
#clean up
for gene in $(cat genes.txt);
do mkdir $gene'_results';
done

In [None]:
for gene in $(cat genes.txt);
do mv $gene* $gene'_results';
done
#will error about the results folders but thats OK

## Look at phmmer results

In [2]:
!cat PolyubiquitinA_results/PolyubiquitinAphmmer_out_table

#                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
HOVITMR_108590-T1    -          PolyubiquitinA       -                  0 2728.0  87.3  2.1e-140  467.4   7.8   4.0   1   1   5   6   6   6   6 HOVITMR_108590
HOVITMR_108371-T1    -          PolyubiquitinA       -                  0 1369.3  40.3  1.7e-140  467.7   7.7   3.0   1   1   2   3   3   3   3 HOVITMR_108371
HOVITMR_038328-T1    -          PolyubiquitinA       -            7.2e-12   47.6   0.0   1.4e-11   46.7   0.0   1.3   1   0   0   1   1   1   1 HOVITMR_038328
HOVITMR_013968-T1    -          Polyubiqu

HOVITMR_108590, HOVITMR_108371

In [3]:
!cat Actin1_results/Actin1phmmer_out_table

#                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
HOVITMR_029566-T1    -          Actin1               -           1.2e-256  852.9   0.1  1.3e-256  852.7   0.1   1.0   1   0   0   1   1   1   1 HOVITMR_029566
HOVITMR_054039-T1    -          Actin1               -           5.5e-255  847.4   0.0  6.1e-255  847.3   0.0   1.0   1   0   0   1   1   1   1 HOVITMR_054039
HOVITMR_054038-T1    -          Actin1               -           5.2e-254  844.2   0.0  5.8e-254  844.0   0.0   1.0   1   0   0   1   1   1   1 HOVITMR_054038
HOVITMR_045793-T1    -          Actin1   

try HOVITMR_029566 HOVITMR_054039 HOVITMR_054038 HOVITMR_045793

In [4]:
!cat Actin2_results/Actin2phmmer_out_table

#                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
HOVITMR_054039-T1    -          Actin2               -           8.2e-255  846.8   0.0  9.2e-255  846.7   0.0   1.0   1   0   0   1   1   1   1 HOVITMR_054039
HOVITMR_029566-T1    -          Actin2               -           4.1e-254  844.5   0.0  4.6e-254  844.4   0.0   1.0   1   0   0   1   1   1   1 HOVITMR_029566
HOVITMR_054038-T1    -          Actin2               -           3.9e-253  841.3   0.0  4.3e-253  841.2   0.0   1.0   1   0   0   1   1   1   1 HOVITMR_054038
HOVITMR_045793-T1    -          Actin2   

same hits HOVITMR_029566 HOVITMR_054039 HOVITMR_054038 HOVITMR_045793 - should combine Actin1 & Actin2 + hits to see

In [5]:
!cat CandidateList.txt

PolyubiquitinA	HOVITMR_108590
PolyubiquitinA	HOVITMR_108371
Actin1	HOVITMR_029566 
Actin1	HOVITMR_054039 
Actin1	HOVITMR_054038 
Actin1	HOVITMR_045793
Actin2	HOVITMR_029566
Actin2	HOVITMR_054039
Actin2	HOVITMR_054038
Actin2	HOVITMR_045793

## Protein alignments against reference

In [None]:
#split by gene name
for gene in $(cat genes.txt);
do grep $gene CandidateList.txt | cut -f 2 > $gene'.hits.txt';
done

In [None]:
#get sequences for hits
for gene in $(cat genes.txt);
    for hit in $(cat $gene'.hits.txt');
    do grep -A 1 '>'$hit data/Homalodisca_vitripennis_A6A7A9_masurca_v1_ragtag_v1.proteins.fixed.fa | sed '/^--$/d' >> $gene'.hits.fasta';
done

In [None]:
#combine hits and genes into single fasta
for gene in $(cat genes.txt);
do cat $gene'_results'/$gene'.fasta' $gene'.hits.fasta' > $gene'.aln.fasta'; 
done

In [None]:
#align
for gene in $(cat genes.txt);
do muscle -in $gene'.aln.fasta' -out $gene'.aln' -clw;
done

In [None]:
#output for trees
for gene in $(cat genes.txt);
do muscle -in $gene'.aln.fasta' -out $gene'.aln.tre.fasta';
done

In [None]:
#make protein trees
for gene in $(cat genes.txt);
do FastTree $gene'.aln.tre.fasta' > $gene'.aln.tre';
done

In [None]:
#clean up output
for gene in $(cat genes.txt);
do mv $gene* $gene'_results';
done
#will error about the results folders but thats OK

## Look at results

### PolyubiquitinA

In [6]:
!cat PolyubiquitinA_results/PolyubiquitinA.aln

MUSCLE (3.8) multiple sequence alignment


PolyubiquitinA         ------------------------------------------------------------
HOVITMR_108590-T1      MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYN
HOVITMR_108371-T1      ------------------------------------------------------------
                                                                                   

PolyubiquitinA         ------------------------------------------------------------
HOVITMR_108590-T1      IQKESTLHLVLRLRGGMQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLI
HOVITMR_108371-T1      ------------------------------------------------------------
                                                                                   

PolyubiquitinA         ------------------------------------------------------------
HOVITMR_108590-T1      FAGKQLEDGRTLSDYNIQKESTLHLVLRLRGGMQIFVKTLTGKTITLEVEPSDTIENVKA
HOVITMR_108371-T1      ------------------------------------------------------------
               

Both look like polyubiqA

In [7]:
!cat Actin1_results/Actin1.aln

MUSCLE (3.8) multiple sequence alignment


HOVITMR_045793-T1      --------------------------------------------MVGMGQKDSYVGDEAQ
HOVITMR_054038-T1      MCDDEVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
HOVITMR_054039-T1      MCDEEVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
Actin1                 MCDDELAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRYQGIMVGMGQKDSYVGDEAQ
HOVITMR_029566-T1      MCDDDVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRYQGIMVGMGQKDSYVGDEAQ
                                                                   ****************

HOVITMR_045793-T1      SKRGILTLKYPIEHGIITNWDDMEKIWHHTFYNELRVAPEEHPILLTEAPLNPKANREKM
HOVITMR_054038-T1      SKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKM
HOVITMR_054039-T1      SKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKM
Actin1                 SKRGVLTVKYPIEHGIVSNWDDMEKIWHHTFYNELRIAPEEHPVLLTEAPLNPKANREKM
HOVITMR_029566-T1      SKRGILTVKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKM
                 

In [8]:
!cat Actin2_results/Actin2.aln

MUSCLE (3.8) multiple sequence alignment


Actin2                 MCDDDVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
HOVITMR_045793-T1      --------------------------------------------MVGMGQKDSYVGDEAQ
HOVITMR_054038-T1      MCDDEVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
HOVITMR_054039-T1      MCDEEVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
HOVITMR_029566-T1      MCDDDVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRYQGIMVGMGQKDSYVGDEAQ
                                                                   ****************

Actin2                 SKRGILTLKYPIEHGIITNWDDMEKIWHHTFYNELRVAPEEHPILLTEAPLNPKANREKM
HOVITMR_045793-T1      SKRGILTLKYPIEHGIITNWDDMEKIWHHTFYNELRVAPEEHPILLTEAPLNPKANREKM
HOVITMR_054038-T1      SKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKM
HOVITMR_054039-T1      SKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKM
HOVITMR_029566-T1      SKRGILTVKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKM
                 

Combine Actin1 & 2 and re-do alignment

In [9]:
!cat Actin1_results/Actin1.fasta Actin1_results/Actin1.hits.fasta Actin2_results/Actin2.fasta > Actin.combined.fasta

In [None]:
#align
muscle -in Actin.combined.fasta -out Actin.combined.aln -clw

#align and get tree
muscle -in Actin.combined.fasta -out Actin.combined.aln.tre.fasta
FastTree Actin.combined.aln.tre.fasta > Actin.combined.aln.tre

#clean up
mkdir Actin.combined.results
mv Actin.combined* Actin.combined.results/

In [10]:
#look at combined results
!cat Actin.combined.results/Actin.combined.aln

MUSCLE (3.8) multiple sequence alignment


HOVITMR_045793-T1      --------------------------------------------MVGMGQKDSYVGDEAQ
Actin2                 MCDDDVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
HOVITMR_054038-T1      MCDDEVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
HOVITMR_054039-T1      MCDEEVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
Actin1                 MCDDELAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRYQGIMVGMGQKDSYVGDEAQ
HOVITMR_029566-T1      MCDDDVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRYQGIMVGMGQKDSYVGDEAQ
                                                                   ****************

HOVITMR_045793-T1      SKRGILTLKYPIEHGIITNWDDMEKIWHHTFYNELRVAPEEHPILLTEAPLNPKANREKM
Actin2                 SKRGILTLKYPIEHGIITNWDDMEKIWHHTFYNELRVAPEEHPILLTEAPLNPKANREKM
HOVITMR_054038-T1      SKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKM
HOVITMR_054039-T1      SKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKM
Actin1           

In [11]:
tree = Phylo.read("Actin.combined.results/Actin.combined.aln.tre", "newick")
Phylo.draw_ascii(tree)

                                                      , HOVITMR_045793-T1
  ____________________________________________________|
 |                                                    | Actin2
 |
_|_____ HOVITMR_054039-T1
 |
 |     ____________ HOVITMR_054038-T1
 |____|
      |                        _____________________________ Actin1
      |_______________________|
                              |__________ HOVITMR_029566-T1



In [12]:
tree = Phylo.read("Actin.combined.results/Actin.combined.aln.tre", "newick")
#root to Actin1
tree.root_with_outgroup({"name": "Actin1"}) 
Phylo.draw_ascii(tree)

                                                           , HOVITMR_045793-T1
                                ___________________________|
                             __|                           | Actin2
                            |  |
                 ___________|  |__ HOVITMR_054039-T1
                |           |
  ______________|           |_____ HOVITMR_054038-T1
 |              |
_|              |_____ HOVITMR_029566-T1
 |
 | Actin1



In [13]:
tree = Phylo.read("Actin.combined.results/Actin.combined.aln.tre", "newick")
#root to Actin2 
tree.root_with_outgroup({"name": "Actin2"}) 
Phylo.draw_ascii(tree)

                              __ HOVITMR_054039-T1
  ___________________________|
 |                           |   _____ HOVITMR_054038-T1
 |                           |__|
 |                              |            _______________ Actin1
 |                              |___________|
 |                                          |_____ HOVITMR_029566-T1
_|
 | HOVITMR_045793-T1
 |
 | Actin2



I wonder if there is an Actin 3 / 4??? 
Going to check white fly database and if so download

In [14]:
#combine with alt actins
!cat Actin.combined.results/Actin.combined.fasta data/whitefly_other_Actin.fasta > Actin.combined.results/Actin.all.fasta

In [None]:
#align
muscle -in Actin.combined.results/Actin.all.fasta -out Actin.combined.results/Actin.all.aln -clw

#align and get tree
muscle -in Actin.combined.results/Actin.all.fasta -out Actin.combined.results/Actin.all.aln.tre.fasta
FastTree Actin.combined.results/Actin.all.aln.tre.fasta > Actin.combined.results/Actin.all.aln.tre

In [15]:
!cat Actin.combined.results/Actin.all.aln

MUSCLE (3.8) multiple sequence alignment


Actin_Bta10561         -MTGRLPA---CVIDVGTGYTKLGFACNKEPQFIIPSAIAIKETAKVGEQSIRRLTKGVE
Actin_Bta01470         MEAYDVIANQPVVIDNGSGVIKAGFAGDQVPKCCFPNY--------VGRPKHVRVMAGAL
HOVITMR_045793-T1      -------------------------------------------------------MVGMG
Actin2                 MCDDDVAA---LVVDNGSGMCKAGFAGDDAPRAVFPSI--------VGRPRHQGVMVGMG
HOVITMR_054038-T1      MCDDEVAA---LVVDNGSGMCKAGFAGDDAPRAVFPSI--------VGRPRHQGVMVGMG
HOVITMR_054039-T1      MCDEEVAA---LVVDNGSGMCKAGFAGDDAPRAVFPSI--------VGRPRHQGVMVGMG
Actin4                 MCDEEVAA---LVVDNGSGMCKAGFAGDDAPRAVFPSI--------VGRPRHQGVMVGMG
Actin1                 MCDDELAA---LVVDNGSGMCKAGFAGDDAPRAVFPSI--------VGRPRYQGIMVGMG
HOVITMR_029566-T1      MCDDDVAA---LVVDNGSGMCKAGFAGDDAPRAVFPSI--------VGRPRYQGIMVGMG
                                                                                *  

Actin_Bta10561         DLDFYIGDEATNAKG-YAVKYPVRHGLVEDWDLMEKFLEQCI-FKYLRADPEDHYFLLTE
Actin_Bta01470   

In [16]:
tree = Phylo.read("Actin.combined.results/Actin.all.aln.tre", "newick")
Phylo.draw_ascii(tree)

              _____________________________________________ Actin_Bta10561
  ___________|
 |           |______________________ Actin_Bta01470
 |
 , HOVITMR_045793-T1
 |
_| Actin2
 |
 , HOVITMR_054038-T1
 |
 |, Actin1
 ||
 || HOVITMR_029566-T1
 |
 , HOVITMR_054039-T1
 |
 | Actin4



In [17]:
#just including Actin4 
#combine with alt actins
!cat Actin.combined.results/Actin.combined.fasta data/Actin4.fasta > Actin.combined.results/Actin.with4.fasta

In [None]:
#align
muscle -in Actin.combined.results/Actin.with4.fasta -out Actin.combined.results/Actin.with4.aln -clw

#align and get tree
muscle -in Actin.combined.results/Actin.with4.fasta -out Actin.combined.results/Actin.with4.aln.tre.fasta
FastTree Actin.combined.results/Actin.with4.aln.tre.fasta > Actin.combined.results/Actin.with4.aln.tre

In [18]:
!cat Actin.combined.results/Actin.with4.aln

MUSCLE (3.8) multiple sequence alignment


HOVITMR_045793-T1      --------------------------------------------MVGMGQKDSYVGDEAQ
Actin2                 MCDDDVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
HOVITMR_054038-T1      MCDDEVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
HOVITMR_054039-T1      MCDEEVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
Actin4                 MCDEEVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
Actin1                 MCDDELAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRYQGIMVGMGQKDSYVGDEAQ
HOVITMR_029566-T1      MCDDDVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRYQGIMVGMGQKDSYVGDEAQ
                                                                   ****************

HOVITMR_045793-T1      SKRGILTLKYPIEHGIITNWDDMEKIWHHTFYNELRVAPEEHPILLTEAPLNPKANREKM
Actin2                 SKRGILTLKYPIEHGIITNWDDMEKIWHHTFYNELRVAPEEHPILLTEAPLNPKANREKM
HOVITMR_054038-T1      SKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKM
HOVITMR_054039-T1

In [19]:
tree = Phylo.read("Actin.combined.results/Actin.with4.aln.tre", "newick")
Phylo.draw_ascii(tree)

 , HOVITMR_054039-T1
 |
 | Actin4
_|
 |                                                    , HOVITMR_045793-T1
 |     _______________________________________________|
 |    |                                               | Actin2
 |____|
      |     ___________ HOVITMR_054038-T1
      |____|
           |                      __________________________ Actin1
           |_____________________|
                                 |_________ HOVITMR_029566-T1



In [20]:
tree = Phylo.read("Actin.combined.results/Actin.with4.aln.tre", "newick")
#root to Actin2 
tree.root_with_outgroup({"name": "Actin2"}) 
Phylo.draw_ascii(tree)

                                , HOVITMR_054039-T1
                              __|
                             |  | Actin4
  ___________________________|
 |                           |   _____ HOVITMR_054038-T1
 |                           |__|
 |                              |            _______________ Actin1
 |                              |___________|
 |                                          |_____ HOVITMR_029566-T1
_|
 | HOVITMR_045793-T1
 |
 | Actin2



Actin2 = HOVITMR_045793
Actin4 = HOVITMR_054039
Actin1 = HOVITMR_029566

And possible putative Actin HOVITMR_054038

Need to reverse search uniprot or similar to confirm, may provide some info about that putative one since didn't match any from white fly

### PolyUbiquitinA with additonal genes from white fly


In [None]:
!awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);}  END {printf("\n");}'
< data/Poly.fasta > data/Poly.fixed.fasta

In [None]:
phmmer --tblout PolyubiquitinA_all_phmmer_out_table -o PolyubiquitinA_all_phmmer_out data/Poly.fixed.fasta data/Homalodisca_vitripennis_A6A7A9_masurca_v1_ragtag_v1.proteins.fixed.fa          

In [2]:
mkdir PolyubiquitinA_all_results
mv PolyubiquitinA_all* PolyubiquitinA_all_results

SyntaxError: invalid syntax (<ipython-input-2-96d0e9e9eab3>, line 1)

In [21]:
!cat PolyubiquitinA_all_results/PolyubiquitinA_all_phmmer_out_table

#                                                                  --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name        accession  query name              accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#------------------- ----------    -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
HOVITMR_108590-T1    -          PolyubiquitinA_Bta02851 -                  0 2728.0  87.3  2.1e-140  467.4   7.8   4.0   1   1   5   6   6   6   6 HOVITMR_108590
HOVITMR_108371-T1    -          PolyubiquitinA_Bta02851 -                  0 1369.3  40.3  1.7e-140  467.7   7.7   3.0   1   1   2   3   3   3   3 HOVITMR_108371
HOVITMR_038328-T1    -          PolyubiquitinA_Bta02851 -            7.2e-12   47.6   0.0   1.4e-11   46.7   0.0   1.3   1   0   0   1   1   1   1 HOVITMR_038328
HOVITMR_013968-T1    - 

In [22]:
#combine with prev polyubiq
!cat data/Poly.fixed.fasta PolyubiquitinA_results/PolyubiquitinA.hits.fasta > PolyubiquitinA_all_results/PolyubiquitinA_all.fasta

In [None]:
#align
muscle -in PolyubiquitinA_all_results/PolyubiquitinA_all.fasta -out PolyubiquitinA_all_results/PolyubiquitinA_all.aln -clw

#align and get tree
muscle -in PolyubiquitinA_all_results/PolyubiquitinA_all.fasta -out PolyubiquitinA_all_results/PolyubiquitinA_all.aln.tre.fasta
FastTree PolyubiquitinA_all_results/PolyubiquitinA_all.aln.tre.fasta > PolyubiquitinA_all_results/PolyubiquitinA_all.aln.tre

In [23]:
!cat PolyubiquitinA_all_results/PolyubiquitinA_all.aln

MUSCLE (3.8) multiple sequence alignment


PolyubiquitinA_FBpp0073035      ------------------------------------------------------------
HOVITMR_108371-T1               ------------------------------------------------------------
HOVITMR_108590-T1               ------------------------------------------------------------
PolyubiquitinA_Bta02851         ------------------------------------------------------------
PolyubiquitinA_Bta02020         MQIFVKTLTGKTITLEVEASDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYN
PolyubiquitinA_Ssa02069         ------------------------------------------------------------
                                                                                            

PolyubiquitinA_FBpp0073035      ------------------------------------------------------------
HOVITMR_108371-T1               ------------------------------------------------------------
HOVITMR_108590-T1               ------------------------------------------------------------
Polyubiquiti

In [24]:
tree = Phylo.read("PolyubiquitinA_all_results/PolyubiquitinA_all.aln.tre", "newick")
Phylo.draw_ascii(tree)

 , HOVITMR_108371-T1
 |
_| HOVITMR_108590-T1
 |
 |     , PolyubiquitinA_FBpp0073035
 |_____|
       |                                 __________ PolyubiquitinA_Bta02851
       |________________________________|
                                        | _ PolyubiquitinA_Bta02020
                                        ||
                                         |_ PolyubiquitinA_Ssa02069



# Additional promoters 

In [None]:
for gene in $(cat moregenes.txt);
do phmmer --tblout $gene'_phmmer_out_table' -o $gene'_phmmer_out' data/$gene'.fasta' data/Homalodisca_vitripennis_A6A7A9_masurca_v1_ragtag_v1.proteins.fixed.fa;
done  

In [None]:
#clean up
for gene in $(cat moregenes.txt);
do mkdir $gene'_results';
done

In [None]:
for gene in $(cat moregenes.txt);
do mv $gene* $gene'_results';
done
#will error about the results folders but thats OK

In [25]:
!cat exu_results/exu_phmmer_out_table

#                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
HOVITMR_010109-T1    -          exu                  -            3.6e-82  279.3   1.1   4.9e-82  278.9   1.1   1.0   1   0   0   1   1   1   1 HOVITMR_010109
HOVITMR_074143-T1    -          exu                  -               0.53   12.2   0.0       5.9    8.7   0.0   1.9   1   1   1   2   2   2   0 HOVITMR_074143
HOVITMR_016445-T1    -          exu                  -                1.4   10.7   3.5        52    5.6   0.2   2.4   2   0   0   2   2   2   0 HOVITMR_016445
HOVITMR_010109-T1    -          exu      

HOVITMR_010109

In [26]:
!cat vasa_results/vasa_phmmer_out_table

#                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
HOVITMR_068568-T1    -          vasa                 -              2e-89  303.4   0.0     2e-89  303.4   0.0   2.0   2   0   0   2   2   1   1 HOVITMR_068568
HOVITMR_068569-T1    -          vasa                 -            5.1e-89  302.0   0.0   6.9e-89  301.6   0.0   1.1   1   0   0   1   1   1   1 HOVITMR_068569
HOVITMR_025791-T1    -          vasa                 -            2.4e-88  299.8   0.0   3.4e-88  299.3   0.0   1.1   1   0   0   1   1   1   1 HOVITMR_025791
HOVITMR_063548-T1    -          vasa     

HOVITMR_068568 HOVITMR_068569 HOVITMR_025791 HOVITMR_063548 HOVITMR_020496 HOVITMR_044401 HOVITMR_006672 HOVITMR_020497

In [28]:
!cat tubulin_results/tubulin_phmmer_out_table

#                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
HOVITMR_005648-T1    -          tubulin              -           2.2e-263  876.0   0.1    3e-263  875.6   0.1   1.0   1   0   0   1   1   1   1 HOVITMR_005648
HOVITMR_064570-T1    -          tubulin              -           2.6e-254  846.1   0.0  3.2e-254  845.8   0.0   1.0   1   0   0   1   1   1   1 HOVITMR_064570
HOVITMR_048822-T1    -          tubulin              -           2.5e-249  829.7   0.1  3.1e-249  829.4   0.1   1.0   1   0   0   1   1   1   1 HOVITMR_048822
HOVITMR_027853-T1    -          tubulin  

HOVITMR_005648 HOVITMR_064570 HOVITMR_048822 HOVITMR_027853 HOVITMR_031071 HOVITMR_077363 HOVITMR_073055

In [29]:
!cat CandidateList_moregenes.txt

exu	HOVITMR_010109
vasa	HOVITMR_068568 
vasa	HOVITMR_068569 
vasa	HOVITMR_025791 
vasa	HOVITMR_063548 
vasa	HOVITMR_020496 
vasa	HOVITMR_044401 
vasa	HOVITMR_006672
vasa	HOVITMR_020497
tubulin	HOVITMR_005648 
tubulin	HOVITMR_064570 
tubulin	HOVITMR_048822 
tubulin	HOVITMR_027853 
tubulin	HOVITMR_031071 
tubulin	HOVITMR_077363 
tubulin	HOVITMR_073055


In [None]:
#split by gene name
for gene in $(cat moregenes.txt);
do grep $gene CandidateList_moregenes.txt | cut -f 2 > $gene'.hits.txt';
done

In [None]:
#get sequences for hits
for gene in $(cat moregenes.txt);
    for hit in $(cat $gene'.hits.txt');
    do grep -A 1 '>'$hit data/Homalodisca_vitripennis_A6A7A9_masurca_v1_ragtag_v1.proteins.fixed.fa | sed '/^--$/d' >> $gene'.hits.fasta';
done

In [None]:
#combine hits and genes into single fasta
for gene in $(cat moregenes.txt);
do cat data/$gene'.fasta' $gene'.hits.fasta' > $gene'.aln.fasta'; 
done

In [None]:
#align
for gene in $(cat moregenes.txt);
do muscle -in $gene'.aln.fasta' -out $gene'.aln' -clw;
done

In [None]:
#output for trees
for gene in $(cat moregenes.txt);
do muscle -in $gene'.aln.fasta' -out $gene'.aln.tre.fasta';
done

In [None]:
#make protein trees
for gene in $(cat moregenes.txt);
do FastTree $gene'.aln.tre.fasta' > $gene'.aln.tre';
done

In [None]:
#clean up output
for gene in $(cat moregenes.txt);
do mv $gene* $gene'_results';
done
#will error about the results folders but thats OK

In [30]:
!cat exu_results/exu.aln

MUSCLE (3.8) multiple sequence alignment


exu_FBpp0085555         MVADNIDAGVAIAVADQSSSPVGDKVELPA-----GNYILVGVDIDTTGRRLMDEIVQLA
exu_ACYPI005096-PA      MV---------------QNTEIVDVSNNTVTLDNLSDYMVIGWDIDTTGRRLLDEICHIA
HOVITMR_010109-T1       MV---------------STDKVVPKAGLPA-----GDYKIVAWDLDTTGRRLIDEFCHVA
                        **               ..  :      ..     .:* ::. *:*******:**: ::*

exu_FBpp0085555         AYTPTDHFEQYIMPYMNLNPAARQRHQVRVISIGFYRMLKSMQTYKIIKSKSEIAALKDF
exu_ACYPI005096-PA      GHTPNSQFNQYIMPHNDIDQVARRRHLLCTITMGRFRALKDIKNNKTLKSKSEISALAEF
HOVITMR_010109-T1       GYTPEDKFSQYVMPYKDLDLISKRRHQVRTVTVGKYRMLKDLKTGKFLKTKSEISALTDF
                        .:** .:*.**:**: :::  :..** : .:::* :* **.::. * :*:****:** :*

exu_FBpp0085555         LNWLEQLKTKAGPSSDGIVLIYHEERKFIPYMILESLKKYGLLERFTASVKSFANSINLA
exu_ACYPI005096-PA      IEWLEQMLKDNGKKMA--ILVCHEVSKFNTCLLIKSLLAYNLLDKFSEVVKGFANCHSFA
HOVITMR_010109-T1       LTWLEKVK---GDSKSGIILLNFESFKLAPSLLLEALKKYQLLDRFTNVVKGFSDCYAYV
    

In [31]:
tree = Phylo.read("exu_results/exu.aln.tre", "newick")
#root to fly copy 
tree.root_with_outgroup({"name": "exu_FBpp0085555"}) 
Phylo.draw_ascii(tree)

                                  ________________________ exu_ACYPI005096-PA
  _______________________________|
_|                               |__________________________ HOVITMR_010109-T1
 |
 | exu_FBpp0085555



In [32]:
!cat vasa_results/vasa.aln

MUSCLE (3.8) multiple sequence alignment


HOVITMR_025791-T1      MSSNRGGDNKPKGFGFSGFQVKSRSERANALPPPPNSALSKQGYS---------------
HOVITMR_063548-T1      MQHSHRESRTSANRILGIFQYNCRGKCQVLKRSCSPLVAAKHWMQALPASFSPLPAILCC
HOVITMR_006672-T1      ------------------------------------------------------------
HOVITMR_044401-T1      -------------------------------------MTSANLER---------------
HOVITMR_068568-T1      ------------------------------------------------------------
HOVITMR_068569-T1      ------------------------------------------------------------
HOVITMR_020497-T1      ------------------------------------------------------------
vasa_FBpp0401446       --------------------------------------MSDDWDD---------------
vasa_FBpp0401447       --------------------------------------MSDDWDD---------------
HOVITMR_020496-T1      ------------------------------------------------------------
                                                                                   

HOVITMR_025791-T1

In [33]:
tree = Phylo.read("vasa_results/vasa.aln.tre", "newick")
Phylo.draw_ascii(tree)

  ___________________________________________ HOVITM_011853-T1
 |
 |______________________________ HOVITM_021498-T1
 |
_|                    ___________________________________ HOVITM_065671-T1
 |          _________|
 |         |         |________________________ HOVITM_011049-T1
 |         |
 |_________|                                              , vasa_FBpp0401446
           |                             _________________|
           |      ______________________|                 | vasa_FBpp0401447
           |     |                      |
           |_____|                      |__________________ HOVITM_092677-T1
                 |
                 |                     ________ HOVITM_064625-T1
                 |____________________|
                                      |___________ HOVITM_064626-T1



Maybe HOVITM_092677

In [34]:
!cat tubulin_results/tubulin.aln

MUSCLE (3.8) multiple sequence alignment


tubuluin_tub2_FBpp0076678           -----------------------------------------MREIVTLQIGGAG-----N
HOVITM_075081-T1                    ----------------------------------------MMTCIIVIKV----------
HOVITM_094615-T1                    -----------------------------------MSLPVSPCPQCTCRTTPCSPRPDDA
HOVITM_099791-T1                    ------------------------------------------------------------
tubulin_beta97ef_FBpp0289838        -----------------------------------------MREIVHLQAGQCG-----N
tubulin_beta97ef_CLEC007800         -----------------------------------------MREIVHLQAGQCG-----N
HOVITM_075967-T1                    MNHTQSDFYGQFYWGPTSGVTRQSGASSPAACCNAHFLSAPLTPSLHLSIPTRGWW---N
tubulin_beta60d_ACYPI001007-PA      -----------------------------------------MREIVHLQAGQCG-----N
HOVITM_049113-T1                    ------------------------------------------------------------
tubulin_beta60d_FBpp0072177         ------------------------------------

In [37]:
tree = Phylo.read("tubulin_results/tubulin.aln.tre", "newick")
#root to fly copy 
tree.root_with_outgroup({"name": "tubuluin_tub2_FBpp0076678"}) 
Phylo.draw_ascii(tree)

                                , tubulin_beta85d_AAEL002851
                                |
                                , tubulin_beta56d2_FBpp0085720
                                |
                                |_ tubulin_beta56d_FBpp0085721
                               ,|
                               || HOVITM_074970-T1
                               ||
                               || tubulin_beta56d_ACYPI008874-PA
                               |
                               | ___ HOVITM_075967-T1
                              ,||
                              ||| tubulin_beta85d_FBpp0081524
                              ||
                              || __ tubulin_beta97ef_CLEC007800
                              |||
                              | | __ tubulin_beta97ef_FBpp0289838
                              | ||
                              |  |___________ HOVITM_094615-T1
                              |
                             _|_____ HOVITM_099791-

HOVITM_049113 = 60d
HOVITM_075967 = 85d
HOVITM094615 = beta97ef 
HOVITM_074970 = 56d
HOVITM_075081 = tub2, maybe 

## Problem - vasa, 2 genes?

In [None]:
#align protiens

muscle -in vasa_nucl_results/vasa_nucl.fasta -out vasa_nucl_results/vasa_nucl.aln -clw
muscle -in vasa_nucl_results/vasa_nucl.fasta -out vasa_nucl_results/vasa_nucl.aln.tre.fasta
FastTree vasa_nucl_results/vasa_nucl.aln.tre.fasta > vasa_nucl_results/vasa_nucl.aln.tre


In [1]:
!cat vasa_nucl_results/vasa_nucl.aln

MUSCLE (3.8) multiple sequence alignment


white_FBpp0070468      ATGGGCCAAGAGGATCAGGAGCTATTAATTCGCGGAGGCAGCAAACACCCATCTGCCGAG
HOVITMR_020497         ATGGTT--AGTGTATCAGGCAATAATGTTCCGGA----------------ACCTATCAGA
HOVITMR_020496         ATG---------------------------------------------------------
glimmerG_53977         ATG--------GTATCAGGCAATAATGTTCCGGA----------------ACCTATCAGA
                       ***                                                         

white_FBpp0070468      CATCTGAACAATGGTGACAGCGGAGCGGCTTCGCAGAGCTGCATTAACCAGGGCTTCGGG
HOVITMR_020497         GATTTTAATAGTGCT---------------------------------------------
HOVITMR_020496         ------------------------------------------------------------
glimmerG_53977         GATTTTAATAGTGCT---------------------------------------------
                                                                                   

white_FBpp0070468      CAGGCCAAAAACTACGGCACGCTCCGGCCACCCAGTCCGCCGGAGGACTCCGGTTCAGGG
HOVITMR_020497 