# GWSS Consitutive Promoter Ortholog Search

By: Cassie Ettinger
Email: cassandra.ettinger@ucr.edu

In [1]:
#loads some basic os/ipython functionality
from os import chdir, mkdir
from os.path import join
from IPython.display import FileLinks, FileLink
from Bio import Phylo

## Data processing

Get orthologs from flybase / white fly genomes (bga)

In [None]:
# First - turn multiline fasta into single line fasta file
!awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);}  END {printf("\n");}'
< data/promoter_prot.fasta > data/promoter_prot.fixed.fasta

In [None]:
#split genes into seperate fasta files
for gene in $(cat genes.txt);
do grep -A 1 '>'$gene data/promoter_prot.fixed.fasta | sed '/^--$/d' > $gene'.fasta'; 
done

## Run phmmer to identify ortholog candidates

In [None]:
for gene in $(cat genes.txt);
do phmmer --tblout $gene'phmmer_out_table' -o $gene'_phmmer_out' $gene'.fasta' data/Homalodisca_vitripennis_A6A7A9_masurca_v1.proteins.fixed.fa;
done                   

In [None]:
#clean up
for gene in $(cat genes.txt);
do mkdir $gene'_results';
done

In [None]:
for gene in $(cat genes.txt);
do mv $gene* $gene'_results';
done
#will error about the results folders but thats OK

## Look at phmmer results

In [2]:
!cat PolyubiquitinA_results/PolyubiquitinAphmmer_out_table

#                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
HOVITM_103732-T1     -          PolyubiquitinA       -              8e-12   47.6   0.0   1.5e-11   46.7   0.0   1.3   1   0   0   1   1   1   1 HOVITM_103732
HOVITM_024119-T1     -          PolyubiquitinA       -            1.8e-08   36.6   0.0   3.3e-08   35.7   0.0   1.3   1   0   0   1   1   1   1 HOVITM_024119
HOVITM_100390-T1     -          PolyubiquitinA       -            0.00036   22.6   0.0   0.00064   21.7   0.0   1.3   1   0   0   1   1   1   1 HOVITM_100390
HOVITM_087185-T1     -          Polyubiquiti

Try HOVITM_103732, HOVITM_024119 but likely want to grab more orthologs and try this search again

In [3]:
!cat Actin1_results/Actin1phmmer_out_table

#                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
HOVITM_043299-T1     -          Actin1               -           1.3e-256  852.9   0.1  1.5e-256  852.7   0.1   1.0   1   0   0   1   1   1   1 HOVITM_043299
HOVITM_022702-T1     -          Actin1               -           6.1e-255  847.4   0.0  6.8e-255  847.3   0.0   1.0   1   0   0   1   1   1   1 HOVITM_022702
HOVITM_022703-T1     -          Actin1               -           5.8e-254  844.2   0.0  6.5e-254  844.0   0.0   1.0   1   0   0   1   1   1   1 HOVITM_022703
HOVITM_100357-T1     -          Actin1      

try HOVITM_043299, HOVITM_022702, HOVITM_022703, HOVITM_100357

In [4]:
!cat Actin2_results/Actin2phmmer_out_table

#                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
HOVITM_022702-T1     -          Actin2               -           9.2e-255  846.8   0.0    1e-254  846.7   0.0   1.0   1   0   0   1   1   1   1 HOVITM_022702
HOVITM_043299-T1     -          Actin2               -           4.6e-254  844.5   0.0  5.1e-254  844.4   0.0   1.0   1   0   0   1   1   1   1 HOVITM_043299
HOVITM_022703-T1     -          Actin2               -           4.3e-253  841.3   0.0  4.8e-253  841.2   0.0   1.0   1   0   0   1   1   1   1 HOVITM_022703
HOVITM_100357-T1     -          Actin2      

Same four hits - HOVITM_043299, HOVITM_022702, HOVITM_022703, HOVITM_100357 - should combine Actin1 & Actin2 + hits to see

In [5]:
!cat CandidateList.txt

PolyubiquitinA	HOVITM_103732
PolyubiquitinA	HOVITM_024119
Actin1	HOVITM_043299
Actin1	HOVITM_022702
Actin1	HOVITM_022703
Actin1	HOVITM_100357
Actin2	HOVITM_043299
Actin2	HOVITM_022702
Actin2	HOVITM_022703
Actin2	HOVITM_100357

## Protein alignments against reference

In [None]:
#split by gene name
for gene in $(cat genes.txt);
do grep $gene CandidateList.txt | cut -f 2 > $gene'.hits.txt';
done

In [None]:
#get sequences for hits
for gene in $(cat genes.txt);
    for hit in $(cat $gene'.hits.txt');
    do grep -A 1 '>'$hit data/Homalodisca_vitripennis_A6A7A9_masurca_v1.proteins.fixed.fa | sed '/^--$/d' >> $gene'.hits.fasta';
done

In [None]:
#combine hits and genes into single fasta
for gene in $(cat genes.txt);
do cat $gene'_results'/$gene'.fasta' $gene'.hits.fasta' > $gene'.aln.fasta'; 
done

In [None]:
#align
for gene in $(cat genes.txt);
do muscle -in $gene'.aln.fasta' -out $gene'.aln' -clw;
done

In [None]:
#output for trees
for gene in $(cat genes.txt);
do muscle -in $gene'.aln.fasta' -out $gene'.aln.tre.fasta';
done

In [None]:
#make protein trees
for gene in $(cat genes.txt);
do FastTree $gene'.aln.tre.fasta' > $gene'.aln.tre';
done

In [None]:
#clean up output
for gene in $(cat genes.txt);
do mv $gene* $gene'_results';
done
#will error about the results folders but thats OK

## Look at results

### PolyubiquitinA

In [7]:
!cat PolyubiquitinA_results/PolyubiquitinA.aln

MUSCLE (3.8) multiple sequence alignment


PolyubiquitinA        --------------MQIFVKTLTGKTITLEVEASDTIENVKAKIQDKEGI----------
HOVITM_103732-T1      MSYKNGVNTDAQEQLTIFIETLTGTTFEVKVSPQDRVKTIKSKIQKVEGIPVSHQHLLYN
HOVITM_024119-T1      ---------------------------MIQVVP---------------------------
                                                  ::* .                           

PolyubiquitinA        --------------------------------PPD-------------QQRLIFAGKQLE
HOVITM_103732-T1      SKELQDSSCVTEPSVALHDQATVKLVLSLRGGPISLVHQAVPFNRNILKNLLKFNREELE
HOVITM_024119-T1      --------------------------------PPTMLPTFVPLVTQMHQPLLMHQAEERV
                                                      *               :  * .  ::  

PolyubiquitinA        D----GRTLSDYNIQKESTLHL--------------------------------------
HOVITM_103732-T1      EDLPPGCKMAILVLRVGDQLNLLHIVDDVDDSHDSLSNSRENLSIDSLEDEISKSVEENS
HOVITM_024119-T1      SE-PPAKKM-----RTEESLIPEHVF---------LGKTKSPVTF---------------
                      .   

Doesn't look like either is particularly good hit, as expected

In [11]:
!cat Actin1_results/Actin1.aln

MUSCLE (3.8) multiple sequence alignment


HOVITM_100357-T1      --------------------------------------------MVGMGQKDSYVGDEAQ
HOVITM_022703-T1      MCDDEVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
HOVITM_022702-T1      MCDEEVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
Actin1                MCDDELAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRYQGIMVGMGQKDSYVGDEAQ
HOVITM_043299-T1      MCDDDVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRYQGIMVGMGQKDSYVGDEAQ
                                                                  ****************

HOVITM_100357-T1      SKRGILTLKYPIEHGIITNWDDMEKIWHHTFYNELRVAPEEHPILLTEAPLNPKANREKM
HOVITM_022703-T1      SKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKM
HOVITM_022702-T1      SKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKM
Actin1                SKRGVLTVKYPIEHGIVSNWDDMEKIWHHTFYNELRIAPEEHPVLLTEAPLNPKANREKM
HOVITM_043299-T1      SKRGILTVKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKM
                      ****:*

In [12]:
!cat Actin2_results/Actin2.aln

MUSCLE (3.8) multiple sequence alignment


Actin2                MCDDDVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
HOVITM_100357-T1      --------------------------------------------MVGMGQKDSYVGDEAQ
HOVITM_022703-T1      MCDDEVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
HOVITM_022702-T1      MCDEEVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
HOVITM_043299-T1      MCDDDVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRYQGIMVGMGQKDSYVGDEAQ
                                                                  ****************

Actin2                SKRGILTLKYPIEHGIITNWDDMEKIWHHTFYNELRVAPEEHPILLTEAPLNPKANREKM
HOVITM_100357-T1      SKRGILTLKYPIEHGIITNWDDMEKIWHHTFYNELRVAPEEHPILLTEAPLNPKANREKM
HOVITM_022703-T1      SKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKM
HOVITM_022702-T1      SKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKM
HOVITM_043299-T1      SKRGILTVKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKM
                      ******

Combine Actin1 & 2 and re-do alignment

In [None]:
!cat Actin1_results/Actin1.fasta Actin1_results/Actin1.hits.fasta Actin2_results/Actin2.fasta > Actin.combined.fasta

In [None]:
#align
muscle -in Actin.combined.fasta -out Actin.combined.aln -clw

#align and get tree
muscle -in Actin.combined.fasta -out Actin.combined.aln.tre.fasta
FastTree Actin.combined.aln.tre.fasta > Actin.combined.aln.tre

#clean up
mkdir Actin.combined.results
mv Actin.combined* Actin.combined.results/

In [14]:
#look at combined results
!cat Actin.combined.results/Actin.combined.aln

MUSCLE (3.8) multiple sequence alignment


HOVITM_100357-T1      --------------------------------------------MVGMGQKDSYVGDEAQ
Actin2                MCDDDVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
HOVITM_022703-T1      MCDDEVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
HOVITM_022702-T1      MCDEEVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
Actin1                MCDDELAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRYQGIMVGMGQKDSYVGDEAQ
HOVITM_043299-T1      MCDDDVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRYQGIMVGMGQKDSYVGDEAQ
                                                                  ****************

HOVITM_100357-T1      SKRGILTLKYPIEHGIITNWDDMEKIWHHTFYNELRVAPEEHPILLTEAPLNPKANREKM
Actin2                SKRGILTLKYPIEHGIITNWDDMEKIWHHTFYNELRVAPEEHPILLTEAPLNPKANREKM
HOVITM_022703-T1      SKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKM
HOVITM_022702-T1      SKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKM
Actin1                SKRGVL

In [20]:
tree = Phylo.read("Actin.combined.results/Actin.combined.aln.tre", "newick")
Phylo.draw_ascii(tree)

                                                       , HOVITM_100357-T1
  _____________________________________________________|
 |                                                     | Actin2
 |
_|_____ HOVITM_022702-T1
 |
 |     ____________ HOVITM_022703-T1
 |____|
      |                        ______________________________ Actin1
      |_______________________|
                              |___________ HOVITM_043299-T1



In [18]:
tree = Phylo.read("Actin.combined.results/Actin.combined.aln.tre", "newick")
#root to Actin1
tree.root_with_outgroup({"name": "Actin1"}) 
Phylo.draw_ascii(tree)

                                                            , HOVITM_100357-T1
                                 ___________________________|
                              __|                           | Actin2
                             |  |
                 ____________|  |__ HOVITM_022702-T1
                |            |
  ______________|            |_____ HOVITM_022703-T1
 |              |
_|              |_____ HOVITM_043299-T1
 |
 | Actin1



In [19]:
tree = Phylo.read("Actin.combined.results/Actin.combined.aln.tre", "newick")
#root to Actin2 
tree.root_with_outgroup({"name": "Actin2"}) 
Phylo.draw_ascii(tree)

                              ___ HOVITM_022702-T1
  ___________________________|
 |                           |   ______ HOVITM_022703-T1
 |                           |__|
 |                              |             _______________ Actin1
 |                              |____________|
 |                                           |_____ HOVITM_043299-T1
_|
 | HOVITM_100357-T1
 |
 | Actin2



I wonder if there is an Actin 3 / 4??? 
Going to check white fly database and if so download

In [24]:
#combine with alt actins
!cat Actin.combined.results/Actin.combined.fasta data/whitefly_other_Actin.fasta > Actin.combined.results/Actin.all.fasta

In [None]:
#align
muscle -in Actin.combined.results/Actin.all.fasta -out Actin.combined.results/Actin.all.aln -clw

#align and get tree
muscle -in Actin.combined.results/Actin.all.fasta -out Actin.combined.results/Actin.all.aln.tre.fasta
FastTree Actin.combined.results/Actin.all.aln.tre.fasta > Actin.combined.results/Actin.all.aln.tre

In [25]:
!cat Actin.combined.results/Actin.all.aln

MUSCLE (3.8) multiple sequence alignment


Actin_Bta10561        -MTGRLPA---CVIDVGTGYTKLGFACNKEPQFIIPSAIAIKETAKVGEQSIRRLTKGVE
Actin_Bta01470        MEAYDVIANQPVVIDNGSGVIKAGFAGDQVPKCCFPNY--------VGRPKHVRVMAGAL
HOVITM_100357-T1      -------------------------------------------------------MVGMG
Actin2                MCDDDVAA---LVVDNGSGMCKAGFAGDDAPRAVFPSI--------VGRPRHQGVMVGMG
HOVITM_022703-T1      MCDDEVAA---LVVDNGSGMCKAGFAGDDAPRAVFPSI--------VGRPRHQGVMVGMG
HOVITM_022702-T1      MCDEEVAA---LVVDNGSGMCKAGFAGDDAPRAVFPSI--------VGRPRHQGVMVGMG
Actin4                MCDEEVAA---LVVDNGSGMCKAGFAGDDAPRAVFPSI--------VGRPRHQGVMVGMG
Actin1                MCDDELAA---LVVDNGSGMCKAGFAGDDAPRAVFPSI--------VGRPRYQGIMVGMG
HOVITM_043299-T1      MCDDDVAA---LVVDNGSGMCKAGFAGDDAPRAVFPSI--------VGRPRYQGIMVGMG
                                                                               *  

Actin_Bta10561        DLDFYIGDEATNAKG-YAVKYPVRHGLVEDWDLMEKFLEQCI-FKYLRADPEDHYFLLTE
Actin_Bta01470        EGEEFV

In [26]:
tree = Phylo.read("Actin.combined.results/Actin.all.aln.tre", "newick")
Phylo.draw_ascii(tree)

               _____________________________________________ Actin_Bta10561
  ____________|
 |            |______________________ Actin_Bta01470
 |
 , HOVITM_100357-T1
 |
_| Actin2
 |
 , HOVITM_022703-T1
 |
 |, Actin1
 ||
 || HOVITM_043299-T1
 |
 , HOVITM_022702-T1
 |
 | Actin4



In [27]:
#just including Actin4 
#combine with alt actins
!cat Actin.combined.results/Actin.combined.fasta data/Actin4.fasta > Actin.combined.results/Actin.with4.fasta

In [None]:
#align
muscle -in Actin.combined.results/Actin.with4.fasta -out Actin.combined.results/Actin.with4.aln -clw

#align and get tree
muscle -in Actin.combined.results/Actin.with4.fasta -out Actin.combined.results/Actin.with4.aln.tre.fasta
FastTree Actin.combined.results/Actin.with4.aln.tre.fasta > Actin.combined.results/Actin.with4.aln.tre

In [28]:
!cat Actin.combined.results/Actin.with4.aln

MUSCLE (3.8) multiple sequence alignment


HOVITM_100357-T1      --------------------------------------------MVGMGQKDSYVGDEAQ
Actin2                MCDDDVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
HOVITM_022703-T1      MCDDEVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
HOVITM_022702-T1      MCDEEVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
Actin4                MCDEEVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQ
Actin1                MCDDELAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRYQGIMVGMGQKDSYVGDEAQ
HOVITM_043299-T1      MCDDDVAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRYQGIMVGMGQKDSYVGDEAQ
                                                                  ****************

HOVITM_100357-T1      SKRGILTLKYPIEHGIITNWDDMEKIWHHTFYNELRVAPEEHPILLTEAPLNPKANREKM
Actin2                SKRGILTLKYPIEHGIITNWDDMEKIWHHTFYNELRVAPEEHPILLTEAPLNPKANREKM
HOVITM_022703-T1      SKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKM
HOVITM_022702-T1      SKRGIL

In [29]:
tree = Phylo.read("Actin.combined.results/Actin.with4.aln.tre", "newick")
Phylo.draw_ascii(tree)

 , HOVITM_022702-T1
 |
 | Actin4
_|
 |                                                     , HOVITM_100357-T1
 |     ________________________________________________|
 |    |                                                | Actin2
 |____|
      |      __________ HOVITM_022703-T1
      |_____|
            |                     ___________________________ Actin1
            |____________________|
                                 |__________ HOVITM_043299-T1



In [31]:
tree = Phylo.read("Actin.combined.results/Actin.with4.aln.tre", "newick")
#root to Actin2 
tree.root_with_outgroup({"name": "Actin2"}) 
Phylo.draw_ascii(tree)

                                 , HOVITM_022702-T1
                              ___|
                             |   | Actin4
  ___________________________|
 |                           |   ______ HOVITM_022703-T1
 |                           |__|
 |                              |             _______________ Actin1
 |                              |____________|
 |                                           |_____ HOVITM_043299-T1
_|
 | HOVITM_100357-T1
 |
 | Actin2



Actin2 = HOVITM_100357
Actin4 = HOVITM_022702
Actin1 = HOVITM_043299

And possible putative Actin HOVITM_022703

Need to reverse search uniprot or similar to confirm, may provide some info about that putative one since didn't match any from white fly

### PolyUbiquitinA with additonal genes from white fly


In [None]:
!awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);}  END {printf("\n");}'
< data/Poly.fasta > data/Poly.fixed.fasta

In [None]:
phmmer --tblout PolyubiquitinA_all_phmmer_out_table -o PolyubiquitinA_all_phmmer_out data/Poly.fixed.fasta data/Homalodisca_vitripennis_A6A7A9_masurca_v1.proteins.fixed.fa          

In [2]:
mkdir PolyubiquitinA_all_results
mv PolyubiquitinA_all* PolyubiquitinA_all_results

SyntaxError: invalid syntax (<ipython-input-2-96d0e9e9eab3>, line 1)

In [1]:
!cat PolyubiquitinA_all_results/PolyubiquitinA_all_phmmer_out_table

#                                                                  --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name        accession  query name              accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#------------------- ----------    -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
HOVITM_103732-T1     -          PolyubiquitinA_Bta02851 -              8e-12   47.6   0.0   1.5e-11   46.7   0.0   1.3   1   0   0   1   1   1   1 HOVITM_103732
HOVITM_024119-T1     -          PolyubiquitinA_Bta02851 -            1.8e-08   36.6   0.0   3.3e-08   35.7   0.0   1.3   1   0   0   1   1   1   1 HOVITM_024119
HOVITM_100390-T1     -          PolyubiquitinA_Bta02851 -            0.00036   22.6   0.0   0.00064   21.7   0.0   1.3   1   0   0   1   1   1   1 HOVITM_100390
HOVITM_087185-T1     -    

HOVITM_103732, HOVITM_024119

same two hits as before that weren't super great

In [None]:
#combine with prev polyubiq
!cat data/Poly.fixed.fasta PolyubiquitinA_results/PolyubiquitinA.hits.fasta > PolyubiquitinA_all_results/PolyubiquitinA_all.fasta

In [None]:
#align
muscle -in PolyubiquitinA_all_results/PolyubiquitinA_all.fasta -out PolyubiquitinA_all_results/PolyubiquitinA_all.aln -clw

#align and get tree
muscle -in PolyubiquitinA_all_results/PolyubiquitinA_all.fasta -out PolyubiquitinA_all_results/PolyubiquitinA_all.aln.tre.fasta
FastTree PolyubiquitinA_all_results/PolyubiquitinA_all.aln.tre.fasta > PolyubiquitinA_all_results/PolyubiquitinA_all.aln.tre

In [2]:
!cat PolyubiquitinA_all_results/PolyubiquitinA_all.aln

MUSCLE (3.8) multiple sequence alignment


PolyubiquitinA_FBpp0073035      MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYN
PolyubiquitinA_Bta02851         ------------------------------------------------------------
PolyubiquitinA_Bta02020         MQIFVKTLTGKTITLEVEASDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYN
PolyubiquitinA_Ssa02069         ------------------------------------------------------------
HOVITM_103732-T1                ------------------------------------------------------------
HOVITM_024119-T1                ------------------------------------------------------------
                                                                                            

PolyubiquitinA_FBpp0073035      IQKESTLHLVLRLRGGMQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLI
PolyubiquitinA_Bta02851         ------------------------------------------------------------
PolyubiquitinA_Bta02020         IQKESTLHLVLRLRGGMQIFVKTLTGKTITLEVEASDTIENVKAKIQDKEGIPPDQQRLI
Polyubiquiti

In [5]:
tree = Phylo.read("PolyubiquitinA_all_results/PolyubiquitinA_all.aln.tre", "newick")
Phylo.draw_ascii(tree)

 , PolyubiquitinA_Ssa02069
 |
 | PolyubiquitinA_Bta02020
 |
_| PolyubiquitinA_Bta02851
 |
 |                             ____________________ HOVITM_103732-T1
 |____________________________|
 |                            |_____________ HOVITM_024119-T1
 |
 | PolyubiquitinA_FBpp0073035



# Additional promoters 

In [None]:
for gene in $(cat moregenes.txt);
do phmmer --tblout $gene'_phmmer_out_table' -o $gene'_phmmer_out' data/$gene'.fasta' data/Homalodisca_vitripennis_A6A7A9_masurca_v1.proteins.fixed.fa;
done  

In [None]:
#clean up
for gene in $(cat moregenes.txt);
do mkdir $gene'_results';
done

In [None]:
for gene in $(cat moregenes.txt);
do mv $gene* $gene'_results';
done
#will error about the results folders but thats OK

In [3]:
!cat exu_results/exu_phmmer_out_table

#                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
HOVITM_102745-T1     -          exu                  -            4.1e-82  279.3   1.1   5.5e-82  278.9   1.1   1.0   1   0   0   1   1   1   1 HOVITM_102745
HOVITM_102745-T1     -          exu                  -            4.2e-60  206.8  10.9   9.1e-47  162.9   0.0   2.0   1   1   1   2   2   2   2 HOVITM_102745
#
# Program:         phmmer
# Version:         3.3.1 (Jul 2020)
# Pipeline mode:   SEARCH
# Query file:      data/exu.fasta
# Target file:     data/Homalodisca_vitripennis_A6A7A9_masurca_v1.proteins

HOVITM_102745

In [4]:
!cat vasa_results/vasa_phmmer_out_table

#                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
HOVITM_064625-T1     -          vasa                 -            2.3e-89  303.4   0.0   2.3e-89  303.4   0.0   2.0   2   0   0   2   2   1   1 HOVITM_064625
HOVITM_092677-T1     -          vasa                 -              3e-89  303.0   0.1   3.5e-89  302.7   0.1   1.0   1   0   0   1   1   1   1 HOVITM_092677
HOVITM_064626-T1     -          vasa                 -            5.7e-89  302.0   0.0   7.7e-89  301.6   0.0   1.1   1   0   0   1   1   1   1 HOVITM_064626
HOVITM_065671-T1     -          vasa        

HOVITM_064625, HOVITM_092677, HOVITM_064626, HOVITM_065671,HOVITM_011049, HOVITM_021498, HOVITM_011853

In [5]:
!cat tubulin_results/tubulin_phmmer_out_table

#                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
HOVITM_074970-T1     -          tubulin              -           6.3e-273  907.6   0.2  7.8e-273  907.3   0.2   1.0   1   0   0   1   1   1   1 HOVITM_074970
HOVITM_075967-T1     -          tubulin              -           1.2e-262  873.8   0.1  1.8e-262  873.2   0.1   1.2   1   0   0   1   1   1   1 HOVITM_075967
HOVITM_075081-T1     -          tubulin              -           9.8e-243  808.1   0.0  1.3e-242  807.7   0.0   1.0   1   0   0   1   1   1   1 HOVITM_075081
HOVITM_049113-T1     -          tubulin     

HOVITM_074970, HOVITM_075967, HOVITM_075081, HOVITM_049113, HOVITM_099791, HOVITM_094615, HOVITM_007409

In [7]:
!cat CandidateList_moregenes.txt

exu	HOVITM_102745
vasa	HOVITM_064625
vasa	HOVITM_092677
vasa	HOVITM_064626
vasa	HOVITM_065671
vasa	HOVITM_011049
vasa	HOVITM_021498
vasa	HOVITM_011853
tubulin	HOVITM_074970
tubulin	HOVITM_075967
tubulin	HOVITM_075081
tubulin	HOVITM_049113
tubluin	HOVITM_099791
tubluin	HOVITM_094615

In [None]:
#split by gene name
for gene in $(cat moregenes.txt);
do grep $gene CandidateList_moregenes.txt | cut -f 2 > $gene'.hits.txt';
done

In [None]:
#get sequences for hits
for gene in $(cat moregenes.txt);
    for hit in $(cat $gene'.hits.txt');
    do grep -A 1 '>'$hit data/Homalodisca_vitripennis_A6A7A9_masurca_v1.proteins.fixed.fa | sed '/^--$/d' >> $gene'.hits.fasta';
done

In [None]:
#combine hits and genes into single fasta
for gene in $(cat moregenes.txt);
do cat data/$gene'.fasta' $gene'.hits.fasta' > $gene'.aln.fasta'; 
done

In [None]:
#align
for gene in $(cat moregenes.txt);
do muscle -in $gene'.aln.fasta' -out $gene'.aln' -clw;
done

In [None]:
#output for trees
for gene in $(cat moregenes.txt);
do muscle -in $gene'.aln.fasta' -out $gene'.aln.tre.fasta';
done

In [None]:
#make protein trees
for gene in $(cat moregenes.txt);
do FastTree $gene'.aln.tre.fasta' > $gene'.aln.tre';
done

In [None]:
#clean up output
for gene in $(cat moregenes.txt);
do mv $gene* $gene'_results';
done
#will error about the results folders but thats OK

In [30]:
!cat exu_results/exu.aln

MUSCLE (3.8) multiple sequence alignment


exu_FBpp0085555         MVADNIDAGVAIAVADQSSSPVGDKVELPA-----GNYILVGVDIDTTGRRLMDEIVQLA
exu_ACYPI005096-PA      MV---------------QNTEIVDVSNNTVTLDNLSDYMVIGWDIDTTGRRLLDEICHIA
HOVITM_102745-T1        MV---------------STDKVVPKAGLPA-----GDYKIVAWDLDTTGRRLIDEFCHVA
                        **               ..  :      ..     .:* ::. *:*******:**: ::*

exu_FBpp0085555         AYTPTDHFEQYIMPYMNLNPAARQRHQVRVISIGFYRMLKSMQTYKIIKSKSEIAALKDF
exu_ACYPI005096-PA      GHTPNSQFNQYIMPHNDIDQVARRRHLLCTITMGRFRALKDIKNNKTLKSKSEISALAEF
HOVITM_102745-T1        GYTPEDKFSQYVMPYKDLDLISKRRHQVRTVTVGKYRMLKDLKTGKFLKTKSEISALTDF
                        .:** .:*.**:**: :::  :..** : .:::* :* **.::. * :*:****:** :*

exu_FBpp0085555         LNWLEQLKTKAGPSSDGIVLIYHEERKFIPYMILESLKKYGLLERFTASVKSFANSINLA
exu_ACYPI005096-PA      IEWLEQMLKDNGKKMA--ILVCHEVSKFNTCLLIKSLLAYNLLDKFSEVVKGFANCHSFA
HOVITM_102745-T1        LTWLEKVK---GDSKSGIILLNFESFKLAPSLLLEALKKYQLLDRFTNVVKGFSDCYAYV
    

In [31]:
tree = Phylo.read("exu_results/exu.aln.tre", "newick")
#root to fly copy 
tree.root_with_outgroup({"name": "exu_FBpp0085555"}) 
Phylo.draw_ascii(tree)

                                  ________________________ exu_ACYPI005096-PA
  _______________________________|
_|                               |__________________________ HOVITM_102745-T1
 |
 | exu_FBpp0085555



In [32]:
!cat vasa_results/vasa.aln

MUSCLE (3.8) multiple sequence alignment


HOVITM_011853-T1      ----------------------MDNYSDIDRKRPEPPIKRYRRNDESGSSASDDDSYVPY
vasa_FBpp0401446      MSDDWDDEP----------IVDTRGARGGDWSDDEDTAKSFSGEAEGDGVGGSGGEGGGY
vasa_FBpp0401447      MSDDWDDEP----------IVDTRGARGGDWSDDEDTAKSFSGEAEGDGVGGSGGEGGGY
HOVITM_092677-T1      ------------------------------------------------------------
HOVITM_064625-T1      ------------------------MERSYNNQNGSNSRPSY-------GSGRPGGKFGGG
HOVITM_064626-T1      ----------------------MIGYVSRLFSNTSKWSPSIYKETSLVNISNTTVCMNLH
HOVITM_021498-T1      --------------------------RPRDRTTVEDKELNKDKEKEGEAIKERYLGLVKK
HOVITM_065671-T1      MSSNRGGDNKPKGFGFSGFQVKSRSERANALPPPPNSALSKQGYSTMTAITNNALSASWG
HOVITM_011049-T1      ------------------------------------------------------------
                                                                                  

HOVITM_011853-T1      VPVKERKKQQMMKL-------------------------------GRLTQLKEEF-----
vasa_FBpp0401446      QGGNRD

In [33]:
tree = Phylo.read("vasa_results/vasa.aln.tre", "newick")
Phylo.draw_ascii(tree)

  ___________________________________________ HOVITM_011853-T1
 |
 |______________________________ HOVITM_021498-T1
 |
_|                    ___________________________________ HOVITM_065671-T1
 |          _________|
 |         |         |________________________ HOVITM_011049-T1
 |         |
 |_________|                                              , vasa_FBpp0401446
           |                             _________________|
           |      ______________________|                 | vasa_FBpp0401447
           |     |                      |
           |_____|                      |__________________ HOVITM_092677-T1
                 |
                 |                     ________ HOVITM_064625-T1
                 |____________________|
                                      |___________ HOVITM_064626-T1



Maybe HOVITM_092677

In [34]:
!cat tubulin_results/tubulin.aln

MUSCLE (3.8) multiple sequence alignment


tubuluin_tub2_FBpp0076678           -----------------------------------------MREIVTLQIGGAG-----N
HOVITM_075081-T1                    ----------------------------------------MMTCIIVIKV----------
HOVITM_094615-T1                    -----------------------------------MSLPVSPCPQCTCRTTPCSPRPDDA
HOVITM_099791-T1                    ------------------------------------------------------------
tubulin_beta97ef_FBpp0289838        -----------------------------------------MREIVHLQAGQCG-----N
tubulin_beta97ef_CLEC007800         -----------------------------------------MREIVHLQAGQCG-----N
HOVITM_075967-T1                    MNHTQSDFYGQFYWGPTSGVTRQSGASSPAACCNAHFLSAPLTPSLHLSIPTRGWW---N
tubulin_beta60d_ACYPI001007-PA      -----------------------------------------MREIVHLQAGQCG-----N
HOVITM_049113-T1                    ------------------------------------------------------------
tubulin_beta60d_FBpp0072177         ------------------------------------

In [37]:
tree = Phylo.read("tubulin_results/tubulin.aln.tre", "newick")
#root to fly copy 
tree.root_with_outgroup({"name": "tubuluin_tub2_FBpp0076678"}) 
Phylo.draw_ascii(tree)

                                , tubulin_beta85d_AAEL002851
                                |
                                , tubulin_beta56d2_FBpp0085720
                                |
                                |_ tubulin_beta56d_FBpp0085721
                               ,|
                               || HOVITM_074970-T1
                               ||
                               || tubulin_beta56d_ACYPI008874-PA
                               |
                               | ___ HOVITM_075967-T1
                              ,||
                              ||| tubulin_beta85d_FBpp0081524
                              ||
                              || __ tubulin_beta97ef_CLEC007800
                              |||
                              | | __ tubulin_beta97ef_FBpp0289838
                              | ||
                              |  |___________ HOVITM_094615-T1
                              |
                             _|_____ HOVITM_099791-

HOVITM_049113 = 60d
HOVITM_075967 = 85d
HOVITM094615 = beta97ef 
HOVITM_074970 = 56d
HOVITM_075081 = tub2, maybe 