# Get Uniprot accession numbers to identify enriched functions with DAVID 

In [1]:
module load gcc/6.2.0
module load bedtools/2.29.0
module load gnutls/3.5.9
module load boost/1.55.0
module load blast/2.6.0

## GWAS genes

In [10]:
# GWAS significant genes
head -n 3 GWAS/GWAS_width_lfmmLassoP01.genes2kb

Contig60108	maker	gene	41	11731	.	-	.	ID=OLUR_00014779;Name=OLUR_00014779;Alias=maker-Contig60108-snap-gene-0.2;Note=Similar to MLH3: DNA mismatch repair protein Mlh3 (Homo sapiens OX%3D9606);Dbxref=Gene3D:G3DSA:3.30.230.10,Gene3D:G3DSA:3.30.565.10,InterPro:IPR013507,InterPro:IPR014721,InterPro:IPR020568,InterPro:IPR036890,MobiDBLite:mobidb-lite,Pfam:PF01119,SMART:SM01340,SUPERFAMILY:SSF54211,SUPERFAMILY:SSF55874;Ontology_term=GO:0005524,GO:0006298,GO:0030983;	Contig60108	2787	2787	0.00959340726719418
Contig42157	maker	gene	2034	10762	.	-	.	ID=OLUR_00019115;Name=OLUR_00019115;Alias=snap_masked-Contig42157-processed-gene-0.0;Note=Protein of unknown function;Dbxref=Coils:Coil,MobiDBLite:mobidb-lite;	Contig42157	9907	9907	0.00647897493033771
Contig58976	maker	gene	224	2577	.	-	.	ID=OLUR_00018983;Name=OLUR_00018983;Alias=maker-Contig58976-snap-gene-0.2;Note=Similar to cpiB: Cystatin-A2 (Dictyostelium discoideum OX%3D44689);Dbxref=Gene3D:G3DSA:3.10.450.10,InterPro:IPR000010,InterPro:IPR0017

In [11]:
cat GWAS/GWAS_width_lfmmLassoP01.genes2kb \
| cut  -f1,4,5 | sort | uniq > GWAS/GWAS_width_lfmmLassoP01-BED.genes2kb

In [15]:
wc -l GWAS/GWAS_width_lfmmLassoP01-BED.genes2kb

37 GWAS/GWAS_width_lfmmLassoP01-BED.genes2kb


In [13]:
# Background GWAS genes (should be background for all analyses?)
cat GWAS/GWAS_width_lfmmLasso.genes2kb \
| cut  -f1,4,5 | sort | uniq > GWAS/GWAS_width_lfmmLasso-BED.genes2kb

In [14]:
tail GWAS/GWAS_width_lfmmLasso-BED.genes2kb

Contig973	10702	17477
Contig97360	1319	3169
Contig97510	4167	7322
Contig97731	5751	12595
Contig98	13288	46306
Contig98	2535	11398
Contig98085	682	7530
Contig99041	298	1646
Contig99082	822	13998
Contig99323	6816	14674


Make fastas

In [17]:
fastaFromBed \
-fi ../../Olurida_v081.fa \
-bed GWAS/GWAS_width_lfmmLassoP01-BED.genes2kb \
> GWAS/GWAS_width_lfmmLassoP01-genes2kb.fa

index file ../../Olurida_v081.fa.fai not found, generating...


In [18]:
grep ">" -c GWAS/GWAS_width_lfmmLassoP01-genes2kb.fa

37


In [19]:
fastaFromBed \
-fi ../../Olurida_v081.fa \
-bed GWAS/GWAS_width_lfmmLasso-BED.genes2kb \
> GWAS/GWAS_width_lfmmLasso-genes2kb.fa

In [20]:
grep ">" -c GWAS/GWAS_width_lfmmLasso-genes2kb.fa

1302


### Blast gene fasta files against uniprot  
Uniprot/Swiss-prot reviewed database downloaded on 04/22/2020 from https://www.uniprot.org/downloads

In [28]:
module load gcc/6.2.0
module load gnutls/3.5.9
module load boost/1.55.0
module load blast/2.10.0


The following have been reloaded with a version change:
  1) blast/2.9.0 => blast/2.10.0



In [2]:
makeblastdb \
-in ../../uniprot_sprot.fasta \
-dbtype prot \
-out ../../uniprot_sprot_042220.fasta



Building a new DB, current time: 04/23/2020 16:24:41
New DB name:   /scratch/t.cri.ksilliman/CommonG_cp2/2019_Mapping/uniprot_sprot_042220.fasta
New DB title:  ../../uniprot_sprot.fasta
Sequence type: Protein
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 562253 sequences in 18.284 seconds.


In [5]:
blastx \
-query GWAS/GWAS_width_lfmmLassoP01-genes2kb.fa \
-db ../../uniprot_sprot_042220.fasta \
-num_threads 6 \
-evalue 1E-5 \
-outfmt 6 \
-max_target_seqs 1 \
-out GWAS/GWAS_width_lfmmLassoP01-genes2kb_blastx.tab

In [7]:
wc -l GWAS/GWAS_width_lfmmLassoP01-genes2kb_blastx.tab

71 GWAS/GWAS_width_lfmmLassoP01-genes2kb_blastx.tab


In [8]:
# background list
blastx \
-query GWAS/GWAS_width_lfmmLasso-genes2kb.fa \
-db ../../uniprot_sprot_042220.fasta \
-num_threads 6 \
-evalue 1E-5 \
-outfmt 6 \
-max_target_seqs 1 \
-out GWAS/GWAS_width_lfmmLassoBackground-genes2kb_blastx.tab

In [9]:
wc -l GWAS/GWAS_width_lfmmLassoBackground-genes2kb_blastx.tab

2591 GWAS/GWAS_width_lfmmLassoBackground-genes2kb_blastx.tab


### Get Uniprot accession and load into DAVID

In [14]:
tr '|' '\t' < GWAS/GWAS_width_lfmmLassoP01-genes2kb_blastx.tab \
| cut -f3 > GWAS/GWAS_width_lfmmLassoP01-genes2kb_blastx.prots

In [12]:
tr '|' '\t' < GWAS/GWAS_width_lfmmLassoBackground-genes2kb_blastx.tab \
| cut -f3 > background_2brad.txt

### No enrichment for BP, MF, CC for GWAS snps, only 23 DAVID ids

## Fst > 0.3

In [18]:
# Fst > 0.3 genes
# Fst from genes +-2kb, but mapping only to original gene region
head -n 3 Results/fst2kb_g03.genes

Contig29033	maker	gene	13408	30648	.	-	.	ID=OLUR_00003721;Name=OLUR_00003721;Alias=maker-Contig29033-snap-gene-0.4;Note=Similar to G2/mitotic-specific cyclin-B (Hydra viridissima OX%3D6082);Dbxref=CDD:cd00043,Gene3D:G3DSA:1.10.472.10,InterPro:IPR004367,InterPro:IPR006671,InterPro:IPR013763,InterPro:IPR036915,InterPro:IPR039361,MobiDBLite:mobidb-lite,PIRSF:PIRSF001771,Pfam:PF00134,SMART:SM00385,SMART:SM01332,SUPERFAMILY:SSF47954;Ontology_term=GO:0005634;	Contig29033	13407	30648	0.441704035874439
Contig77382	maker	gene	3560	10918	.	+	.	ID=OLUR_00016056;Name=OLUR_00016056;Alias=snap_masked-Contig77382-processed-gene-0.0;Note=Similar to Socs5: Suppressor of cytokine signaling 5 (Mus musculus OX%3D10090);Dbxref=Gene3D:G3DSA:3.30.505.10,InterPro:IPR000980,InterPro:IPR036860,MobiDBLite:mobidb-lite,Pfam:PF00017,SUPERFAMILY:SSF55550;	Contig77382	3559	10918	0.663352272727273
Contig34298	maker	gene	15093	51017	.	+	.	ID=OLUR_00001738;Name=OLUR_00001738;Alias=maker-Contig34298-snap-gene-0.4;Note=Si

In [19]:
cat Results/fst2kb_g03.genes \
| cut  -f1,4,5 | sort | uniq > Results/fst2kb_g03.BED

In [20]:
wc -l Results/fst2kb_g03.BED

42 Results/fst2kb_g03.BED


In [21]:
# Background FST genes 
cat Results/fst2kb_background.genes \
| cut  -f1,4,5 | sort | uniq > Results/fst2kb_background.BED

Make fastas

In [22]:
fastaFromBed \
-fi ../../Olurida_v081.fa \
-bed Results/fst2kb_g03.BED \
> Results/fst2kb_g03.fa

In [23]:
grep ">" -c Results/fst2kb_g03.fa

42


In [24]:
fastaFromBed \
-fi ../../Olurida_v081.fa \
-bed  Results/fst2kb_background.BED \
>  Results/fst2kb_background.fa

In [25]:
grep ">" -c  Results/fst2kb_background.fa

1597


### Blast gene fasta files against uniprot  
Uniprot/Swiss-prot reviewed database downloaded on 04/22/2020 from https://www.uniprot.org/downloads

In [None]:
blastx \
-query Results/fst2kb_background.fa \
-db ../../uniprot_sprot_042220.fasta \
-num_threads 6 \
-evalue 1E-5 \
-outfmt 6 \
-max_target_seqs 1 \
-out Results/fst2kb_background_blastx.tab

In [2]:
wc -l Results/fst2kb_background_blastx.tab

3034 Results/fst2kb_background_blastx.tab


In [None]:
# background list
blastx \
-query Results/fst2kb_g03.fa \
-db ../../uniprot_sprot_042220.fasta \
-num_threads 6 \
-evalue 1E-5 \
-outfmt 6 \
-max_target_seqs 1 \
-out Results/fst2kb_g03_blastx.tab

In [5]:
wc -l Results/fst2kb_g03_blastx.tab

78 Results/fst2kb_g03_blastx.tab


### Get Uniprot accession and load into DAVID

In [6]:
tr '|' '\t' < Results/fst2kb_background_blastx.tab \
| cut -f3 > Results/fst2kb_background_blastx.prots

In [7]:
tr '|' '\t' < Results/fst2kb_g03_blastx.tab \
| cut -f3 > Results/fst2kb_g03_blastx.prots

### 4 enriched terms for BP, none for CC, 1 for MF for fst > 0.3 snps, only 19 DAVID ids

In [9]:
head Results/fst2kb_background_blastx.prots

Q96RR4
Q96RR4
Q96RR4
Q96RR4
P27966
P27966
P82596
Q9I8C7
Q96MM6
Q96MM6
