-
Notifications
You must be signed in to change notification settings - Fork 0
/
proteinfeatures_imgmv2.pl
99 lines (91 loc) · 3.89 KB
/
proteinfeatures_imgmv2.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#! /usr/bin/perl -w
# perl code to convert the NCBI Web Batch Conserved Domain Search (CD-Search) output for data analytics
# including visual analytics, modeling and simulation and statistical analysis.
# Usage example: proteinfeatures_imgm.pl <input_filename> > <ouput file name> &
# Author: Raphael D. Isokpehi (qeubic -at- gmail.com)
# https://github.com/qeubic/protein_features/blob/main/README.md
# NCBI Web Batch CD-Search website - https://www.ncbi.nlm.nih.gov/Structure/bwrpsb/bwrpsb.cgi
# Input file should contain a collection of fasta protein sequences obtained from
# the Integrated Microbial Genomes & Microbiomes - https://img.jgi.doe.gov/cgi-bin/mer/main.cgi
# Example - Desulfofvibrio desulfuricans ND132 Protein Coding Genes
# 3470 protein coding gene(s) retrieved.
# https://img.jgi.doe.gov/cgi-bin/mer/main.cgi?section=TaxonDetail&page=proteinCodingGenes&taxon_oid=2503754015
# Limitation: NCBI CD-Search accepts maximum of 4000 protein sequences per request.
# The FASTA headers in the sequences from the IMG/M vary in the display of the Genome Name.
# Identifiers may precede the Genome Name in some FASTA sequence headers.
# This version of the perl script extracts the content for the Genome Name field to include the square brackets ([]).
$filename = $ARGV[0]; #Check for input file
if ($#ARGV!=0) {
die "Usage: enter file name\n"; #stop code if filename is not provided as input
}
#Assign data field names to variables
$a="Query Identifier";
$b1="Gene ID";
$b2="Locus Tag";
$b3="Gene Name";
$b4="Genome Name";
$c="Ligand Type";
$d="Ligand Name";
$e="Amino Acid Pattern";
$f="Amino Acid Position Pattern";
$g="Ligand Site Positions";
$h="Source Protein Domain (PSSM-ID)";
$i="Coordinates";
# Print the data fields separated as tabs.
print "$a\t$b1\t$b2\t$b3\t$b4\t$c\t$d\t$e\t$f\t$g\t$h\t$i\n";
# Open the file with the feature output from CD-Search
open (IN, $filename);
while (<IN>) {
chomp;
#An example of line to be matched
#Q#3470 - >2503784787 DND132_0001 dnaA chromosomal replication initiator protein [Desulfovibrio desulfuricans ND132 (Final JGI assembly) : Ddes_Contig54] specific DnaA box-binding interface R386,R394,N395,R402,S408,L409,K410,K419,H420,S421,T422,K425,T428,K429 15 14 119330
if (/^Q\#(.*?)\s\-\s(.*?)\t(.*?)\t(.*?)\t(.*?)\t(.*?)\t(.*?)\t(.*)/) { #match line with Query
#assign identifiers to variables matched in the query (protein sequence)
$qid=$1;
$domain_type=$3;
$domain_name=$4;
$bindsites=$5;
$sourcedomain=$8;
$imgids1=$2; #Contain IMG/M identifiers for Gene ID and Locus Tags as well as gene and genome descriptions
$imgids1=~/\>(.*?)\s(.*?)\s(.*?)(\[.*?\])/; # matching of identifiers and genome name with the square brackets
$geneid=$1;
$locus_tag=$2;
$genename=$3;
$genomename=$4;
$label1="$qid\t$geneid\t$locus_tag\t$genename\t$genomename\t$domain_type\t$domain_name"; #Assign variable for printing
#constructing data fields for amino acid patterns and amino acid position patterns.
if ($bindsites=~/(\S)(.*)\-(\S)(.*)/) { # repeats or coils
$repaa1=$1;
$repaapos1=$2;
$repaa2=$3;
$repaapos2=$4;
push (@arr3, $repaa1);
push (@arr3, $repaa2);
push (@arr4, $repaapos1);
push (@arr4, $repaapos2);
} else {
@arr2=split(/\,/, $bindsites); #binding sites
$cntligand=@arr2;
$cnt=0;
while ($cnt < $cntligand) {
$sitenum=$cnt+1; #ligand site number
$siteloc=$arr2[$cnt];
$siteloc=~/(\S?)(.*)/;
$siteaa=$1;
$sitepos=$2;
push (@arr3, $siteaa); # amino acid at sites
push (@arr4, $sitepos); # amino acid positions
$cnt++;
}
}
$siteaapa=join "", @arr3; #join the amino acid letters to form a pattern
$sitepospa=join "_", @arr4; #join the amino acid positions with underscore to form a pattern
print "Q$label1\t$siteaapa\t$sitepospa\t$sitenum\t$sourcedomain\t$bindsites\n"; # output record
$sitenum=0;
}
#reinitialize arrays.
@arr2=();
@arr3=();
@arr4=();
}
#end of code