/
collect_network_genbanks.pl
161 lines (134 loc) · 2.88 KB
/
collect_network_genbanks.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/perl
use 5.14.0;
use utf8;
use Carp;
use File::Basename;
use File::Spec;
use DBI;
use Bio::SeqIO;
use Bio::Seq;
use List::Util qw(reduce any all none notall first
max maxstr min minstr product sum sum0
pairs unpairs pairkeys pairvalues pairfirst pairgrep pairmap
shuffle uniq uniqnum uniqstr
);
use File::Find;
use File::Copy;
use File::Path qw(make_path);
# {{{ Getopt::Long
use Getopt::Long;
# my $conffile = qq(local.conf); # %conf is not used.
my $help;
GetOptions (
# "conffile:s" => \$conffile,
"help" => \$help
);
# }}}
if($help) {
exec("perldoc $0");
exit;
}
=head1 Name
collect_network_genbanks.pl
=head2 Example
perl /home/sco/mnt/smoke/docker/ripper/collect_network_genbanks.pl
=head2 Description
Make sure you change to the directory pna before running.
=cut
my %ffoptions = (
wanted => \&onfind,
no_chdir => 1
);
my %netgbks;
find(\%ffoptions, ".");
for my $key (keys %netgbks) {
my @netlist = @{$netgbks{$key}};
tablist($key, @netlist);
my $outdir = "../Networks/Network" . $key;
unless(-d $outdir) { make_path($outdir); }
my $ogbkpath = "../orgnamegbk";
opendir(OGBK, $ogbkpath);
my @ogbks = readdir(OGBK);
for my $ngbk (@netlist) {
for my $ogbk (@ogbks) {
if($ogbk =~ m/$ngbk/) {
copy(File::Spec->catfile($ogbkpath, $ogbk), $outdir);
}
}
}
}
exit;
# {{{ sub onfind
sub onfind {
my $fp = $_;
if(-d $fp) { return; }
else {
my ($noex, $dir, $ext)= fileparse($fp, qr/\.[^.]*/);
my $bn = $noex . $ext;
if($ext =~ /faa$/ and $noex =~ m/p_cc\d+/) {
my ($netnum) = $noex =~ m/p_cc(\d+)/;
my @gbks = gbknames($fp);
$netgbks{$netnum} = \@gbks;
}
}
}
# }}}
# {{{ sub gbknames
sub gbknames {
my $ifn = shift(@_);
my @retlist;
my $seqio = Bio::SeqIO->new(-file => $ifn, -format => 'fasta');
while(my $seqobj = $seqio->next_seq()) {
my $id = $seqobj->display_id();
my @id = split(/\|/, $id);
my $gbk = $id[1];
$gbk =~ s/_\d+$//;
unless ( any {$_ eq $gbk} @retlist ) {
push(@retlist, $gbk);
}
}
return(@retlist);
}
# }}}
# {{{ sub sorter
sub sorter {
my $alpha = $a;
my $beta = $b;
my ($astr) = $alpha =~ m/^(\D+)/;
my ($bstr) = $beta =~ m/^(\D+)/;
my ($anum) = $alpha =~ m/(\d+)$/;
my ($bnum) = $beta =~ m/(\d+)$/;
# tablistE($astr, $bstr, $anum, $bnum);
if($astr eq $bstr) {
return($anum <=> $bnum);
}
else {
return(lc($astr) cmp lc($bstr));
}
}
# }}}
# {{{ subroutines tablist, linelist, tabhash and their *E versions.
# The E versions are for printing to STDERR.
#
sub tablistH {
my @in = @_;
my $fh = shift(@in);
print($fh join("\t", @in), "\n");
}
sub tablist {
my @in = @_;
print(join("\t", @in), "\n");
}
sub tablistE {
my @in = @_;
print(STDERR join("\t", @in), "\n");
}
sub linelist {
my @in = @_;
print(join("\n", @in), "\n");
}
sub linelistE {
my @in = @_;
print(STDERR join("\n", @in), "\n");
}
# }}}