Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
472 lines (435 sloc) 15.2 KB
#! /usr/bin/perl -w
# lambda.pl $Rev$ - An estimator for Dawg's lambda
# Copyright (2004-2005) Reed A. Cartwright. All rights reserved.
#
# Usage: perl lambda.pl [treefile] [fastafile]
#
# Distributed under the same license as DAWG
#
use strict;
# options
my $user_dist = 0; # change to 1 to enable the display of the US gap distribution
my $chisq_bins = 0; # change to 1 to see the chisq bins
# Table for Zipf Estimation [1.01..3.0 by 0.01]
my $ZetaTableS = 1.01;
my $ZetaTableI = 0.01;
my @ZetaTable = (100.57794333849678, 50.57867004101556,
33.91272910377197, 25.580120524770102, 20.58084430203699,
17.248233766955963, 14.868003203314164, 13.083009752064346,
11.694840712912123, 10.584448464950801, 9.676075430586256,
8.91921655749934, 8.2789068089333, 7.730171158143642, 7.254694585068123,
6.83874082424536, 6.4718056722916035, 6.14571919288521,
5.854031435966521, 5.591582441177752, 5.354195097462891,
5.138451768231236, 4.941529188217385, 4.761074636126193,
4.5951118258429435, 4.441968515056512, 4.300220200855159,
4.168645880551769, 4.0461929654417865, 3.931949211809544,
3.8251200846430273, 3.725010365652108, 3.6310091052834443,
3.5425772302621485, 3.45923727555487, 3.380564827672348,
3.306181355544513, 3.2357481733626594, 3.168961332215122,
3.105547277977581, 3.045259144634053, 2.98787357712343,
2.9331879975097537, 2.8810182439474383, 2.8311965244511157,
2.783569637565062, 2.737997420181188, 2.69435138937978,
2.6525135505754367, 2.612375348685488, 2.5738367426903555,
2.5368053869753955, 2.501195905351019, 2.4669292457372296,
2.433932105246206, 2.402136416863166, 2.371478890160818,
2.3419005995261015, 2.3133466142621635, 2.2857656656801293,
2.2591098469359414, 2.2333343419152283, 2.2083971799386637,
2.184259013463686, 2.1608829163060492, 2.1382342002048644,
2.116280247814644, 2.0949903604333366, 2.074335618971391,
2.0542887568377517, 2.0348240435678777, 2.015917178149418,
1.9975451911156323, 1.9796863545771772, 1.962320099451342,
1.9454269392258112, 1.9289883996629098, 1.9129869539112034,
1.897405962545327, 1.882229618102822, 1.867442893729346,
1.853031495581547, 1.8389818186706854, 1.8252809058602786,
1.8119164097580231, 1.7988765572664147, 1.7861501165781457,
1.7737263664218121, 1.7615950673809382, 1.749746435125061,
1.73817111540579, 1.7268601606835563, 1.7158050082623097,
1.7049974598198623, 1.694429662231051, 1.6840940895894492,
1.6739835263411282, 1.6640910514510443, 1.6544100235290304,
1.6449340668482264, 1.6356570581940908, 1.626573114486998,
1.617676581125839, 1.608962021004109, 1.600424204153639,
1.5920580979745251, 1.583858858012912, 1.575821819251114,
1.5679424878771757, 1.560216533503362, 1.5526397818052655,
1.5452082075552418, 1.5379179280257507, 1.5307651967398803,
1.5237463975479324, 1.5168580390103819, 1.5100967490688908,
1.5034592699882794, 1.4969424535535234, 1.4905432565068935,
1.484258736211354, 1.4780860465272285, 1.4720224338900036,
1.4660652335779072, 1.4602118661586483, 1.4544598341053465,
1.4488067185723421, 1.4432501763221333, 1.437787936795243,
1.432417799315324, 1.4271376304222598, 1.4219453613265007,
1.4168389854782233, 1.411816556245341, 1.4068761846947044,
1.4020160374711905, 1.3972343347696727, 1.392529348395166,
1.3878993999066986, 1.3833428588407357, 1.3788581410101926,
1.3744437068753212, 1.370098059982948, 1.3658197454707395,
1.3616073486333566, 1.3574594935475333, 1.3533748417532712,
1.3493520909884928, 1.3453899739746535, 1.341487257250917,
1.3376427400546582, 1.3338552532461516, 1.330123658275428,
1.326446846189378, 1.3228237366772906, 1.319253277153097,
1.3157344418726857, 1.3122662310847368, 1.3088476702135943,
1.3054778090727805, 1.302155721107818, 1.2988805026670913,
1.2956512722995475, 1.2924671700780854, 1.2893273569475512,
1.2862310140962918, 1.283177342350291, 1.2801655615889367,
1.2771949101815299, 1.2742646444436798, 1.2713740381127698,
1.2685223818417177, 1.2657089827102934, 1.262933163753284,
1.2601942635048347, 1.2574916355583214, 1.2548246481411414,
1.2521926837038304, 1.2495951385229584, 1.2470314223172532,
1.2445009578764485, 1.2420031807023697, 1.2395375386617788,
1.2371034916505452, 1.2347005112686997, 1.2323280805059718,
1.2299856934374145, 1.2276728549287421, 1.2253890803510186,
1.2231338953043551, 1.2209068353502854, 1.218707445752503,
1.2165352812256536, 1.2143899056919019, 1.2122708920449774,
1.210177821921451, 1.2081102854789683, 1.2060678811812071,
1.2040502155893134, 1.202056903159594);
my @ZetaRatioTable = ( -99.42465416119387, -49.42651278295983, -32.7616936349154,
-24.430196816907994, -19.432022427470404, -16.100503897177585,
-13.721355607717234, -11.937434796674157, -10.550328859139627,
-9.44099026836784, -8.53366153972204, -7.777837712657673,
-7.1385538410301885, -6.590834987564802, -6.116366220546042,
-5.701411361755471, -5.335466293967182, -5.010361165950622,
-4.719646111688432, -4.458161253833933, -4.221729563328257,
-4.00693348458076, -3.8109498323455355, -3.631425964382572,
-3.4663856726837654, -3.3141567921116177, -3.1733148960102397,
-3.0426390570454855, -2.921076760977479, -2.8077158376812346,
-2.7017618248754207, -2.6025195761543083, -2.5093782130161015,
-2.4217987324177765, -2.3393037387510347, -2.2614688871575184,
-2.187915714416261, -2.118305601798002, -2.052334666712738,
-1.989729420611674, -1.9302430623196227, -1.8736523008927266,
-1.819754621799693, -1.7683659258977484, -1.7193184832130266,
-1.6724591536202988, -1.6276478346710939, -1.58475610344434,
-1.543666024702038, -1.5042691020671974, -1.466465352593545,
-1.4301624881165254, -1.3952751892823985, -1.3617244602415755,
-1.3294370537398257, -1.2983449578075912, -1.268384936482713,
-1.2394981180452773, -1.2116296251275196, -1.1847282418133533,
-1.158746113482776, -1.1336384757041358, -1.1093634089466802,
-1.0858816162892664, -1.06315622164869, -1.0411525863512565,
-1.0198381421311062, -0.9991822388642257, -0.9791560055431779,
-0.9597322231683802, -0.940885208380976, -0.9225907067928715,
-0.9048257950839338, -0.8875687910368727, -0.8707991707687682,
-0.8544974924961801, -0.838645326239606, -0.8232251889339728,
-0.8082204844657707, -0.79361544820532, -0.7793950956451932,
-0.7655451747936654, -0.7520521220058026, -0.7389030209648924,
-0.726085564553807, -0.7135880193799546, -0.7013991927389883,
-0.689508401821763, -0.6779054449863241, -0.6665805749322685,
-0.6555244736287632, -0.6447282288600555, -0.6341833122635604,
-0.6238815587457621, -0.613815147170221, -0.6039765822201625,
-0.5943586773454108, -0.5849545387099745, -0.5757575500624081,
-0.5667613584562456, -0.5579598607523722, -0.5493471908392097,
-0.5409177075100653, -0.5326659829399969, -0.5245867917070516,
-0.5166751003048036, -0.5089260570947702, -0.5013349826484628,
-0.49389736042966514, -0.4866088277678791, -0.47946516707388387,
-0.4724622972478885, -0.46559626522991804, -0.45886323764076514,
-0.45225949246012226, -0.4457814106862988, -0.43942546791927434,
-0.4331882258056473, -0.4270663232803669, -0.4210564675358515,
-0.4151554246442902, -0.4093600097534363, -0.4036670767700985,
-0.39807350743870257, -0.39257619971471763, -0.3871720553243887,
-0.38185796639297176, -0.3766308010135526, -0.3714873876174255,
-0.36642449799486676, -0.3614388288019101, -0.3565269813743003,
-0.35168543965413074, -0.3469105460176611, -0.34219847477435444,
-0.3375452030872263, -0.33294647904299474, -0.3283977865772394,
-0.32389430693462035, -0.3194308763171518, -0.31500193934436266,
-0.3106014979178754, -0.30622305504926456, -0.3018595531739829,
-0.2975033064354322, -0.2931459263818405, -0.2887782404742653,
-0.284390202756679, -0.279970795988474, -0.275507924485767,
-0.2709882968603094, -0.266397297783542, -0.26171884783810045,
-0.25693525044975124, -0.25202702481907663, -0.24697272369407414,
-0.24174873474194597, -0.23632906419054672, -0.23068510131703507,
-0.2247853622629635, -0.21859521155122816, -0.21207655957064997,
-0.20518753417837918, -0.1978821244484786, -0.19010979446683013,
-0.18181506493763538, -0.17293706022511063, -0.1634090183052305,
-0.15315776094644973, -0.1421031212749423, -0.13015732570897476,
-0.11722432706829741, -0.10319908547787622, -0.08796679349060989,
-0.07140204165094988, -0.053367920510283995, -0.03371505488566567,
-0.012280565925787676, 0.011113043687890184, 0.03665908632130593,
0.06456797291826083, 0.09506858891097204, 0.1284097682092414,
0.16486187124327872, 0.20471847331934726, 0.24829816983861908,
0.29594650522777877, 0.3480380327342986, 0.40497851254960615,
0.46720725603869884, 0.5351996241746824, 0.6094696886000213,
0.6905730640625012, 0.7791099213014138, 0.8757281897876625,
0.9811269600483958, 1.0960600956316162, 1.2213400650864328,
1.35784200464962, 1.5065080226352798);
my $gettree = 1;
my $tree = '';
my %seqs = ();
my $seqname = '';
# process STDIN line by line
while(<>)
{
# remove newline and flanking whitespace
chomp;
s/^\s+//;
s/\s+$//;
#check to see if the programs should shift to reading sequences
$gettree = 0 if($gettree && m/^>/);
if($gettree)
{
$tree .= $_;
}
#check to see if the line is a fasta sequence name
elsif(/^>\s*(.+)\s*/)
{
$seqname = $1;
$seqs{$seqname} ||= '';
}
#append the line to the sequence
else
{
$seqs{$seqname} .= $_;
}
}
#calculate the total length of the tree by extracting all the branch lengths
my $totlen = 0;
$totlen += $1 while($tree =~ m/:([\d\.]+)/g);
#convert sequences to a table
my @table = values(%seqs);
my %gaps = ();
my $avgL = 0;
#find unique gaps and count the ungapped length of the sequences
foreach(@table)
{
while(/([-=]+)/g)
{
my $m = $1;
my $e = pos;
my $b = $e - length($m);
$gaps{$b, $e} = 1;
}
while(/([+=]+)/g)
{
my $m = $1;
my $e = pos;
my $b = $e - length($m);
$gaps{$b, $e} = 1;
}
$avgL += tr/ACGTacgt//;
}
#find average length
$avgL /= @table;
#count gaps
my $numgaps = keys(%gaps);
#calculate lambda estimate
my $lambda = $numgaps/$avgL/$totlen;
#calculate distribution of gaps sizes
my %gapsizes = ();
my $maxgap = 0;
my $suml = 0;
foreach(keys(%gaps))
{
my ($f, $l) = split(/$;/);
$l -= $f;
$gapsizes{$l} ||=0;
$gapsizes{$l}++;
$maxgap = $l if($l > $maxgap);
$suml += $l;
}
my @gapnum = map {exists($gapsizes{$_}) ? $gapsizes{$_} : 0 } (1..$maxgap);
my @xsqbins = ();
for(my $s = 1; $s <= $maxgap; $s *= 2)
{
push(@xsqbins, $s*2);
}
my @xsq_ob = ();
my $x = 1;
foreach my $xm (@xsqbins)
{
my $bc = 0;
for(;$x<$xm && $x<=$maxgap;++$x) {$bc += $gapnum[$x-1];}
push(@xsq_ob, $bc);
}
my $avgG = $suml/$numgaps-1.0;
my $gapclasses = keys(%gapsizes);
my %model = ();
# Geometric Model
#MLE of q
my $q = $avgG/($avgG+1.0);
#calculate LL(q)
my $LL = $numgaps*(log(1.0-$q)-log($q))+$suml * log($q);
#calculate BIC
my $bic = -2*$LL+log($numgaps)*1.0;
my $aic = -2*$LL+2*1.0;
#calculate xsq
my $xsq = 0.0;
my $df = $gapclasses-2;
while(my($g,$n) = each(%gapsizes))
{
my $e = $numgaps*((1.0-$q) * $q**($g-1.0));
$xsq += ($n-$e)**2/$e;
}
$xsq = 0.0;
$df = 0;
$x = 1;
my @xsq_ex = ();
foreach my $xm (@xsqbins)
{
my $bc = 0;
for(;$x<$xm && $x<=$maxgap;++$x) {$bc += ((1.0-$q) * $q**($x-1.0));}
push(@xsq_ex, $numgaps*$bc);
}
for(0..$#xsq_ex)
{
next unless($xsq_ob[$_]);
$df += 1;
$xsq += ($xsq_ob[$_]-$xsq_ex[$_])**2/$xsq_ex[$_];
}
$df -= 2;
#calculate rsq
my @L1 = ();
my @L2 = ();
while(my($g,$n) = each(%gapsizes))
{
push(@L1, log($n) - log($numgaps));
push(@L2, log(1-$q)+log($q)*($g-1));
}
my $r = rsq([@L1],[@L2]);
$model{Geometric} = {AIC => $aic, BIC => $bic, LL => $LL, DF => $df, XSQ => $xsq, Params => {'q' => $q}, text => "GapModel = \"NB\"\nGapParams = {1, $q}", f => 'f(x) = (1-q)*q^(x-1)', RSQ => $r};
# Zipf's Power-Law Model
my $plmle = 0;
for(0..$#gapnum)
{
$plmle += $gapnum[$_]*log($_+1);
}
$plmle /= -$numgaps;
my $i = 0;
while($i < @ZetaRatioTable && $ZetaRatioTable[$i] < $plmle)
{
$i++;
}
#if $i == 0, undefined
if($i == 0)
{
$model{PowerLaw} = {AIC => 'NA', BIC => 'NA', LL => 'NA', DF => 'NA',
XSQ => 'NA', text => '',
Params => {a => '<'.$ZetaTableS, z => $plmle},
f => 'f(x) = x^(-a)/Zeta(a)', RSQ => 'NA'};
}
elsif($i == @ZetaRatioTable)
{
$model{PowerLaw} = {AIC => 'NA', BIC => 'NA', LL => 'NA', DF => 'NA', XSQ => 'NA', test => '',
Params => {a => '<'.$ZetaTableS+$ZetaTableI*@ZetaRatioTable, z => $plmle},
f => 'f(x) = x^(-a)/Zeta(a)', RSQ => 'NA'};
}
else
{
my $x1 = $ZetaRatioTable[$i-1];
my $x2 = $ZetaRatioTable[$i];
my $y1 = ($i-1)*$ZetaTableI+$ZetaTableS;
my $y2 = $y1+$ZetaTableI;
my $a = $y1+($plmle-$x1)*($y2-$y1)/($x2-$x1);
$x1 = $ZetaTable[$i-1];
$x2 = $ZetaTable[$i];
my $b = $y1+($plmle-$x1)*($y2-$y1)/($x2-$x1);
$LL = 0;
$LL += log($_) * $gapsizes{$_} foreach(keys(%gapsizes));
$LL = -$numgaps * log($b) - $a * $LL;
$bic = -2*$LL+log($numgaps)*2.0;
$df = $gapclasses-2;
$aic = -2*$LL+2*1.0;
$xsq = 0.0;
my $S = 0;
my $F = 0;
for(0..$#gapnum)
{
my $n = $gapnum[$_];
next unless($n);
my $e = $numgaps*($_+1)**-$a/$b;
$xsq += ($n-$e)**2/$e;
}
$xsq = 0.0;
$df = 0;
$x = 1;
@xsq_ex = ();
foreach my $xm (@xsqbins)
{
my $bc = 0;
for(;$x<$xm && $x<=$maxgap;++$x) {$bc += $x**-$a/$b;}
push(@xsq_ex, $numgaps*$bc);
}
for(0..$#xsq_ex)
{
next unless($xsq_ob[$_]);
$df += 1;
$xsq += ($xsq_ob[$_]-$xsq_ex[$_])**2/$xsq_ex[$_];
}
$df -= 2;
@L1 = ();
@L2 = ();
while(my($g,$n) = each(%gapsizes))
{
push(@L1, log($n) - log($numgaps));
push(@L2, -$a * log($g) - log($b));
}
$r = rsq([@L1],[@L2]);
my $avgLi = int($avgL);
$model{PowerLaw} = {AIC => $aic, BIC => $bic, LL => $LL, DF => $df,
XSQ => $xsq, Params => {a => $a, z => $plmle},
text => "GapModel = \"PL\"\nGapParams = {$a, $avgLi}",
f => 'f(x) = x^(-a)/Zeta(a)', RSQ => $r};
}
#output
if($user_dist)
{
print "\nGap Size Distribution:\n";
print join("\t", $_, $gapsizes{$_} || 0, ($gapsizes{$_} || 0)/$numgaps), "\n" foreach(1..$maxgap);
}
if($chisq_bins)
{
print "\nChi-Squared Bins:\n";
my $xi = 1;
foreach(@xsq_ob)
{
print "\t$xi-";
$xi*=2;
print $xi-1, "\t$_\n";
}
}
print "# Total Tree Length is $totlen.\n";
print "Tree = $tree\n";
print "\n# Average Sequence Length is $avgL.\n";
print "Length = $avgL.\n";
print "\n# Lambda Estimate is $lambda.\n";
print "Lambda = $lambda\n";
print "\n# Number of gaps is $numgaps.\n";
print "# Average Gap Size is ", $avgG + 1.0, ".\n";
print "\n# Estimated Models\n";
foreach my $k (sort(keys(%model)))
{
print "# $k:\n# $model{$k}->{f}\n# ";
my @par = ();
foreach my $p (sort(keys(%{$model{$k}->{Params}})))
{
push(@par, "$p = $model{$k}->{Params}->{$p}");
}
print join(', ', @par), "\n";
print "# LogLik = $model{$k}->{LL}\n";
print "# AIC = $model{$k}->{AIC}\n";
print "# BIC = $model{$k}->{BIC}\n";
print "# Xsq = $model{$k}->{XSQ} ($model{$k}->{DF} df)\n";
#print "# Rsq of Log = $model{$k}->{RSQ}\n";
my $text = $model{$k}->{text};
$text =~ s/^/# /mg;
print $text, "\n";
}
my $k_ll;
foreach my $k(keys %model)
{
$k_ll = $k if( !(defined $k_ll) || $model{$k_ll}->{LL} < $model{$k}->{LL});
}
print "\n# Most Likely Model\n";
print $model{$k_ll}->{text}, "\n";
sub rsq
{
my ($rO, $rE) = @_;
my @obs = @$rO;
my @est = @$rE;
my $m = 0.0;
my $m2 = 0.0;
my $SSe = 0.0;
for my $i (0..$#obs)
{
$SSe += ($obs[$i] - $est[$i])*($obs[$i] - $est[$i]);
$m += $obs[$i];
$m2 += $obs[$i]*$obs[$i];
}
my $SSt = $m2-$m*$m/@obs;
return "NA" if($SSt == 0);
return 1.0 - $SSe/$SSt;
}
You can’t perform that action at this time.