From 73e0e9038152c63a5c6e9bf82ac0d6d61cb461c9 Mon Sep 17 00:00:00 2001 From: Viktor Gal Date: Thu, 8 Mar 2018 15:30:48 +0100 Subject: [PATCH] remove the bullshit --- applications/arts/__init__.py | 0 applications/arts/arts.py | 67 -- applications/arts/convert_artsmat.m | 82 -- applications/arts/data | 1 - applications/arts/genomic.py | 152 ---- applications/arts/signal_sensor.py | 225 ----- applications/arts/util.py | 82 -- applications/asp/LICENSE | 340 -------- applications/asp/NEWS | 11 - applications/asp/README | 45 - applications/asp/__init__.py | 0 applications/asp/asp | 319 -------- applications/asp/data | 1 - applications/asp/galaxy/asp.sh | 43 - applications/asp/galaxy/asp.xml | 115 --- applications/asp/genomic.py | 184 ----- applications/asp/model.py | 143 ---- applications/asp/seqdict.py | 68 -- applications/asp/signal_detectors.py | 172 ---- .../evaluate_multiclass_labels.py | 72 -- .../classification/predict_multiclass_svm.py | 75 -- .../random_fourier_classification.cpp | 175 ---- .../random_fourier_classification.py | 100 --- .../classification/train_multiclass_svm.py | 98 --- applications/classification/utils.py | 51 -- applications/easysvm/AUTHORS | 3 - applications/easysvm/LICENSE | 674 --------------- applications/easysvm/MANIFEST.in | 7 - applications/easysvm/README | 142 ---- applications/easysvm/data | 1 - applications/easysvm/distutils-help.txt | 73 -- applications/easysvm/esvm/__init__.py | 0 applications/easysvm/esvm/datafuncs.py | 162 ---- applications/easysvm/esvm/experiment.py | 773 ------------------ applications/easysvm/esvm/mldata.py | 300 ------- applications/easysvm/esvm/mldata_arff.py | 115 --- applications/easysvm/esvm/parse.py | 232 ------ applications/easysvm/esvm/plots.py | 226 ----- applications/easysvm/esvm/poim.py | 128 --- applications/easysvm/esvm/utils.py | 180 ---- applications/easysvm/galaxy/CloudGen.xml | 53 -- applications/easysvm/galaxy/FastaGen.xml | 52 -- applications/easysvm/galaxy/MotifGen.xml | 91 --- applications/easysvm/galaxy/README | 3 - applications/easysvm/galaxy/easysvm.xml | 316 ------- applications/easysvm/galaxy/eval.xml | 85 -- applications/easysvm/galaxy/modelsel.xml | 251 ------ applications/easysvm/galaxy/poim.xml | 106 --- applications/easysvm/scripts/datagen.py | 103 --- applications/easysvm/scripts/easysvm.py | 51 -- applications/easysvm/setup.py | 84 -- applications/easysvm/splicesites/__init__.py | 0 applications/easysvm/splicesites/test_gc.py | 50 -- .../easysvm/splicesites/tutorial_example.py | 242 ------ applications/easysvm/splicesites/utils.py | 288 ------- .../easysvm/tutpaper/data/effect_of_c.data | 69 -- .../easysvm/tutpaper/data/nonlinear.data | 71 -- .../easysvm/tutpaper/data/small_gc_toy.data | 15 - .../tutpaper/data/small_gc_toy_outlier.data | 15 - applications/easysvm/tutpaper/svm_params.py | 377 --------- applications/msplicer/LICENSE | 340 -------- applications/msplicer/Makefile | 8 - applications/msplicer/NEWS | 6 - applications/msplicer/README | 110 --- applications/msplicer/content_sensors.py | 56 -- applications/msplicer/convert_mat.m | 213 ----- applications/msplicer/data | 1 - applications/msplicer/dna.fa | 74 -- applications/msplicer/genomic.py | 152 ---- applications/msplicer/model.py | 307 ------- applications/msplicer/msplicer | 355 -------- applications/msplicer/plif.py | 224 ----- applications/msplicer/seqdict.py | 68 -- applications/msplicer/signal_detectors.py | 164 ---- applications/ocr/Ai.py | 92 --- applications/ocr/FigureWidget.py | 147 ---- applications/ocr/MatrixWidget.py | 66 -- applications/ocr/QuadrWidget.py | 30 - applications/ocr/README | 9 - applications/ocr/common.py | 31 - applications/ocr/data | 1 - applications/ocr/predict | 203 ----- applications/ocr/train | 64 -- applications/tapkee/faces_embedding.py | 68 -- applications/tapkee/octave_ltsa.m | 11 - applications/tapkee/samples/data.py | 9 - applications/tapkee/samples/dm.py | 33 - applications/tapkee/samples/hlle.py | 22 - applications/tapkee/samples/isomap.py | 39 - applications/tapkee/samples/klle.py | 35 - applications/tapkee/samples/la.py | 33 - applications/tapkee/samples/lle.py | 28 - applications/tapkee/samples/lltsa.py | 22 - applications/tapkee/samples/lpp.py | 20 - applications/tapkee/samples/ltsa.py | 22 - applications/tapkee/samples/mds.py | 40 - applications/tapkee/samples/npe.py | 22 - applications/tapkee/swissroll_embedding.py | 88 -- applications/tapkee/words_embedding.py | 56 -- benchmarks/hasheddoc_benchmarks.cpp | 51 -- benchmarks/kernel_matrix_sum_benchmark.cpp | 111 --- benchmarks/rf_feats_benchmark.cpp | 127 --- benchmarks/rf_feats_kernel_comp.cpp | 136 --- benchmarks/sparse_test.cpp | 206 ----- 104 files changed, 11859 deletions(-) delete mode 100644 applications/arts/__init__.py delete mode 100755 applications/arts/arts.py delete mode 100644 applications/arts/convert_artsmat.m delete mode 120000 applications/arts/data delete mode 100644 applications/arts/genomic.py delete mode 100644 applications/arts/signal_sensor.py delete mode 100644 applications/arts/util.py delete mode 100644 applications/asp/LICENSE delete mode 100644 applications/asp/NEWS delete mode 100644 applications/asp/README delete mode 100644 applications/asp/__init__.py delete mode 100755 applications/asp/asp delete mode 120000 applications/asp/data delete mode 100644 applications/asp/galaxy/asp.sh delete mode 100644 applications/asp/galaxy/asp.xml delete mode 100644 applications/asp/genomic.py delete mode 100644 applications/asp/model.py delete mode 100644 applications/asp/seqdict.py delete mode 100644 applications/asp/signal_detectors.py delete mode 100644 applications/classification/evaluate_multiclass_labels.py delete mode 100644 applications/classification/predict_multiclass_svm.py delete mode 100644 applications/classification/random_fourier_classification.cpp delete mode 100644 applications/classification/random_fourier_classification.py delete mode 100644 applications/classification/train_multiclass_svm.py delete mode 100644 applications/classification/utils.py delete mode 100644 applications/easysvm/AUTHORS delete mode 100644 applications/easysvm/LICENSE delete mode 100644 applications/easysvm/MANIFEST.in delete mode 100644 applications/easysvm/README delete mode 120000 applications/easysvm/data delete mode 100644 applications/easysvm/distutils-help.txt delete mode 100644 applications/easysvm/esvm/__init__.py delete mode 100644 applications/easysvm/esvm/datafuncs.py delete mode 100644 applications/easysvm/esvm/experiment.py delete mode 100644 applications/easysvm/esvm/mldata.py delete mode 100644 applications/easysvm/esvm/mldata_arff.py delete mode 100644 applications/easysvm/esvm/parse.py delete mode 100644 applications/easysvm/esvm/plots.py delete mode 100644 applications/easysvm/esvm/poim.py delete mode 100644 applications/easysvm/esvm/utils.py delete mode 100644 applications/easysvm/galaxy/CloudGen.xml delete mode 100644 applications/easysvm/galaxy/FastaGen.xml delete mode 100644 applications/easysvm/galaxy/MotifGen.xml delete mode 100644 applications/easysvm/galaxy/README delete mode 100644 applications/easysvm/galaxy/easysvm.xml delete mode 100644 applications/easysvm/galaxy/eval.xml delete mode 100644 applications/easysvm/galaxy/modelsel.xml delete mode 100644 applications/easysvm/galaxy/poim.xml delete mode 100644 applications/easysvm/scripts/datagen.py delete mode 100644 applications/easysvm/scripts/easysvm.py delete mode 100755 applications/easysvm/setup.py delete mode 100644 applications/easysvm/splicesites/__init__.py delete mode 100644 applications/easysvm/splicesites/test_gc.py delete mode 100644 applications/easysvm/splicesites/tutorial_example.py delete mode 100644 applications/easysvm/splicesites/utils.py delete mode 100644 applications/easysvm/tutpaper/data/effect_of_c.data delete mode 100644 applications/easysvm/tutpaper/data/nonlinear.data delete mode 100644 applications/easysvm/tutpaper/data/small_gc_toy.data delete mode 100644 applications/easysvm/tutpaper/data/small_gc_toy_outlier.data delete mode 100644 applications/easysvm/tutpaper/svm_params.py delete mode 100644 applications/msplicer/LICENSE delete mode 100644 applications/msplicer/Makefile delete mode 100644 applications/msplicer/NEWS delete mode 100644 applications/msplicer/README delete mode 100644 applications/msplicer/content_sensors.py delete mode 100644 applications/msplicer/convert_mat.m delete mode 120000 applications/msplicer/data delete mode 100644 applications/msplicer/dna.fa delete mode 100644 applications/msplicer/genomic.py delete mode 100644 applications/msplicer/model.py delete mode 100755 applications/msplicer/msplicer delete mode 100644 applications/msplicer/plif.py delete mode 100644 applications/msplicer/seqdict.py delete mode 100644 applications/msplicer/signal_detectors.py delete mode 100644 applications/ocr/Ai.py delete mode 100644 applications/ocr/FigureWidget.py delete mode 100644 applications/ocr/MatrixWidget.py delete mode 100644 applications/ocr/QuadrWidget.py delete mode 100644 applications/ocr/README delete mode 100644 applications/ocr/common.py delete mode 120000 applications/ocr/data delete mode 100755 applications/ocr/predict delete mode 100755 applications/ocr/train delete mode 100644 applications/tapkee/faces_embedding.py delete mode 100644 applications/tapkee/octave_ltsa.m delete mode 100644 applications/tapkee/samples/data.py delete mode 100644 applications/tapkee/samples/dm.py delete mode 100644 applications/tapkee/samples/hlle.py delete mode 100644 applications/tapkee/samples/isomap.py delete mode 100644 applications/tapkee/samples/klle.py delete mode 100644 applications/tapkee/samples/la.py delete mode 100644 applications/tapkee/samples/lle.py delete mode 100644 applications/tapkee/samples/lltsa.py delete mode 100644 applications/tapkee/samples/lpp.py delete mode 100644 applications/tapkee/samples/ltsa.py delete mode 100644 applications/tapkee/samples/mds.py delete mode 100644 applications/tapkee/samples/npe.py delete mode 100644 applications/tapkee/swissroll_embedding.py delete mode 100644 applications/tapkee/words_embedding.py delete mode 100644 benchmarks/hasheddoc_benchmarks.cpp delete mode 100644 benchmarks/kernel_matrix_sum_benchmark.cpp delete mode 100644 benchmarks/rf_feats_benchmark.cpp delete mode 100644 benchmarks/rf_feats_kernel_comp.cpp delete mode 100644 benchmarks/sparse_test.cpp diff --git a/applications/arts/__init__.py b/applications/arts/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/applications/arts/arts.py b/applications/arts/arts.py deleted file mode 100755 index 469d40c37ae..00000000000 --- a/applications/arts/arts.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# Written (W) 2008-2009 Soeren Sonnenburg -# Copyright (C) 2008-2009 Fraunhofer Institute FIRST and Max-Planck-Society - -import numpy -import os -import sys -import optparse -import bz2 -from signal_sensor import SignalSensor -from genomic import read_single_fasta - -arts_version = 'v0.3' -def_file = bz2.BZ2File('data/ARTS.dat.bz2') - -def print_version(): - sys.stderr.write('arts ' + arts_version + '\n') - -def parse_options(): - parser = optparse.OptionParser(usage="usage: %prog [options] seq.fa") - - parser.add_option("-o", "--outfile", type="str", default='stdout', - help="File to write the results to") - parser.add_option("-v", "--version", default=False, - help="Show some more information") - parser.add_option("--organism", type="str", default='Worm', - help="""use model for organism when predicting - (one of Cress, Fish, Fly, Human, Worm)""") - - (options, args) = parser.parse_args() - if options.version: - print_version() - sys.exit(0) - - if len(args) != 1: - parser.error("incorrect number of arguments") - - fafname = args[0] - if not os.path.isfile(fafname): - parser.error("fasta file does not exist") - - if options.outfile == 'stdout': - outfile = sys.stdout - else: - try: - outfile = file(options.outfile, 'w') - except IOError: - parser.error("could not open %s for writing" % options.outfile) - - return (fafname, outfile) - -if __name__ == '__main__': - (fafname, outfile) = parse_options() - seq = read_single_fasta(fafname) - - arts = SignalSensor() - arts.from_file(def_file) - preds = arts.predict(seq) - - for p in preds: - outfile.write('%+g\n' % p) diff --git a/applications/arts/convert_artsmat.m b/applications/arts/convert_artsmat.m deleted file mode 100644 index 2adefa1518e..00000000000 --- a/applications/arts/convert_artsmat.m +++ /dev/null @@ -1,82 +0,0 @@ -function convert_mat() - -%load('ARTS-info.mat'); -load('/home/sonne/stuff/ARTS-info.mat'); -targetname='ARTS.dat'; - -fid=fopen(targetname,'wb'); - -fprintf(fid, '%%arts version: 1.0\n\n'); - -idx=find(alphas~=0); -data=[trainData.xPos,trainData.xNeg]; -alphas=alphas(idx); -center=trainData.tssPosition; - -svs1=data(par.select1+center, idx); -svs2=data(par.select2+center, idx); -svs3=data(par.select3+center, idx); - -fprintf(fid, 'b=%e\n', b); -fprintf(fid, 'alphas='); -write_mat(fid, alphas); - -fprintf(fid, 'num_kernels=%d\n', 3); - -fprintf(fid, 'kernel_name1=%s\n', 'wdshift'); -fprintf(fid, 'kernel_left1=%d\n', min(par.select1)); -fprintf(fid, 'kernel_center1=%d\n', 0); -fprintf(fid, 'kernel_right1=%d\n', max(par.select1)); -fprintf(fid, 'kernel_order1=%d\n', par.order1); -fprintf(fid, 'kernel_shift1=%d\n', par.shift1); -fprintf(fid, 'kernel_svs1='); -write_string(fid, svs1); -fprintf(fid,'\n'); - -fprintf(fid, 'kernel_name2=%s\n', 'spectrum'); -fprintf(fid, 'kernel_left2=%d\n', min(par.select2)); -fprintf(fid, 'kernel_center2=%d\n', 0); -fprintf(fid, 'kernel_right2=%d\n', max(par.select2)); -fprintf(fid, 'kernel_order2=%d\n', par.wordLen2); -fprintf(fid, 'kernel_svs2='); -write_string(fid, svs2); -fprintf(fid,'\n'); - -fprintf(fid, 'kernel_name3=%s\n', 'spectrum'); -fprintf(fid, 'kernel_left3=%d\n', min(par.select3)); -fprintf(fid, 'kernel_center3=%d\n', 0); -fprintf(fid, 'kernel_right3=%d\n', max(par.select3)); -fprintf(fid, 'kernel_order3=%d\n', par.wordLen3); -fprintf(fid, 'kernel_svs3='); -write_string(fid, svs3); -fprintf(fid,'\n'); -fclose(fid); - -system(sprintf('bzip2 -9 "%s"\n', targetname)); - -function write_string(fid, x) - fprintf(fid, '[\n'); - for i=1:size(x,2), - fprintf(fid, '%c', x(1:(size(x,1)-1),i)); - fprintf(fid, '%c\n', x(size(x,1),i)); - end - fprintf(fid, ']\n'); - -function write_mat(fid, x) - if size(x,1)==1, - fprintf(fid, '['); - fprintf(fid, '%e, ', x(1:(length(x)-1))); - fprintf(fid, '%e', x(end)); - else - fprintf(fid, '['); - for i=1:size(x,2), - fprintf(fid, '%e, ', x(1:(size(x,1)-1),i)); - - if i'): - key = s[1:-1] - fasta[key] = "" - else: - fasta[key] += s[:-1] - - return fasta - -""" write dictionary fasta """ -def write_fasta(f, d, linelen=60): - for k in sorted(d): - f.write('>%s\n' % k); - s = d[k] - for i in xrange(0, len(s), linelen): - f.write(s[i:i + linelen] + '\n') - -def write_gff(f, (source, version), (seqtype, seqname), descrlist, skipheader=False): - """ writes a gff version 2 file - descrlist is a list of dictionaries, each of which contain these fields: - [attributes] [comments] - """ - - if not skipheader: - f.write('##gff-version 2\n') - f.write('##source-version %s %s\n' % (source, version)) - - t = time.localtime() - f.write("##date %d-%d-%d %d:%d:%d\n" % t[0:6]) - - f.write('##Type %s %s\n' % (seqtype, seqname)) - - for d in descrlist: - f.write('%s\t%s\t%s\t%d\t%d\t%+f\t%s\t%d' % (d['seqname'], d['source'], - d['feature'], d['start'], d['end'], - d['score'], d['strand'], d['frame'])) - if d.has_key('attributes'): - f.write('\t' + d['attributes']) - if d.has_key('comments'): - f.write('\t' + d['comments']) - f.write('\n') - - -if __name__ == '__main__': - import sys, os - - table = read_table_browser(file('/fml/ag-raetsch/home/sonne/addnet/tfbs/share/data/wt1_bibliosphere_table_browser_hg17.txt')) - print table.keys() - print table[table.keys()[0]] - d = { 'ahoernchen' : 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT', - 'bhoernchen' : 'GATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACA' } - - write_fasta(sys.stdout, d) - write_fasta(file('/tmp/test.fa', 'w'), d) - - d2 = read_fasta(file('/tmp/test.fa')) - os.unlink('/tmp/test.fa') - - print d - print d2 - print d == d2 - - p = load_genomic('chr5', '+', 100000, 100100, 'hg17') - n = load_genomic('chr1', '-', 3000000, 3001000, 'mm7') - write_single_fasta('bla.fa', 'bla', 'ACGT') - n2 = read_single_fasta('bla.fa') diff --git a/applications/arts/signal_sensor.py b/applications/arts/signal_sensor.py deleted file mode 100644 index 0498570457a..00000000000 --- a/applications/arts/signal_sensor.py +++ /dev/null @@ -1,225 +0,0 @@ -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# Written (W) 2008 Soeren Sonnenburg -# Written (W) 2011 Christian Widmer -# Copyright (C) 2008-2011 Fraunhofer Institute FIRST and Max-Planck-Society - -import numpy -import sys - -from util import * - -from shogun import StringCharFeatures, StringWordFeatures, CombinedFeatures, DNA -from shogun import CombinedKernel, WeightedDegreePositionStringKernel -from shogun import K_COMMWORDSTRING, CommWordStringKernel, IdentityKernelNormalizer -from shogun import SortWordString -from shogun import KernelMachine - - -class Sensor(object): - """ - Sensor has window (left,center,right) of length right-left+1 - with center at "center" - """ - - def __init__(self, window=None, kernel=None, train_features=None): - self.kernel = kernel - self.window = window - self.train_features = train_features - self.preproc = None - - def from_file(self, file, num): - """ - parse lines with num as suffix, e.g. - - kernel_= - """ - l = file.readline() - - name = None - left = None - right = None - center = None - order = None - shift = None - svs = None - - while l: - if l.find('%d=' % num) > -1: - if name is None: name = parse_name(l, 'kernel_name%d' % num) - if left is None: left = parse_int(l, 'kernel_left%d' % num) - if right is None: right = parse_int(l, 'kernel_right%d' % num) - if center is None: center = parse_int(l, 'kernel_center%d' % num) - if order is None: order = parse_int(l, 'kernel_order%d' % num) - if shift is None: shift = parse_int(l, 'kernel_shift%d' % num) - if svs is None: svs = parse_string(l, file, 'kernel_svs%d' % num) - else: - self.window = (left, center, right) - return self.init_sensor({ 'name' : name, 'order': order, 'shift' : shift}, svs) - - l = file.readline() - - def init_sensor(self, kernel, svs): - f = StringCharFeatures(svs, DNA) - - kname = kernel['name'] - if kname == 'spectrum': - wf = StringWordFeatures(f.get_alphabet()) - wf.obtain_from_char(f, kernel['order'] - 1, kernel['order'], 0, False) - - pre = SortWordString() - pre.init(wf) - wf.add_preprocessor(pre) - wf.apply_preprocessor() - f = wf - - k = CommWordStringKernel(0, False) - k.set_use_dict_diagonal_optimization(kernel['order'] < 8) - self.preproc = pre - - elif kname == 'wdshift': - k = WeightedDegreePositionStringKernel(0, kernel['order']) - k.set_normalizer(IdentityKernelNormalizer()) - k.set_shifts(kernel['shift'] * - numpy.ones(f.get_max_vector_length(), dtype=numpy.int32)) - k.set_position_weights(1.0 / f.get_max_vector_length() * - numpy.ones(f.get_max_vector_length(), dtype=numpy.float64)) - else: - raise "Currently, only wdshift and spectrum kernels supported" - - self.kernel = k - self.train_features = f - - return (self.kernel, self.train_features) - - def get_test_features(self, seq, window): - start = self.window[0] - window[0] - end = len(seq) - window[1] + self.window[2] - size = self.window[2] - self.window[0] + 1 - seq = seq[start:end] - seq = seq.replace("N", "A").replace("R", "A").replace("M", "A") - f = StringCharFeatures([seq], DNA) - - if self.preproc: - wf = StringWordFeatures(f.get_alphabet()) - o = self.train_features.get_order() - wf.obtain_from_char(f, 0, o, 0, False) - f = wf - f.obtain_by_sliding_window(size, 1, o - 1) - else: - f.obtain_by_sliding_window(size, 1) - - return f - -class SignalSensor(object): - """ - A collection of sensors - """ - def __init__(self): - self.sensors = list() - self.kernel = CombinedKernel() - self.svs = CombinedFeatures() - self.svm = None - self.window = (+100000, -1000000) - - def from_file(self, file): - sys.stderr.write('loading model file') - l = file.readline(); - - if l != '%arts version: 1.0\n': - sys.stderr.write("\nfile not an arts definition file\n") - return None - - bias = None - alphas = None - num_kernels = None - - while l: - # skip comment or empty line - if not (l.startswith('%') or l.startswith('\n')): - if bias is None: bias = parse_float(l, 'b') - if alphas is None: alphas = parse_vector(l, file, 'alphas') - if num_kernels is None: num_kernels = parse_int(l, 'num_kernels') - - if num_kernels and bias and alphas is not None: - for i in xrange(num_kernels): - s = Sensor() - (k, f) = s.from_file(file, i + 1) - k.io.enable_progress() - self.window = (min(self.window[0], s.window[0]), - max(self.window[1], s.window[2])) - self.sensors.append(s) - self.kernel.append_kernel(k) - self.svs.append_feature_obj(f) - - self.kernel.init(self.svs, self.svs) - self.svm = KernelMachine(self.kernel, alphas, - numpy.arange(len(alphas), dtype=numpy.int32), bias) - self.svm.io.set_target_to_stderr() - self.svm.io.enable_progress() - self.svm.parallel.set_num_threads(self.svm.parallel.get_num_cpus()) - sys.stderr.write('done\n') - return - - l = file.readline() - - sys.stderr.write('error loading model file\n') - - - def predict(self, seq, chunk_size = int(10e6)): - """ - predicts on whole contig, splits up sequence in chunks of size chunk_size - """ - - seq_len = len(seq) - num_chunks = int(numpy.ceil(float(seq_len) / float(chunk_size))) - assert(num_chunks > 0) - - sys.stderr.write("number of chunks for contig: %i\n" % (num_chunks)) - - start = 0 - stop = min(chunk_size, seq_len) - - out = [] - - # iterate over chunks - for chunk_idx in range(num_chunks): - - sys.stderr.write("processing chunk #%i\n" % (chunk_idx)) - - assert (start < stop) - chunk = seq[start:stop] - - assert(len(self.sensors) > 0) - tf = CombinedFeatures() - for i in xrange(len(self.sensors)): - f = self.sensors[i].get_test_features(chunk, self.window) - tf.append_feature_obj(f) - - sys.stderr.write("initialising kernel...") - self.kernel.init(self.svs, tf) - sys.stderr.write("..done\n") - - self.svm.set_kernel(self.kernel) - lab_out = self.svm.apply().get_values() - - assert(len(lab_out) > 0) - out.extend(lab_out) - - # increment chunk - start = stop - stop = min(stop+chunk_size, seq_len) - - - l = (-self.window[0]) * [-42] - r = self.window[1] * [-42] - - # concatenate - ret = l + out + r - - assert(len(ret) == len(seq)) - - return ret diff --git a/applications/arts/util.py b/applications/arts/util.py deleted file mode 100644 index 9c8e653def8..00000000000 --- a/applications/arts/util.py +++ /dev/null @@ -1,82 +0,0 @@ -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# Written (W) 2006-2008 Soeren Sonnenburg -# Written (W) 2007 Gunnar Raetsch -# Copyright (C) 2006-2008 Fraunhofer Institute FIRST and Max-Planck-Society -# - -import sys -from numpy import mat, array, inf, any, reshape, int32 - -def parse_name(line, name): - if (line.startswith(name)): - sys.stdout.write('.'); sys.stdout.flush() - return line[line.find('=') + 1:-1] - else: - return None - -def parse_int(line, name): - if (line.startswith(name)): - sys.stdout.write('.'); sys.stdout.flush() - return int(line[line.find('=') + 1:-1]) - else: - return None - -def parse_float(line, name): - if (line.startswith(name)): - sys.stdout.write('.'); sys.stdout.flush() - return float(line[line.find('=') + 1:-1]) - else: - return None - -def parse_vector(line, file, name): - mat = parse_matrix(line, file, name) - if mat is None: - return mat - else: - mat = array(mat).flatten() - return mat - -def parse_string(line, file, name): - if (line.startswith(name)): - sys.stdout.write('.'); sys.stdout.flush() - l = '' - lines = [] - while l is not None and l.find(']') < 0: - if l: - lines.append(l[:-1]) - l = file.readline() - - if l.find(']') < 0: - sys.stderr.write("string ended without ']'\n") - return None - else: - return lines - else: - return None - -def parse_matrix(line, file, name): - if (line.startswith(name)): - sys.stdout.write('.'); sys.stdout.flush() - if line.find(']') < 0: - l = '' - while l is not None and l.find(']') < 0: - line += l - l = file.readline() - if l is not None and l.find(']') >= 0: - line += l - - if line.find(']') < 0: - sys.stderr.write("matrix `" + name + "' ended without ']'\n") - return None - else: - mm = mat(line[line.find('['):line.find(']') + 1]) - if len(mm.shape) == 1: - mm = reshape(mm.shape[0], 1) - return mm - else: - return None diff --git a/applications/asp/LICENSE b/applications/asp/LICENSE deleted file mode 100644 index 5b6e7c66c27..00000000000 --- a/applications/asp/LICENSE +++ /dev/null @@ -1,340 +0,0 @@ - GNU GENERAL PUBLIC LICENSE - Version 2, June 1991 - - Copyright (C) 1989, 1991 Free Software Foundation, Inc. - 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -License is intended to guarantee your freedom to share and change free -software--to make sure the software is free for all its users. This -General Public License applies to most of the Free Software -Foundation's software and to any other program whose authors commit to -using it. (Some other Free Software Foundation software is covered by -the GNU Library General Public License instead.) You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -this service if you wish), that you receive source code or can get it -if you want it, that you can change the software or use pieces of it -in new free programs; and that you know you can do these things. - - To protect your rights, we need to make restrictions that forbid -anyone to deny you these rights or to ask you to surrender the rights. -These restrictions translate to certain responsibilities for you if you -distribute copies of the software, or if you modify it. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must give the recipients all the rights that -you have. You must make sure that they, too, receive or can get the -source code. And you must show them these terms so they know their -rights. - - We protect your rights with two steps: (1) copyright the software, and -(2) offer you this license which gives you legal permission to copy, -distribute and/or modify the software. - - Also, for each author's protection and ours, we want to make certain -that everyone understands that there is no warranty for this free -software. If the software is modified by someone else and passed on, we -want its recipients to know that what they have is not the original, so -that any problems introduced by others will not reflect on the original -authors' reputations. - - Finally, any free program is threatened constantly by software -patents. We wish to avoid the danger that redistributors of a free -program will individually obtain patent licenses, in effect making the -program proprietary. To prevent this, we have made it clear that any -patent must be licensed for everyone's free use or not licensed at all. - - The precise terms and conditions for copying, distribution and -modification follow. - - GNU GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License applies to any program or other work which contains -a notice placed by the copyright holder saying it may be distributed -under the terms of this General Public License. The "Program", below, -refers to any such program or work, and a "work based on the Program" -means either the Program or any derivative work under copyright law: -that is to say, a work containing the Program or a portion of it, -either verbatim or with modifications and/or translated into another -language. (Hereinafter, translation is included without limitation in -the term "modification".) Each licensee is addressed as "you". - -Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running the Program is not restricted, and the output from the Program -is covered only if its contents constitute a work based on the -Program (independent of having been made by running the Program). -Whether that is true depends on what the Program does. - - 1. You may copy and distribute verbatim copies of the Program's -source code as you receive it, in any medium, provided that you -conspicuously and appropriately publish on each copy an appropriate -copyright notice and disclaimer of warranty; keep intact all the -notices that refer to this License and to the absence of any warranty; -and give any other recipients of the Program a copy of this License -along with the Program. - -You may charge a fee for the physical act of transferring a copy, and -you may at your option offer warranty protection in exchange for a fee. - - 2. You may modify your copy or copies of the Program or any portion -of it, thus forming a work based on the Program, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) You must cause the modified files to carry prominent notices - stating that you changed the files and the date of any change. - - b) You must cause any work that you distribute or publish, that in - whole or in part contains or is derived from the Program or any - part thereof, to be licensed as a whole at no charge to all third - parties under the terms of this License. - - c) If the modified program normally reads commands interactively - when run, you must cause it, when started running for such - interactive use in the most ordinary way, to print or display an - announcement including an appropriate copyright notice and a - notice that there is no warranty (or else, saying that you provide - a warranty) and that users may redistribute the program under - these conditions, and telling the user how to view a copy of this - License. (Exception: if the Program itself is interactive but - does not normally print such an announcement, your work based on - the Program is not required to print an announcement.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Program, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Program, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Program. - -In addition, mere aggregation of another work not based on the Program -with the Program (or with a work based on the Program) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may copy and distribute the Program (or a work based on it, -under Section 2) in object code or executable form under the terms of -Sections 1 and 2 above provided that you also do one of the following: - - a) Accompany it with the complete corresponding machine-readable - source code, which must be distributed under the terms of Sections - 1 and 2 above on a medium customarily used for software interchange; or, - - b) Accompany it with a written offer, valid for at least three - years, to give any third party, for a charge no more than your - cost of physically performing source distribution, a complete - machine-readable copy of the corresponding source code, to be - distributed under the terms of Sections 1 and 2 above on a medium - customarily used for software interchange; or, - - c) Accompany it with the information you received as to the offer - to distribute corresponding source code. (This alternative is - allowed only for noncommercial distribution and only if you - received the program in object code or executable form with such - an offer, in accord with Subsection b above.) - -The source code for a work means the preferred form of the work for -making modifications to it. For an executable work, complete source -code means all the source code for all modules it contains, plus any -associated interface definition files, plus the scripts used to -control compilation and installation of the executable. However, as a -special exception, the source code distributed need not include -anything that is normally distributed (in either source or binary -form) with the major components (compiler, kernel, and so on) of the -operating system on which the executable runs, unless that component -itself accompanies the executable. - -If distribution of executable or object code is made by offering -access to copy from a designated place, then offering equivalent -access to copy the source code from the same place counts as -distribution of the source code, even though third parties are not -compelled to copy the source along with the object code. - - 4. You may not copy, modify, sublicense, or distribute the Program -except as expressly provided under this License. Any attempt -otherwise to copy, modify, sublicense or distribute the Program is -void, and will automatically terminate your rights under this License. -However, parties who have received copies, or rights, from you under -this License will not have their licenses terminated so long as such -parties remain in full compliance. - - 5. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Program or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Program (or any work based on the -Program), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Program or works based on it. - - 6. Each time you redistribute the Program (or any work based on the -Program), the recipient automatically receives a license from the -original licensor to copy, distribute or modify the Program subject to -these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties to -this License. - - 7. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Program at all. For example, if a patent -license would not permit royalty-free redistribution of the Program by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Program. - -If any portion of this section is held invalid or unenforceable under -any particular circumstance, the balance of the section is intended to -apply and the section as a whole is intended to apply in other -circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system, which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 8. If the distribution and/or use of the Program is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Program under this License -may add an explicit geographical distribution limitation excluding -those countries, so that distribution is permitted only in or among -countries not thus excluded. In such case, this License incorporates -the limitation as if written in the body of this License. - - 9. The Free Software Foundation may publish revised and/or new versions -of the General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - -Each version is given a distinguishing version number. If the Program -specifies a version number of this License which applies to it and "any -later version", you have the option of following the terms and conditions -either of that version or of any later version published by the Free -Software Foundation. If the Program does not specify a version number of -this License, you may choose any version ever published by the Free Software -Foundation. - - 10. If you wish to incorporate parts of the Program into other free -programs whose distribution conditions are different, write to the author -to ask for permission. For software which is copyrighted by the Free -Software Foundation, write to the Free Software Foundation; we sometimes -make exceptions for this. Our decision will be guided by the two goals -of preserving the free status of all derivatives of our free software and -of promoting the sharing and reuse of software generally. - - NO WARRANTY - - 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY -FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN -OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES -PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED -OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS -TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE -PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, -REPAIR OR CORRECTION. - - 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR -REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, -INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING -OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED -TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY -YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER -PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE -POSSIBILITY OF SUCH DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - -Also add information on how to contact you by electronic and paper mail. - -If the program is interactive, make it output a short notice like this -when it starts in an interactive mode: - - Gnomovision version 69, Copyright (C) year name of author - Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, the commands you use may -be called something other than `show w' and `show c'; they could even be -mouse-clicks or menu items--whatever suits your program. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the program, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the program - `Gnomovision' (which makes passes at compilers) written by James Hacker. - - , 1 April 1989 - Ty Coon, President of Vice - -This General Public License does not permit incorporating your program into -proprietary programs. If your program is a subroutine library, you may -consider it more useful to permit linking proprietary applications with the -library. If this is what you want to do, use the GNU Library General -Public License instead of this License. diff --git a/applications/asp/NEWS b/applications/asp/NEWS deleted file mode 100644 index ce5724b8a19..00000000000 --- a/applications/asp/NEWS +++ /dev/null @@ -1,11 +0,0 @@ -2009-05-25 Soeren Sonnenburg - - * Asp version 0.2 - - Fixes for shogun 0.7.3 - - Several bugfixes (including input was shifted by one) - - Drastic speedups (e.g., predictions take ~10 minutes for Human ChrY) - -2008-10-14 Soeren Sonnenburg - - * Asp version 0.1 - - Initial release of the accurate splice site predictor diff --git a/applications/asp/README b/applications/asp/README deleted file mode 100644 index 5842a591eb0..00000000000 --- a/applications/asp/README +++ /dev/null @@ -1,45 +0,0 @@ -This is the accurate splicer (asp) program accompanying the paper -"Accurate Splice Site Prediction Using Support Vector Machines" -by Soeren Sonnenburg, Gabriele Schweikert, Petra Philips, -Jonas Behr and Gunnar Raetsch [1]. - - -ASP PROGRAM REQUIREMENTS: - -Asp requires a working python (2.4 or later) installation with numpy -(version 1.0 or later) and the shogun toolbox (version 0.7.3 or later) -- which is available from http://www.shogun-toolbox.org for Linux, MacOSX, -cygwin/win32. If you are running Debian GNU Linux, shogun 0.7.3 is available in -debian unstable http://packages.debian.org/unstable/science/shogun-python-modular. - -ASP PROGRAM RUNNING TIME AND MEMORY REQUIREMENTS: - -Asp requires about 100M of memory for short sequences. Memory requirements -don't grow much (a additional linear term w.r.t. the length of the input -sequence). On first run with a new model (see --model option below), -asp will load and decompress the .bz2 compressed model file and store it -as a python native pickle dump, which increases startup times a lot. -Due to the optimizations in [2] splice form prediction (layer 1) times -won't change much for many/long sequences. - -ASP PROGRAM USAGE: - -./asp fasta_file.fa - -This will read all entries in the .fa file and print a .gff file with the -predictions for each of the entries to stdout. One may optionally specify the -start and stop of the transcript via --start / --stop and -the model via --model one of worm, fly, cress, fish, human. - is zero based. - - -REFERENCES: - -[1] S. Sonnenburg, G. Schweikert, P. Philips, J. Behr and Gunnar Raetsch, - Accurate Splice Site Prediction, BMC Bioinformatics, Special Issue from NIPS workshop on - New Problems and Methods in Computational Biology Whistler, Canada, 18 December 2006}, - December, 2007, BMC Bioinformatics,8:(Suppl. 10):S7 - -[2] Sonnenburg, S, Rätsch, G, Schäfer, C, Schölkopf, B. Large Scale Multiple - Kernel Learning. Journal of Machine Learning Research,7:1531-1565, - July 2006, K.Bennett and E.P.-Hernandez Editors. diff --git a/applications/asp/__init__.py b/applications/asp/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/applications/asp/asp b/applications/asp/asp deleted file mode 100755 index e5f9d21eec4..00000000000 --- a/applications/asp/asp +++ /dev/null @@ -1,319 +0,0 @@ -#!/usr/bin/env python -""" -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -Written (W) 2007,2011 Gunnar Raetsch -Written (W) 2006-2009 Soeren Sonnenburg -Copyright (C) 2006-2010 Fraunhofer Institute FIRST and Max-Planck-Society -""" - -try: - import os - import os.path - import sys - import pickle - import bz2 - import numpy - import optparse - import array - import math - - import genomic - import model - import seqdict - import shogun - - d=shogun.WeightedDegreeStringKernel(1) - if (d.version.get_version_revision() < 2997): - print - print "ERROR: SHOGUN VERSION 0.6.2 or later required" - print - sys.exit(1) - from signal_detectors import signal_detectors -except ImportError: - print - print "ERROR IMPORTING MODULES, MAKE SURE YOU HAVE SHOGUN INSTALLED" - print - sys.exit(1) - -asp_version='v0.3' - -class asp: - def __init__(self): - self.model = None - self.signal = None - self.model_name = None - - def sigmoid_transform(self, x): - return 1/(1+math.exp(-(x+2))) ; - - def load_model(self, filename): - self.model_name = filename - f=None - picklefile=filename+'.pickle' - if os.path.isfile(picklefile): - self.model=pickle.load(file(picklefile)) - else: - if filename.endswith('.bz2'): - f=bz2.BZ2File(filename); - else: - f=file(filename); - - self.model=model.parse_file(f) - f.close() - - f=file(picklefile,'w') - pickle.dump(self.model, f) - f.close() - - self.signal=signal_detectors(self.model) - - def write_gff(self, outfile, preds, name, score_type, skipheader, strand): - genomic.write_gff_header(outfile, ('asp',asp_version + ' ' + self.model_name), - ('DNA', name)) - - for i in xrange(len(preds[0])): - d=dict() - d['seqname']=name - d['source']='asp' - d['feature']=preds[0][i] - d['start']=preds[1][i] - d['end']=preds[1][i]+1 - if score_type=='output': - d['score']=preds[2][i] - else: - d['score']=self.sigmoid_transform(preds[2][i]) - d['strand']=strand - d['frame']=0 - genomic.write_gff_line(outfile, d) - - def write_spf(self, outfile, preds, name, score_type, skipheader, strand): - genomic.write_spf_header(outfile, ('asp', asp_version + ' ' + self.model_name), - ('DNA', name)) - - for i in xrange(len(preds[0])): - d=dict() - d['seqname']=name - d['source']=score_type - if preds[0][i]=='AG': - d['feature']='acc' - if strand=='+': - d['position']=preds[1][i]+2 - else: - d['position']=preds[1][i]-1 - else: - d['feature']='don' - if strand=='+': - d['position']=preds[1][i] - else: - d['position']=preds[1][i]+1 - if score_type=='output': - d['score']=preds[2][i] - else: - d['score']=self.sigmoid_transform(preds[2][i]) - d['strand']=strand - genomic.write_spf_line(outfile, d) - - def write_binary(self, preds, site, strand, score_type, binary_out, binary_pos): - out=array.array('f') - if score_type=='output': - out.fromlist(preds[2]) - else: - outputs=[self.sigmoid_transform(o) for o in preds[2]] ; - out.fromlist(outputs) - - # move positions consistent with spf output - if site=='acc': - if strand=='+': - p=[i+2 for i in preds[1]] - else: - p=[i-1 for i in preds[1]] - else: - if strand=='+': - p=[i for i in preds[1]] - else: - p=[i+1 for i in preds[1]] - - pos=array.array('i') - pos.fromlist(p) ; - out.tofile(binary_out) - pos.tofile(binary_pos) - - - def predict_file(self, fname, (start,end), output_format, score_type, strand='+'): - skipheader=False - fasta_dict = genomic.read_fasta(file(fname)) - - if strand=='-': - for k, kseq in fasta_dict.ordered_items(): - fasta_dict[k]=genomic.reverse_complement(kseq) - - sys.stdout.write('found fasta file with ' + `len(fasta_dict)` + ' sequence(s) (strand=%s)\n' % strand) - seqs= seqdict.seqdict(fasta_dict, (start,end)) - - #get donor/acceptor signal predictions for all sequences - self.signal.predict_acceptor_sites_from_seqdict(seqs) - self.signal.predict_donor_sites_from_seqdict(seqs) - - contig_no = 0 ; - for seq in seqs: - contig_no = contig_no + 1 - - l=len(seq.preds['donor'].get_positions()) - p=[i+1 for i in seq.preds['donor'].get_positions()] - s=seq.preds['donor'].get_scores() - f=[] - for pos in p: - if seq.seq[pos-1:pos+1]=='GT': - f.append(('GT')) - else: - f.append(('GC')) - assert(seq.seq[pos-1:pos+1]=='GC') - - if strand=='-': - p=p[len(p)::-1] - p=[len(seq.seq)-i for i in p] - s=s[len(s)::-1] - f=f[len(f)::-1] - - don_preds=(f,p,s) - - l=len(seq.preds['acceptor'].get_positions()) - p=[i-1 for i in seq.preds['acceptor'].get_positions()] - s=seq.preds['acceptor'].get_scores() - f=l*['AG'] - - if strand=='-': - p=p[len(p)::-1] - p=[len(seq.seq)-i for i in p] - s=s[len(s)::-1] - f=f[len(f)::-1] - - acc_preds=(f,p,s) - - if output_format == 'binary': - assert(len(binary_basename)>0) - binary_out=file(binary_basename+'/acc/contig_%i%c.%s' % (contig_no, strand, score_type), 'w') - binary_pos=file(binary_basename+'/acc/contig_%i%c.pos' % (contig_no, strand), 'w') - self.write_binary(acc_preds, 'acc', strand, score_type, binary_out, binary_pos) - binary_out.close() - binary_pos.close() - binary_out=file(binary_basename+'/don/contig_%i%c.%s' % (contig_no, strand, score_type), 'w') - binary_pos=file(binary_basename+'/don/contig_%i%c.pos' % (contig_no, strand), 'w') - self.write_binary(don_preds, 'don', strand, score_type, binary_out, binary_pos) - binary_out.close() - binary_pos.close() - else: - if output_format == 'gff': - self.write_gff(outfile, acc_preds, seq.name, score_type, skipheader, strand) - self.write_gff(outfile, don_preds, seq.name, score_type, skipheader, strand) - else: - if output_format == 'spf': - self.write_spf(outfile, acc_preds, seq.name, score_type, skipheader, strand) - self.write_spf(outfile, don_preds, seq.name, score_type, skipheader, strand) - - -def print_version(): - sys.stdout.write('asp '+asp_version+'\n') - -def parse_options(): - parser = optparse.OptionParser(usage="usage: %prog [options] seq.fa") - - parser.add_option("-g", "--gff-file", type="str", - help="File to write the results in GFF format to the given file") - parser.add_option("-s", "--spf-file", type="str", default='stdout', - help="File to write the results in SPF format to the given file") - parser.add_option("-b", "--binary-basename", type="str", - help="Write results in binary format to file starting with this basename") - parser.add_option("-v", "--version", dest='version', default=False, action='store_true', - help="Show some more information") - parser.add_option("-t", "--transform", dest='transform', default=False, action='store_true', - help="Apply sigmoid transform to scale predictions between 0 and 1") - parser.add_option("--start", type="int", default=499, - help="coding start (zero based, relative to sequence start)") - parser.add_option("--stop", type="int", default=-499, - help="""coding stop (zero based, if positive relative to - sequence start, if negative relative to sequence end)""") - parser.add_option("--organism", type="str", default='Worm', - help="""use asp model for organism when predicting - (one of Cress, Fish, Fly, Human, Worm)""") - - (options, args) = parser.parse_args() - if options.version: - print_version() - sys.exit(0) - - score_type = 'output' - if options.transform!=False: - score_type = 'Conf_cum' ; - - if len(args) != 1: - parser.error("incorrect number of arguments") - - fafname=args[0] - if not os.path.isfile(fafname): - parser.error("fasta file does not exist") - - modelfname = 'data/%s.dat.bz2' % options.organism - print "loading model file " + modelfname, - - if not os.path.isfile(modelfname): - print "...not found!\n" - parser.error("""model should be one of: - -Cress, Fish, Fly, Human, Worm -""") - - if (options.gff_file and (options.spf_file!='stdout' or options.binary_basename)) or (options.spf_file!='stdout' and (options.gff_file or options.binary_basename)): - parser.error("Only one of the options --binary-basename, --spf-file, or --gff-file may be given") - - if (options.spf_file!='stdout' or (not options.binary_basename and not options.gff_file)): - output_format='spf' - outfile_fname = options.spf_file - if (options.gff_file): - output_format='gff' - outfile_fname = options.gff_file - if (options.binary_basename): - output_format='binary' - - if output_format!='binary': - if outfile_fname == 'stdout': - outfile=sys.stdout - else: - try: - outfile=file(outfile_fname,'w') - except IOError: - parser.error("could not open %s for writing" % outfile_fname) - - if output_format=='binary': - outfile = None - if os.system('mkdir -p %s/acc' % options.binary_basename) != 0: - parser.error("could not create directory %s/acc" % options.binary_basename) - if os.system('mkdir -p %s/don' % options.binary_basename) != 0: - parser.error("could not create directory %s/don" % options.binary_basename) - - if options.start<80: - parser.error("--start value must be >=80") - - if options.stop > 0 and options.start >= options.stop - 80: - parser.error("--stop value must be > start + 80") - - if options.stop < 0 and options.stop > -80: - parser.error("--stop value must be <= - 80") - - # shift the start and stop a bit - options.start -= 1 ; - options.stop -= 1 ; - - return ((options.start,options.stop), fafname, modelfname, output_format, score_type, outfile, options.binary_basename) - - -if __name__ == '__main__': - (startstop, fafname, modelfname, output_format, score_type, outfile, binary_basename ) = parse_options() - p=asp() - p.load_model(modelfname); - p.predict_file(fafname, startstop, output_format, score_type, '+') - p.predict_file(fafname, startstop, output_format, score_type, '-') diff --git a/applications/asp/data b/applications/asp/data deleted file mode 120000 index 4afdb0dff1c..00000000000 --- a/applications/asp/data +++ /dev/null @@ -1 +0,0 @@ -../../data/asp \ No newline at end of file diff --git a/applications/asp/galaxy/asp.sh b/applications/asp/galaxy/asp.sh deleted file mode 100644 index 00e0aa989ac..00000000000 --- a/applications/asp/galaxy/asp.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env bash - -set -e - -pwd - -cd /mnt/galaxyTools/tools/asp-0.3 -export LD_LIBRARY_PATH=/mnt/galaxyTools/tools/shogun-0.10.0/lib -export PYTHONPATH=/mnt/galaxyTools/tools/shogun-0.10.0/lib/python2.6/dist-packages - -if [ "$3" = "spf1" ] -then - if [ "${10}" = "yes" ] - then - ./asp $1 --organism=$2 -t -s $5 - else - ./asp $1 --organism=$2 -s $5 - fi -elif [ "$3" = "gff2" ] -then - if [ "${10}" = "yes" ] - then - ./asp $1 --organism=$2 -t -g $4 - else - ./asp $1 --organism=$2 -g $4 - fi -elif [ "$3" = "binary" ] -then - mkdir -p $6/pred - echo "This dataset contains acceptor splice site predictions in binary SPF format (for use with mGene, Palmapper, QPALMA)" > $7 - mkdir -p $8/pred - echo "This dataset contains donor splice site predictions in binary SPF format (for use with mGene, Palmapper, QPALMA)" > $9 - if [ "${10}" = "yes" ] - then - ./asp $1 --organism=$2 -t -b $6 - else - ./asp $1 --organism=$2 -b $6 - fi - mv $6/acc/* $6/pred/ - rmdir $6/acc - mv $6/don/* $8/pred - rmdir $6/don -fi diff --git a/applications/asp/galaxy/asp.xml b/applications/asp/galaxy/asp.xml deleted file mode 100644 index 4147de19de4..00000000000 --- a/applications/asp/galaxy/asp.xml +++ /dev/null @@ -1,115 +0,0 @@ - - Accurate splice site prediction - ./asp.sh - $fasta_input - $organism - $result_format - $spf_gff - $spf_spf - $acc_spf_binary.extra_files_path - $acc_spf_binary - $don_spf_binary.extra_files_path - $don_spf_binary - $sigmoid_transform - > $log_file - - - - - - - - - - - - - - - - - - - - - - - result_format=="gff2" - - - result_format=="spf1" - - - result_format=="binary" - - - result_format=="binary" - - - result_format=="binary" - - - - -**What it does** - -ASP_ predicts splice sites on genomic sequences of several species [1]. - -It takes a genomic sequence in FASTA format and for every position -that exhibits a splice site consensus dimer (AG for acceptor splice -sites, GT/GC for donor splice site) computes a score indicating how -likely the position is a splice site. ASP uses a window of 141 nt -around each position. Therefore, there are no predictions near the -boundaries. Predictions are performed on the forward and backward -strand for all sequences in the FASTA file. - -.. _ASP: http://www.fml.tuebingen.mpg.de/raetsch/suppl/splice - -**References:** - -[1] Soeren Sonnenburg, Gabriele Schweikert, Petra Philips, Jonas Behr, and Gunnar Raetsch: Accurate splice site prediction using support vector machines, BMC Bioinformatics 2007, 8(Suppl 10):S7. - ------- - -**Output:** - -The output is a file in GFF-like format which contains a prediction -score for each candidate location in the genomic input sequence. - -Each line contains the sequence name, the position in the sequence, -the information about which kind of splice site type (GT/GC/AG) is -considered and the SVM score. The GT and GC lines correspond to donor -splice site predictions (consensus sequence is either GT or GC). For -lines with AG, the line corresponds to acceptor splice site -predictions (AG consensus). The score is related to the likelihood -that a the given position is a splice site. The larger the score, the -more likely that there is a splice site. Scores above -1 (original SVM -scores) and above 0.7 (with sigmoid transformation) are quite likely -splice sites. - -The tool `SignalPredict` in the `mGene.web modules` section can also -predict splice sites for several organisms. Here the score is -appropriately transformed into a posterior probability that there is a -splice site at the predicted sites. These probabilities are easier to -interpret than the raw SVM outputs provided by ASP. - --------- - -.. class:: infomark - -**About formats** - -**SPF format** Signal Prediction format was designed for providing -information such as labels or predictions for specific genomic -locations (with strand information). It has requires exactly six -fields:: - - 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). - 2. signalName - possible choices include tss, tis, acc, don, cdsStop, cleave. - 3. scoreName - possible choices include label, output, Conf, Conf_Cum - 4. chromPos - The position in the chromosome. (The first base in a chromosome is numbered 1.) - 5. strand - Defines the strand - either '+' or '-'. - 6. score - The score between -infinity and infinity. If scoreName is 'label', then the score should be either -1 or 1. - - - diff --git a/applications/asp/genomic.py b/applications/asp/genomic.py deleted file mode 100644 index 6035ea9cd11..00000000000 --- a/applications/asp/genomic.py +++ /dev/null @@ -1,184 +0,0 @@ -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# Written (W) 2006-2009 Soeren Sonnenburg -# Written (W) 2006-2007 Mikio Braun -# Copyright (C) 2007 Fraunhofer Institute FIRST and Max-Planck-Society - -import time -from string import maketrans - -class ordered_dict(dict): - """ - Provide an ordered dictionary with chromosome identifiers. - """ - def __init__(self, *args, **kwargs): - dict.__init__(self, *args, **kwargs) - self._order = self.keys() - - def __setitem__(self, key, value): - dict.__setitem__(self, key, value) - if key in self._order: - self._order.remove(key) - self._order.append(key) - - def __delitem__(self, key): - dict.__delitem__(self, key) - self._order.remove(key) - - def ordered_items(self): - return [(key,self[key]) for key in self._order] - - -""" read a table browser ascii output file (http://genome.ucsc.edu/cgi-bin/hgTables) """ -def read_table_browser(f): - table=dict(); - for l in f.readlines(): - if not l.startswith('#'): - (name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,proteinID,alignID)=l.split('\t') - exonStarts=[ int(i) for i in exonStarts.split(',')[:-1] ] - exonEnds=[ int(i) for i in exonEnds.split(',')[:-1] ] - - table[name]={ 'chrom': chrom, 'strand': strand, 'txStart': int(txStart), 'txEnd': int(txEnd), - 'cdsStart': int(cdsStart), 'cdsEnd': int(cdsEnd), 'exonCount': int(exonCount), 'exonStarts': exonStarts, - 'exonEnds': exonEnds, 'proteinID': proteinID, 'alignID': alignID[:-1] } - - return table - -""" get promoter region """ -def get_promoter_region(chromosome, strand, gene_start, gene_end, genome, length): - - if strand == '+': - return load_genomic(chromosome, strand, gene_start, gene_start+length, genome, one_based=False) - elif strand == '-': - return load_genomic(chromosome, strand, gene_end, gene_end+length, genome, one_based=False) - else: - print 'unknown strand' - return None - -""" reverse + complement a DNA sequence (only letters ACGT are translated!) - FIXME won't work with all the rest like y... """ -def reverse_complement(str): - t=maketrans('acgtACGT','tgcaTGCA') - return str[len(str)::-1].translate(t) - -""" works only with .fa files that contain a single entry """ -def read_single_fasta(fname): - str=file(fname).read() - str=str[str.index('\n')+1:].replace('\n','') - return str - -""" writes only single enty .fa files """ -def write_single_fasta(fname, name, str, linelen=60): - header= '>' + name + '\n' - f=file(fname,'a') - f.write(header) - for i in xrange(0,len(str),linelen): - f.write(str[i:i+linelen]+'\n') - f.close() - -""" read fasta as dictionary """ -def read_fasta(f): - fasta=ordered_dict() - fa="" - key=None - for s in f.readlines(): - if s.startswith('>'): - if fa and key: - fasta[key]=fa - key=s[1:-1] - fasta[key]="" - fa="" - else: - fa+=s[:-1] - - if fa and key: - fasta[key]=fa - - return fasta - -def write_fasta(f, d, linelen=60): - """ write dictionary fasta """ - for k in sorted(d): - f.write('>%s\n' % k); - s = d[k] - for i in xrange(0, len(s), linelen): - f.write(s[i:i+linelen] + '\n') - -def write_gff_header(f, (source, version), (seqtype, seqname)): - """ writes a gff version 2 file - descrlist is a list of dictionaries, each of which contain these fields: - [attributes] [comments] - """ - f.write('##gff-version 2\n') - f.write('##source-version %s %s\n' % (source, version) ) - - t=time.localtime() - f.write("##date %d-%d-%d %d:%d:%d\n" % t[0:6]) - - f.write('##Type %s %s\n' % (seqtype, seqname) ) - -def write_gff_line(f, descr): - d=descr - f.write('%s\t%s\t%s\t%d\t%d\t%f\t%s\t%d' % (d['seqname'], d['source'], - d['feature'], d['start'], d['end'], - d['score'], d['strand'], d['frame'])) - if d.has_key('attributes'): - f.write('\t' + d['attributes']) - if d.has_key('comments'): - f.write('\t' + d['comments']) - f.write('\n') - -def write_spf_header(f, (source, version), (seqtype, seqname)): - """ writes a gff version 2 file - descrlist is a list of dictionaries, each of which contain these fields: - [attributes] [comments] - """ - - f.write('##spf-version 1\n') - f.write('##source-version %s %s\n' % (source, version) ) - - t=time.localtime() - f.write("##date %d-%d-%d %d:%d:%d\n" % t[0:6]) - - f.write('##Type %s %s\n' % (seqtype, seqname) ) - -def write_spf_line(f, descr): - d=descr - f.write('%s\t%s\t%s\t%d\t%s\t%f' % (d['seqname'], d['source'], - d['feature'], d['position'], - d['strand'], d['score'])) - if d.has_key('attributes'): - f.write('\t' + d['attributes']) - if d.has_key('comments'): - f.write('\t' + d['comments']) - f.write('\n') - -def write_gff(f, (source, version), (seqtype, seqname), descrlist, skipheader=False): - """ writes a gff version 2 file - descrlist is a list of dictionaries, each of which contain these fields: - [attributes] [comments] - """ - - if not skipheader: - f.write('##gff-version 2\n') - f.write('##source-version %s %s\n' % (source, version) ) - - t=time.localtime() - f.write("##date %d-%d-%d %d:%d:%d\n" % t[0:6]) - - f.write('##Type %s %s\n' % (seqtype, seqname) ) - - for d in descrlist: - f.write('%s\t%s\t%s\t%d\t%d\t%f\t%s\t%d' % (d['seqname'], d['source'], - d['feature'], d['start'], d['end'], - d['score'], d['strand'], d['frame'])) - if d.has_key('attributes'): - f.write('\t' + d['attributes']) - if d.has_key('comments'): - f.write('\t' + d['comments']) - f.write('\n') - diff --git a/applications/asp/model.py b/applications/asp/model.py deleted file mode 100644 index 1410bbcfc62..00000000000 --- a/applications/asp/model.py +++ /dev/null @@ -1,143 +0,0 @@ -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# Written (W) 2006-2008 Soeren Sonnenburg -# Written (W) 2007 Gunnar Raetsch -# Copyright (C) 2006-2008 Fraunhofer Institute FIRST and Max-Planck-Society -# - -import sys -from numpy import mat,array,inf,any,reshape,int32 - -class model(object): - #acceptor - acc_splice_b=None - acc_splice_order=None - acc_splice_window_left=None - acc_splice_window_right=None - acc_splice_alphas=None - acc_splice_svs=None - - #donor - don_splice_b=None - don_splice_order=None - don_splice_window_left=None - don_splice_window_right=None - don_splice_alphas=None - don_splice_svs=None - -def parse_file(file): - m=model() - - l=file.readline(); - - if l != '%asplicer definition file version: 1.0\n': - sys.stdout.write("\nfile not a asplicer definition file\n") - return None - - while l: - if not ( l.startswith('%') or l.startswith('\n') ): # comment - - #acceptor - if m.acc_splice_b is None: m.acc_splice_b=parse_value(l, 'acc_splice_b') - if m.acc_splice_order is None: m.acc_splice_order=parse_value(l, 'acc_splice_order') - if m.acc_splice_window_left is None: m.acc_splice_window_left=parse_value(l, 'acc_splice_window_left') - if m.acc_splice_window_right is None: m.acc_splice_window_right=parse_value(l, 'acc_splice_window_right') - if m.acc_splice_alphas is None: m.acc_splice_alphas=parse_vector(l, file, 'acc_splice_alphas') - if m.acc_splice_svs is None: m.acc_splice_svs=parse_string(l, file, 'acc_splice_svs') - - #donor - if m.don_splice_b is None: m.don_splice_b=parse_value(l, 'don_splice_b') - if m.don_splice_order is None: m.don_splice_order=parse_value(l, 'don_splice_order') - if m.don_splice_window_left is None: m.don_splice_window_left=parse_value(l, 'don_splice_window_left') - if m.don_splice_window_right is None: m.don_splice_window_right=parse_value(l, 'don_splice_window_right') - if m.don_splice_alphas is None: m.don_splice_alphas=parse_vector(l, file, 'don_splice_alphas') - if m.don_splice_svs is None: m.don_splice_svs=parse_string(l, file, 'don_splice_svs') - - l=file.readline() - - sys.stdout.write('done\n') - return m - -def parse_value(line, name): - if (line.startswith(name)): - sys.stdout.write('.'); sys.stdout.flush() - return float(line[line.find('=')+1:-1]) - else: - return None - -def parse_vector(line, file, name): - mat = parse_matrix(line, file, name) - if mat is None: - return mat - else: - mat = array(mat).flatten() - return mat - -def parse_matrix(line, file, name): - if (line.startswith(name)): - sys.stdout.write('.'); sys.stdout.flush() - if line.find(']') < 0: - l='' - while l is not None and l.find(']') < 0: - line+=l - l=file.readline() - if l is not None and l.find(']') >= 0: - line+=l - - if line.find(']') < 0: - sys.stdout.write("matrix `" + name + "' ended without ']'\n") - return None - else: - mm = mat(line[line.find('['):line.find(']')+1]) - if len(mm.shape)==1: - mm = reshape(mm.shape[0],1) - return mm - else: - return None - -def parse_string(line, file, name): - if (line.startswith(name)): - sys.stdout.write('.'); sys.stdout.flush() - l='' - lines=[] - while l is not None and l.find(']') < 0: - if l: - lines.append(l[:-1]) - l=file.readline() - - if l.find(']') < 0: - sys.stdout.write("string ended without ']'\n") - return None - else: - return lines - else: - return None - -if __name__ == '__main__': - import bz2 - import sys - import hotshot, hotshot.stats - - def load(): - f=file('data/asp_test.dat'); - m=parse_file(f); - - print m.acc_splice_b is None - print m.acc_splice_order is None - print m.acc_splice_window_left is None - print m.acc_splice_window_right is None - print m.acc_splice_alphas is None - print m.acc_splice_svs is None - - print m.don_splice_b is None - print m.don_splice_order is None - print m.don_splice_window_left is None - print m.don_splice_window_right is None - print m.don_splice_alphas is None - print m.don_splice_svs is None - - load() diff --git a/applications/asp/seqdict.py b/applications/asp/seqdict.py deleted file mode 100644 index 74d87a769b6..00000000000 --- a/applications/asp/seqdict.py +++ /dev/null @@ -1,68 +0,0 @@ -import string - -class predictions(object): - def __init__(self, positions=None, scores=None): - self.positions=positions - self.scores=scores - - def set_positions(self, positions): - self.positions=positions; - def get_positions(self): - return self.positions - - def set_scores(self, scores): - self.scores=scores - def get_scores(self): - return self.scores - - def __str__(self): - return 'positions: ' + `self.positions` + 'scores: ' + `self.scores` - def __repr__(self): - return self.__str__() - -class sequence(object): - def __init__(self, name, seq, (start,end)): - assert(start-1: - if lself.window_left: - positions.append(l+self.offset) - l=sequence.find(cons, l+1) - - positions.sort() - return positions - - def get_predictions_from_seqdict(self, seqdic, site): - """ we need to generate a huge test features object - containing all locations found in each seqdict-sequence - and each location (this is necessary to efficiently - (==fast,low memory) compute the splice outputs - """ - - seqlen=self.window_right+self.window_left+2 - - for s in seqdic: - position_list=DynamicIntArray() - - sequence=s.seq - positions=s.preds[site].positions - for j in xrange(len(positions)): - i=positions[j] - self.offset -self.window_left - position_list.append_element(i) - - t=StringCharFeatures([sequence], DNA) - t.obtain_by_position_list(seqlen, position_list) - self.wd_kernel.init(self.traindat, t) - - self.wd_kernel.io.enable_progress() - l=self.svm.apply().get_values() - self.wd_kernel.cleanup() - sys.stdout.write("\n...done...\n") - - num=len(s.preds[site].positions) - scores= num * [0] - for j in xrange(num): - scores[j]=l[j] - s.preds[site].set_scores(scores) - - def get_positions_from_seqdict(self, seqdic, site): - - for d in seqdic: - positions=list() - sequence=d.seq - for cons in self.consensus: - l=sequence.find(cons) - while l>-1: - if lself.window_left: - positions.append(l+self.offset) - l=sequence.find(cons, l+1) - positions.sort() - d.preds[site].set_positions(positions) - - def get_predictions(self, sequence, positions): - - seqlen=self.window_right+self.window_left+2 - num=len(positions) - - position_list=DynamicIntArray() - - for j in xrange(num): - i=positions[j] - self.offset - self.window_left - position_list.append_element(i) - - t=StringCharFeatures([sequence], DNA) - t.obtain_by_position_list(seqlen, position_list) - self.wd_kernel.init(self.traindat, t) - del t - - self.wd_kernel.io.enable_progress() - l=self.svm.apply().get_values() - self.wd_kernel.cleanup() - sys.stdout.write("\n...done...\n") - return l - -class signal_detectors(object): - def __init__(self, model): - don_consensus=['GC','GT'] - - self.acceptor=svm_splice_model(model.acc_splice_order, model.acc_splice_svs, - numpy.array(model.acc_splice_alphas).flatten(), model.acc_splice_b, - (model.acc_splice_window_left-2, 2, model.acc_splice_window_right+2), ['AG']) - self.donor=svm_splice_model(model.don_splice_order, model.don_splice_svs, - numpy.array(model.don_splice_alphas).flatten(), model.don_splice_b, - (model.don_splice_window_left+1, 0, model.don_splice_window_right-1), - don_consensus) - - def set_sequence(self, seq): - self.acceptor.set_sequence(seq) - self.donor.set_sequence(seq) - - def predict_acceptor_sites(self, seq): - pos=self.acceptor.get_positions(seq) - sys.stdout.write("computing svm output for acceptor positions\n") - pred=self.acceptor.get_predictions(seq, pos) - return (pos,pred) - - def predict_donor_sites(self,seq): - pos=self.donor.get_positions(seq) - sys.stdout.write("computing svm output for donor positions\n") - pred=self.donor.get_predictions(seq, pos) - return (pos,pred) - - def predict_acceptor_sites_from_seqdict(self, seqs): - self.acceptor.get_positions_from_seqdict(seqs, 'acceptor') - sys.stdout.write("computing svm output for acceptor positions\n") - self.acceptor.get_predictions_from_seqdict(seqs, 'acceptor') - - def predict_donor_sites_from_seqdict(self, seqs): - self.donor.get_positions_from_seqdict(seqs, 'donor') - sys.stdout.write("computing svm output for donor positions\n") - self.donor.get_predictions_from_seqdict(seqs, 'donor') - - def clear_acceptor(): - del self.acceptor - self.acceptor=None - - def clear_donor(): - del self.acceptor - self.acceptor=None diff --git a/applications/classification/evaluate_multiclass_labels.py b/applications/classification/evaluate_multiclass_labels.py deleted file mode 100644 index f720bb687d0..00000000000 --- a/applications/classification/evaluate_multiclass_labels.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) The Shogun Machine Learning Toolbox -# Written (w) 2014 Daniel Pyrathon -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# The views and conclusions contained in the software and documentation are those -# of the authors and should not be interpreted as representing official policies, -# either expressed or implied, of the Shogun Development Team. - - -import argparse -import logging -import numpy as np -from shogun import (LibSVMFile, MulticlassLabels, MulticlassAccuracy) -from utils import get_features_and_labels - -LOGGER = logging.getLogger(__file__) - -def parse_arguments(): - parser = argparse.ArgumentParser(description="Evaluate predicted \ - labels againsy bare truth") - parser.add_argument('--actual', required=True, type=str, - help='Path to LibSVM dataset.') - parser.add_argument('--predicted', required=True, type=str, - help='Path to serialized predicted labels') - return parser.parse_args() - - -def main(actual, predicted): - LOGGER.info("SVM Multiclass evaluator") - - # Load SVMLight dataset - feats, labels = get_features_and_labels(LibSVMFile(actual)) - - # Load predicted labels - with open(predicted, 'r') as f: - predicted_labels_arr = np.array([float(l) for l in f]) - predicted_labels = MulticlassLabels(predicted_labels_arr) - - # Evaluate accuracy - multiclass_measures = MulticlassAccuracy() - LOGGER.info("Accuracy = %s" % multiclass_measures.evaluate( - labels, predicted_labels)) - LOGGER.info("Confusion matrix:") - res = multiclass_measures.get_confusion_matrix(labels, predicted_labels) - print res - - -if __name__ == '__main__': - args = parse_arguments() - main(args.actual, args.predicted) diff --git a/applications/classification/predict_multiclass_svm.py b/applications/classification/predict_multiclass_svm.py deleted file mode 100644 index 79585ec6784..00000000000 --- a/applications/classification/predict_multiclass_svm.py +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) The Shogun Machine Learning Toolbox -# Written (w) 2014 Daniel Pyrathon -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# The views and conclusions contained in the software and documentation are those -# of the authors and should not be interpreted as representing official policies, -# either expressed or implied, of the Shogun Development Team. - - -import argparse -import logging -from contextlib import closing -from shogun import (LibSVMFile, SparseRealFeatures, MulticlassLabels, - MulticlassLibSVM, SerializableHdf5File, - MulticlassAccuracy) -from utils import get_features_and_labels - -LOGGER = logging.getLogger(__file__) - -def parse_arguments(): - parser = argparse.ArgumentParser(description="Test a serialized SVM \ - classifier agains a SVMLight test file") - parser.add_argument('--classifier', required=True, type=str, - help='Path to training dataset in LibSVM format.') - parser.add_argument('--testset', required=True, type=str, - help='Path to the SVMLight test file') - parser.add_argument('--output', required=True, type=str, - help='File path to write predicted labels') - return parser.parse_args() - - -def main(classifier, testset, output): - LOGGER.info("SVM Multiclass evaluation") - - svm = MulticlassLibSVM() - serialized_classifier = SerializableHdf5File(classifier, 'r') - with closing(serialized_classifier): - svm.load_serializable(serialized_classifier) - - test_feats, test_labels = get_features_and_labels(LibSVMFile(testset)) - predicted_labels = svm.apply(test_feats) - - with open(output, 'w') as f: - for cls in predicted_labels.get_labels(): - f.write("%s\n" % int(cls)) - - LOGGER.info("Predicted labels saved in: '%s'" % output) - - -if __name__ == '__main__': - args = parse_arguments() - main(args.classifier, args.testset, args.output) - diff --git a/applications/classification/random_fourier_classification.cpp b/applications/classification/random_fourier_classification.cpp deleted file mode 100644 index bcd085407c7..00000000000 --- a/applications/classification/random_fourier_classification.cpp +++ /dev/null @@ -1,175 +0,0 @@ -/* - * This software is distributed under BSD 3-clause license (see LICENSE file). - * - * Authors: Björn Esser, Evangelos Anagnostopoulos - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -using namespace shogun; - -const char* filepath = 0; -const char* testpath = 0; -int32_t D = 300; -float64_t C = 0.1; -float64_t epsilon = 0.01; -float64_t width = 8; -int32_t correct_dimension = -1; - -SGSparseMatrix load_data(const char* filepath, float64_t*& label_vec) -{ - FILE* data_file = fopen(filepath, "r"); - SGSparseMatrix sparse_data; - - CLibSVMFile* file_reader = new CLibSVMFile(data_file); - file_reader->get_sparse_matrix(sparse_data.sparse_matrix, sparse_data.num_features, sparse_data.num_vectors, - label_vec); - - if (correct_dimension!=-1) - sparse_data.num_features = correct_dimension; - - SG_UNREF(file_reader); - - return sparse_data; -} - -void print_help_message() -{ - SG_SPRINT("Usage : ./rf_classify --dataset path_to_data [--testset path_to_test_data] [-D number_of_samples]\n"); - SG_SPRINT(" [-C C_for_SVM] [--epsilon SVM_epsilon] [--width gaussian_kernel_width] [--dimension feature_dimension]\n"); - SG_SPRINT("\nPerforms binary classification on provided data using Random Fourier features with a linear SVM solver,\n"); - SG_SPRINT("namely SVMOcas.\nParameter explanation :\n"); - SG_SPRINT("\ndataset : Path to data in LibSVM format. Required."); - SG_SPRINT("\ntestset : Path to test data in LibSVM format. Optional."); - SG_SPRINT("\nD : Number of samples for the Random Fourier features. Default value = 300"); - SG_SPRINT("\nC : SVM parameter C. Default value = 0.1"); - SG_SPRINT("\nepsilon : SVM epsilon. Default value = 0.01"); - SG_SPRINT("\nwidth : Gaussian Kernel width parameter. Default value = 8"); - SG_SPRINT("\ndimension : Correct feature dimension. Optional\n"); -} - -void parse_arguments(int argv, char** argc) -{ - if (argv%2!=1) - { - print_help_message(); - exit_shogun(); - exit(0); - } - - for (index_t i=1; i sparse_data = load_data(filepath, label_vec); - SGVector label(label_vec, sparse_data.num_vectors); - - - /** Creating features */ - CBinaryLabels* labels = new CBinaryLabels(label); - SG_REF(labels); - - CSparseFeatures* s_feats = new CSparseFeatures(sparse_data); - SGVector params(1); - params[0] = width; - CRandomFourierDotFeatures* r_feats = new CRandomFourierDotFeatures( - s_feats, D, KernelName::GAUSSIAN, params); - - - /** Training */ - CLibLinear* svm = new CLibLinear(C, r_feats, labels); - //CSVMOcas* svm = new CSVMOcas(C, r_feats, labels); - svm->set_epsilon(epsilon); - SG_SPRINT("Starting training\n"); - CTime* timer = new CTime(); - svm->train(); - float64_t secs = timer->cur_runtime_diff_sec(); - timer->stop(); - SG_UNREF(timer); - SG_SPRINT("Training completed, took %fs\n", secs); - /** Training completed */ - - /** Evaluating */ - CBinaryLabels* predicted = CLabelsFactory::to_binary(svm->apply()); - CPRCEvaluation* prc_evaluator = new CPRCEvaluation(); - CROCEvaluation* roc_evaluator = new CROCEvaluation(); - CAccuracyMeasure* accuracy_evaluator = new CAccuracyMeasure(); - - float64_t auROC = roc_evaluator->evaluate(predicted, labels); - float64_t auPRC = prc_evaluator->evaluate(predicted, labels); - float32_t accuracy = accuracy_evaluator->evaluate(predicted, labels); - SG_SPRINT("Training auPRC=%f, auROC=%f, accuracy=%f ( Incorrectly predicted=%f% )\n", auPRC, auROC, - accuracy, (1-accuracy) * 100); - - SG_UNREF(predicted); - SGMatrix w = r_feats->get_random_coefficients(); - svm->set_features(NULL); - - if (testpath!=0) - { - sparse_data = load_data(testpath, label_vec); - label = SGVector(label_vec, sparse_data.num_vectors); - - s_feats = new CSparseFeatures(sparse_data); - r_feats = new CRandomFourierDotFeatures(s_feats, D, KernelName::GAUSSIAN, width, w); - CBinaryLabels* test_labels = new CBinaryLabels(label); - - predicted = CLabelsFactory::to_binary(svm->apply(r_feats)); - auROC = roc_evaluator->evaluate(predicted, test_labels); - auPRC = prc_evaluator->evaluate(predicted, test_labels); - accuracy = accuracy_evaluator->evaluate(predicted, test_labels); - SG_SPRINT("Test auPRC=%f, auROC=%f, accuracy=%f ( Incorrectly predicted=%f% )\n", auPRC, auROC, - accuracy, (1-accuracy) * 100); - SG_UNREF(predicted); - SG_UNREF(test_labels); - - } - SG_UNREF(prc_evaluator); - SG_UNREF(roc_evaluator); - SG_UNREF(accuracy_evaluator); - SG_UNREF(svm); - SG_UNREF(labels); - exit_shogun(); -} diff --git a/applications/classification/random_fourier_classification.py b/applications/classification/random_fourier_classification.py deleted file mode 100644 index 07350cde384..00000000000 --- a/applications/classification/random_fourier_classification.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env python - -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# Written (W) 2013 Evangelos Anagnostopoulos -# - -def parse_arguments(): - import argparse - parser = argparse.ArgumentParser(description= - "Solve binary classification problems stored in libsvm format, " - "using Random Fourier features and SVMOcas") - parser.add_argument('--dataset', required=True, type=str, - help='Path to training dataset in LibSVM format.') - parser.add_argument('--testset', type=str, - help='Path to test dataset in LibSVM format.') - parser.add_argument('-D', default=300, type=int, - help='The number of samples to use') - parser.add_argument('-C', default=0.1, type=float, - help='SVMOcas regularization constant') - parser.add_argument('--epsilon', default=0.01, type=float, - help='SVMOcas epsilon parameter') - parser.add_argument('--width', default=8, type=float, - help='Width of the Gaussian Kernel to approximate') - parser.add_argument('--dimension', type=int, - help='Dimension of input dataset') - - return parser.parse_args() - -def evaluate(predicted_labels, labels, prefix="Results"): - from shogun import PRCEvaluation, ROCEvaluation, AccuracyMeasure - - prc_evaluator = PRCEvaluation() - roc_evaluator = ROCEvaluation() - acc_evaluator = AccuracyMeasure() - - auPRC = prc_evaluator.evaluate(predicted_labels, labels) - auROC = roc_evaluator.evaluate(predicted_labels, labels) - acc = acc_evaluator.evaluate(predicted_labels, labels) - - print ('{0}: auPRC = {1:.5f}, auROC = {2:.5f}, acc = {3:.5f} '+ - '({4}% incorrectly classified)').format( - prefix, auPRC, auROC, acc, (1-acc)*100) - -def load_sparse_data(filename, dimension=None): - input_file = LibSVMFile(args.dataset) - sparse_feats = SparseRealFeatures() - label_array = sparse_feats.load_with_labels(input_file) - labels = BinaryLabels(label_array) - - if dimension!=None: - sparse_feats.set_num_features(dimension) - - return {'data':sparse_feats, 'labels':labels} - -if __name__=='__main__': - from shogun import SparseRealFeatures, RandomFourierDotFeatures, GAUSSIAN - from shogun import LibSVMFile, BinaryLabels, SVMOcas - from shogun import Time - from numpy import array - - args = parse_arguments() - - print 'Loading training data...' - sparse_data = load_sparse_data(args.dataset,args.dimension) - - kernel_params = array([args.width], dtype=float) - rf_feats = RandomFourierDotFeatures(sparse_data['data'], args.D, GAUSSIAN, - kernel_params) - - svm = SVMOcas(args.C, rf_feats, sparse_data['labels']) - svm.set_epsilon(args.epsilon) - print 'Starting training.' - timer = Time() - svm.train() - timer.stop() - print 'Training completed, took {0:.2f}s.'.format(timer.time_diff_sec()) - - predicted_labels = svm.apply() - evaluate(predicted_labels, sparse_data['labels'], 'Training results') - - if args.testset!=None: - random_coef = rf_feats.get_random_coefficients() - # removing current dataset from memory in order to load the test dataset, - # to avoid running out of memory - rf_feats = None - svm.set_features(None) - svm.set_labels(None) - sparse_data = None - - print 'Loading test data...' - sparse_data = load_sparse_data(args.testset, args.dimension) - rf_feats = RandomFourierDotFeatures(sparse_data['data'], args.D, GAUSSIAN, - kernel_params, random_coef) - predicted_labels = svm.apply(rf_feats) - evaluate(predicted_labels, sparse_data['labels'], 'Test results') diff --git a/applications/classification/train_multiclass_svm.py b/applications/classification/train_multiclass_svm.py deleted file mode 100644 index 39e17d2619c..00000000000 --- a/applications/classification/train_multiclass_svm.py +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) The Shogun Machine Learning Toolbox -# Written (w) 2014 Daniel Pyrathon -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# The views and conclusions contained in the software and documentation are those -# of the authors and should not be interpreted as representing official policies, -# either expressed or implied, of the Shogun Development Team. - - -import argparse -import logging -from contextlib import contextmanager, closing -from shogun import (LibSVMFile, GaussianKernel, MulticlassLibSVM, - SerializableHdf5File, LinearKernel) -from utils import get_features_and_labels, track_execution - -LOGGER = logging.getLogger(__file__) - -KERNELS = { - 'linear': lambda feats, width: LinearKernel(feats, feats), - 'gaussian': lambda feats, width: GaussianKernel(feats, feats, width), -} - -def parse_arguments(): - parser = argparse.ArgumentParser(description="Train a multiclass SVM \ - stored in libsvm format") - parser.add_argument('--dataset', required=True, type=str, - help='Path to training dataset in LibSVM format.') - parser.add_argument('--capacity', default=1.0, type=float, - help='SVM capacity parameter') - parser.add_argument('--width', default=2.1, type=float, - help='Width of the Gaussian Kernel to approximate') - parser.add_argument('--epsilon', default=0.01, type=float, - help='SVMOcas epsilon parameter') - parser.add_argument('--kernel', type=str, default='linear', - choices=['linear', 'gaussian'], - help='Optionally specify a kernel type. \ - Only Linear or Gaussian') - parser.add_argument('--output', required=True, type=str, - help='Destination path for the output serialized \ - classifier') - return parser.parse_args() - - -def main(dataset, output, epsilon, capacity, width, kernel_type): - - LOGGER.info("SVM Multiclass classifier") - LOGGER.info("Epsilon: %s" % epsilon) - LOGGER.info("Capacity: %s" % capacity) - LOGGER.info("Gaussian width: %s" % width) - - # Get features - feats, labels = get_features_and_labels(LibSVMFile(dataset)) - - # Create kernel - try: - kernel = KERNELS[kernel_type](feats, width) - except KeyError: - LOGGER.error("Kernel %s not available. try Gaussian or Linear" % kernel_type) - - # Initialize and train Multiclass SVM - svm = MulticlassLibSVM(capacity, kernel, labels) - svm.set_epsilon(epsilon) - with track_execution(): - svm.train() - - # Serialize to file - writable_file = SerializableHdf5File(output, 'w') - with closing(writable_file): - svm.save_serializable(writable_file) - LOGGER.info("Serialized classifier saved in: '%s'" % output) - - -if __name__ == '__main__': - args = parse_arguments() - main(args.dataset, args.output, args.epsilon, args.capacity, args.width, args.kernel) diff --git a/applications/classification/utils.py b/applications/classification/utils.py deleted file mode 100644 index e459c39e97e..00000000000 --- a/applications/classification/utils.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) The Shogun Machine Learning Toolbox -# Written (w) 2014 Daniel Pyrathon -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# The views and conclusions contained in the software and documentation are those -# of the authors and should not be interpreted as representing official policies, -# either expressed or implied, of the Shogun Development Team. - -import logging -from contextlib import contextmanager -from shogun import MulticlassLabels, SparseRealFeatures, Time - - -logging.basicConfig(level=logging.INFO, format='[%(asctime)-15s %(module)s] %(message)s') -LOGGER = logging.getLogger(__file__) - -def get_features_and_labels(input_file): - feats = SparseRealFeatures() - label_array = feats.load_with_labels(input_file) - labels = MulticlassLabels(label_array) - return feats, labels - -@contextmanager -def track_execution(): - LOGGER.info('Starting training.') - timer = Time() - yield - timer.stop() - LOGGER.info('Training completed, took {0:.2f}s.'.format(timer.time_diff_sec())) diff --git a/applications/easysvm/AUTHORS b/applications/easysvm/AUTHORS deleted file mode 100644 index 7c0e1b4662c..00000000000 --- a/applications/easysvm/AUTHORS +++ /dev/null @@ -1,3 +0,0 @@ -Cheng Soon Ong -Gunnar Raetsch -Sebastian Schultheiss diff --git a/applications/easysvm/LICENSE b/applications/easysvm/LICENSE deleted file mode 100644 index 94a9ed024d3..00000000000 --- a/applications/easysvm/LICENSE +++ /dev/null @@ -1,674 +0,0 @@ - GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. - - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. - - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. - - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. - - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. - - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. - - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. - - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. - - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. - - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. - - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. - - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. - - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. - - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - Copyright (C) - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -. diff --git a/applications/easysvm/MANIFEST.in b/applications/easysvm/MANIFEST.in deleted file mode 100644 index 9c0f080546d..00000000000 --- a/applications/easysvm/MANIFEST.in +++ /dev/null @@ -1,7 +0,0 @@ -include AUTHORS -include distutils-help.txt -include LICENSE -include README -recursive-include dependencies * -recursive-include galaxy * -recursive-include data *.csv *.fa diff --git a/applications/easysvm/README b/applications/easysvm/README deleted file mode 100644 index b24ec868c44..00000000000 --- a/applications/easysvm/README +++ /dev/null @@ -1,142 +0,0 @@ -.. This document is written in reStructuredText. -.. Build command: - rst2html.py --date --time README README.html - -============================================= - easysvm - A front end to the shogun toolbox -============================================= - -.. contents:: - -Introduction -============ - -This is a demo corresponding to the PLoS tutorial -"Support Vector Machines for Sequence Analysis". It is also meant as a -user "quick start" to using shogun (http://www.shogun-toolbox.org). - - -Installation -============ - -Install -------- - -For a global install, for which you need root permissions - - python setup.py install - -For a local install - - python setup.py install --prefix=$HOME - -See distutils-help.txt for more details. - -Dependencies ------------- - -- `numpy`_ (>=1.0.1) -- `pylab`_ (>=0.87.7) [optional] -- `shogun`_ (>=0.7.3) -- `arff`_ [optional] - -.. _numpy: http://numpy.scipy.org/ -.. _pylab: http://matplotlib.sourceforge.net/ -.. _shogun: http://www.shogun-toolbox.org/ -.. _arff: http://www.mit.edu/~sav/arff/ - - -Usage -===== - -The results in the paper were produced by tutorial_example.py. Execute -it in the data directory:: - - cd data - python ../splicesites/tutorial_example.py - -Galaxy interface ----------------- - -The following command line arguments are what is behind the galaxy -interface, which is available as a web service from -http://galaxy.fml.tuebingen.mpg.de/ - -There are three types of data creation methods:: - - datagen.py motif arff gattaca 10 50 10-15 0.1 tttt 100 50 15 0.1 testmotif1.arff - datagen.py cloud 100 3 0.6 1.3 testcloud1.arff - datagen.py motif arff gattaca 100 50 10-15 0.1 tttt 1000 50 15 0.1 testmotif2.arff - datagen.py cloud 1000 3 0.6 1.3 testcloud2.arff - - datagen.py motif fasta gattaca 10 50 10-15 0.1 testmotifpos.fasta - datagen.py motif fasta tttt 100 50 15 0.1 testmotifneg.fasta - datagen.py motif fasta gattaca 100 50 10-15 0.1 tm1.fasta - datagen.py motif fasta tttt 1000 50 15 0.1 tm2.fasta - -Clean up:: - - cat tm1.fasta tm2.fasta > testmotiftest.fasta - rm tm1.fasta tm2.fasta - - -Cross validation and evaluation on a independent validation set:: - - easysvm.py cv 5 10 gauss 0.6 arff testcloud1.arff cv_cloud.txt - easysvm.py eval cv_cloud.txt arff testcloud1.arff cv_cloud_eval.txt roc roc_cloud_cv.png - easysvm.py cv 5 10 wd 10 2 arff testmotif1.arff cv_motif.txt - easysvm.py eval cv_motif.txt arff testmotif1.arff cv_motif_eval.txt roc roc_motif_cv.png - -Predict on a test set:: - - easysvm.py pred 10 gauss 0.6 arff testcloud1.arff testcloud2.arff pred_cloud.txt - easysvm.py pred 10 linear arff testcloud1.arff testcloud2.arff pred_cloud.txt - easysvm.py pred 10 poly 3 true true arff testcloud1.arff testcloud2.arff pred_cloud.txt - - easysvm.py pred 10 wd 10 2 arff testmotif1.arff testmotif2.arff pred_motif.txt - easysvm.py pred 10 localalign arff testmotif1.arff testmotif2.arff pred_motif.txt - easysvm.py pred 10 localimprove 10 1 1 arff testmotif1.arff testmotif2.arff pred_motif.txt - -For some kernels, investigate the importance of different motives:: - - easysvm.py poim 10 6 wd 10 2 arff testmotif1.arff poims.png - -We also support the fasta format:: - - easysvm.py cv 5 10 wd 10 2 fasta testmotifpos.fasta testmotifneg.fasta cv_motif.txt - easysvm.py eval cv_motif.txt fasta testmotifpos.fasta testmotifneg.fasta cv_motif_eval.txt roc roc_motif_cv.png - easysvm.py pred 10 wd 10 2 fasta testmotifpos.fasta testmotifneg.fasta testmotiftest.fasta pred_motif.txt - easysvm.py poim 10 6 wd 10 2 fasta testmotifpos.fasta testmotifneg.fasta poims.png - -Clean up:: - - rm testmotif1.arff testmotif2.arff testcloud1.arff testcloud2.arff - rm cv_cloud.txt roc_cloud_cv.png cv_motif.txt roc_motif_cv.png - rm pred_cloud.txt pred_motif.txt poims.png - rm testmotifpos.fasta testmotifneg.fasta testmotiftest.fasta - rm cv_cloud_eval.txt cv_motif_eval.txt - - -License -======= - -GPLv3_ - -.. _GPLv3: http://gplv3.fsf.org/ - -All programs in this collection are free software: -you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . - -Copyright 2008 Cheng Soon Ong and Gunnar Raetsch - diff --git a/applications/easysvm/data b/applications/easysvm/data deleted file mode 120000 index f792f7b7b32..00000000000 --- a/applications/easysvm/data +++ /dev/null @@ -1 +0,0 @@ -../../data/easysvm \ No newline at end of file diff --git a/applications/easysvm/distutils-help.txt b/applications/easysvm/distutils-help.txt deleted file mode 100644 index 348384620c1..00000000000 --- a/applications/easysvm/distutils-help.txt +++ /dev/null @@ -1,73 +0,0 @@ -A quick guide to distribution of python code. How to install and how -to build a module or package in python using distutils (setup.py) - -Installation: -------------- - -$ python setup.py install -This builds and installs in the default python -site-packages location. If you are not root, then the other two -versions allow you to specify a root directory. There is a tiny -difference between --home and --prefix. - -$ python setup.py install --home=$HOME -installs the package into $HOME/lib/python/package-name - -$ python setup.py install --prefix=$HOME -installs the package into -$HOME/lib/python2.5/site-packages/package-name -(assuming you are using python2.5, which I recommend) - -I suggest using -$ python setup.py install --prefix=$HOME -because then you can just add -export PYTHONPATH=$HOME/lib/python2.5/site-packages/package-name:$PYTHONPATH -to your .bashrc and you can just install all your downloaded python -packages that use distutils in the same way. Also, it allows you to -keep the packages for python2.3 and python2.4 separate (which --home -doesn't). - -$ python setup.py build -builds the software without installing it. This can also be used by -the developer instead of a makefile/configure type script to compile -his/her own code. - - -Packaging your code for distribution: -------------------------------------- - -You have to basically write a setup.py file that tells python -distutils where things are. distutils is smart enough to figure out -which compiler to use for C/C++/python. It even knows when to invoke -SWIG. The following instructions are from the distutils documentation. - -If all you want to do is distribute a module called foo, contained in -a file foo.py, then your setup script can be as simple as this: - -from distutils.core import setup -setup(name='foo', - version='1.0', - py_modules=['foo'], - ) - -Some observations: -* most information that you supply to the Distutils is supplied as -keyword arguments to the setup() function -* those keyword arguments fall into two categories: package metadata -(name, version number) and information about what's in the package (a -list of pure Python modules, in this case) -* modules are specified by module name, not filename (the same will -hold true for packages and extensions) -* it's recommended that you supply a little more metadata, in -particular your name, email address and a URL for the project. - -To create a source distribution for this module, you would create a -setup script, setup.py, containing the above code, and run: -$ python setup.py sdist -which will create an archive file (e.g., tarball on Unix, ZIP file on -Windows) containing your setup script setup.py, and your module -foo.py. The archive file will be named foo-1.0.tar.gz (or .zip), and -will unpack into a directory foo-1.0. - -For more information, look at the documentation for "Installing Python -Modules" and "Distributing Python Modules". diff --git a/applications/easysvm/esvm/__init__.py b/applications/easysvm/esvm/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/applications/easysvm/esvm/datafuncs.py b/applications/easysvm/esvm/datafuncs.py deleted file mode 100644 index 5e84d8feac0..00000000000 --- a/applications/easysvm/esvm/datafuncs.py +++ /dev/null @@ -1,162 +0,0 @@ -""" -This module contains code for generating toy examples -""" - -############################################################################################# -# # -# This program is free software; you can redistribute it and/or modify # -# it under the terms of the GNU General Public License as published by # -# the Free Software Foundation; either version 3 of the License, or # -# (at your option) any later version. # -# # -# This program is distributed in the hope that it will be useful, # -# but WITHOUT ANY WARRANTY; without even the implied warranty of # -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # -# GNU General Public License for more details. # -# # -# You should have received a copy of the GNU General Public License # -# along with this program; if not, see http://www.gnu.org/licenses # -# or write to the Free Software Foundation, Inc., 51 Franklin Street, # -# Fifth Floor, Boston, MA 02110-1301 USA # -# # -############################################################################################# - -import sys -import parse - -import random -from numpy.random import randn -from numpy import ones, concatenate, array, transpose -from esvm.mldata import DatasetFileFASTA, init_datasetfile -from esvm.mldata_arff import DatasetFileARFF - -class MotifDataDef(object): - motif = '' - numseq = 0 - seqlenmin = 0 - seqlenmax = 0 - posstart = 0 - posend = 0 - mutrate = 0.0 - -################################################################################ -# data generation functions - -def motifgen(motif, numseq, seqlenmin, seqlenmax, posstart, posend, mutrate): - """Generate sequences with a particular motif at a particular location. - Also allow a possible mutation rate of the motif. - """ - - metadata = 'motifgen(%s,%d,%d,%d,%d,%d,%1.2f)' % (motif, numseq, seqlenmin, seqlenmax, posstart, posend, mutrate) - - acgt='acgt' - seqlist = [] - for i in xrange(0,numseq): - str=[] ; - seqlen=random.randint(seqlenmin,seqlenmax) ; - for l in xrange(0,seqlen): - str.append(acgt[random.randint(0,3)]) - pos=random.randint(posstart,posend) ; - for l in xrange(0,len(motif)): - if (random.random()>=mutrate) and (pos+l=0): - str[pos+l]=motif[l] - seqlist.append(''.join(str).upper()) - - return metadata, seqlist - - -def cloudgen(numpoint, numfeat, fracpos, width): - """Generate two Gaussian point clouds, centered around one and minus one.""" - - numpos = int(round(fracpos*numpoint)) - numneg = numpoint - numpos - - metadata = 'cloudgen(%d,%d,%d,%3.2f)' % (numpos, numneg, numfeat, width) - - datapos = ones((numfeat, numpos)) + width*randn(numfeat, numpos) - dataneg = -ones((numfeat, numneg)) + width*randn(numfeat, numneg) - pointcloud = concatenate((datapos,dataneg),axis=1) - labels = concatenate((ones(numpos),-ones(numneg))) - - return metadata, pointcloud, labels - - - - - -################################################################################ -# ARFF functions - -def arffwrite_real(filename, numpoint, numfeat, fracpos=0.5, width=1.0): - """Write an ARFF file containing a vectorial dataset""" - #import arff - - (metadata, pointcloud, labels) = cloudgen(numpoint, numfeat, fracpos, width) - - fp = init_datasetfile(filename,'vec') - fp.comment = metadata - fp.dataname = 'pointcloud' - fp.writelines(pointcloud,labels) - - -def arffwrite_sequence(filename,p, n): - """Write an ARFF file containing a sequence dataset""" - #import arff - - (metadatapos,seqlistpos) = motifgen(p.motif, p.numseq, p.seqlenmin, p.seqlenmax, p.posstart, p.posend, p.mutrate) - (metadataneg,seqlistneg) = motifgen(n.motif, n.numseq, n.seqlenmin, n.seqlenmax, n.posstart, n.posend, n.mutrate) - - labels = concatenate((ones(len(seqlistpos)),-ones(len(seqlistneg)))) - seqlist = seqlistpos + seqlistneg - fp = init_datasetfile(filename,'seq') - fp.comment = metadatapos+' '+metadataneg - fp.dataname = 'motif' - fp.writelines(seqlist,labels) - - - -def arffread(kernelname,datafilename): - """Decide based on kernelname whether to read a sequence or vectorial file""" - - if kernelname == 'gauss' or kernelname == 'linear' or kernelname == 'poly' or kernelname == None: - fp = init_datasetfile(datafilename,'vec') - elif kernelname == 'wd' or kernelname == 'localalign' or kernelname == 'localimprove'\ - or kernelname == 'spec' or kernelname == 'cumspec': - fp = init_datasetfile(datafilename,'seq') - elif kernelname == 'spec2' or kernelname == 'cumspec2': - fp = init_datasetfile(datafilename,'mseq') - else: - print 'Unknown kernel in arffread' - - return fp.readlines() - -################################################################################ -# fasta functions - -def fastawrite_sequence(filename,p): - """Write a FASTA file containing a sequence dataset""" - import arff - - (metadata,seqlist) = motifgen(p.motif, p.numseq, p.seqlenmin, p.seqlenmax, p.posstart, p.posend, p.mutrate) - labels = ones(len(seqlist)) - fp = init_datasetfile(filename,'seq') - fp.writelines(seqlist,labels) - -def fastaread(fnamepos,fnameneg=None): - """Read two fasta files, the first positive, the second negative""" - fpos = init_datasetfile(fnamepos,'seq') - (fa1,lab1) = fpos.readlines() - - if fnameneg is not None: - fneg = init_datasetfile(fnameneg,'seq') - (fa2,lab2) = fneg.readlines() - - print 'positive: %d, negative %d' % (len(fa1),len(fa2)) - all_labels = concatenate((ones(len(fa1)),-ones(len(fa2)))) - all_examples = fa1 + fa2 - else: - all_examples = fa1 - all_labels = ones(len(fa1)) - - return all_examples, all_labels - diff --git a/applications/easysvm/esvm/experiment.py b/applications/easysvm/esvm/experiment.py deleted file mode 100644 index ed4c3a79e46..00000000000 --- a/applications/easysvm/esvm/experiment.py +++ /dev/null @@ -1,773 +0,0 @@ -############################################################################################# -# # -# This program is free software; you can redistribute it and/or modify # -# it under the terms of the GNU General Public License as published by # -# the Free Software Foundation; either version 3 of the License, or # -# (at your option) any later version. # -# # -# This program is distributed in the hope that it will be useful, # -# but WITHOUT ANY WARRANTY; without even the implied warranty of # -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # -# GNU General Public License for more details. # -# # -# You should have received a copy of the GNU General Public License # -# along with this program; if not, see http://www.gnu.org/licenses # -# or write to the Free Software Foundation, Inc., 51 Franklin Street, # -# Fifth Floor, Boston, MA 02110-1301 USA # -# # -############################################################################################# - -import sys -import random - -import shutil -import numpy -from numpy import sign, where, array, ones -import parse -import utils -from poim import compute_poims - -import shogun -from shogun import GaussianKernel, WeightedDegreePositionStringKernel -from shogun import WeightedDegreeStringKernel -from shogun import LinearKernel, PolyKernel, LocalAlignmentStringKernel -from shogun import LocalityImprovedStringKernel -from shogun import CommWordStringKernel, WeightedCommWordStringKernel, CommUlongStringKernel -from shogun import CombinedKernel -from shogun import SLOWBUTMEMEFFICIENT -from shogun import AvgDiagKernelNormalizer -from shogun import RealFeatures, Labels, StringCharFeatures, DNA, StringWordFeatures, StringUlongFeatures, PROTEIN -from shogun import CombinedFeatures -from shogun import LibSVM,GPBTSVM - -DefaultSVM = LibSVM -try: - from shogun import SVMLight - LinAddSVM = SVMLight - LinearSVM = SVMLight -except: - LinAddSVM = GPBTSVM - LinearSVM = LibSVM - -from shogun import SortWordString, SortUlongString - -from utils import calcprc, calcroc, accuracy -from utils import getPartitionedSet, getCurrentSplit -import plots -import re -from poim import reshape_normalize_contribs, compute_weight_mass - -################################################################################ -def non_atcg_convert(seq, nuc_con): - """ Converts Non ATCG characters from DNA sequence """ - - if nuc_con == '':sys.stderr.write("usage: Provide a choice for non ACGT nucleotide conversion [T|A|C|G|R|Y|N] at last\n");sys.exit(-1) - if re.match(r'[^ATCGRYN]', nuc_con):sys.stderr.write("usage: Conversion nucleotide choice -"+ nuc_con +"- failed. pick one from [T|A|C|G|R|Y|N]\n");sys.exit(-1) - - nuc_con = nuc_con.upper() - mod_seq = [] - for i in range(len(seq)): - if re.search(r'[^ACTG]', seq[i], re.IGNORECASE): - if nuc_con == 'A' or nuc_con == 'T' or nuc_con == 'C' or nuc_con == 'G': - seq[i] = re.sub(r'[^ATCG|actg]', nuc_con, seq[i]) - seq[i] = seq[i].upper() - mod_seq.append(seq[i]) - continue - if nuc_con == 'N':(nucleotide, line) = ('ATCG', '') - if nuc_con == 'R':(nucleotide, line) = ('AG', '') - if nuc_con == 'Y':(nucleotide, line) = ('TC', '') - - for single_nuc in seq[i]: - if re.match(r'[^ACGT]', single_nuc, re.IGNORECASE): - line += random.choice(nucleotide) - else: - line += single_nuc.upper() - mod_seq.append(line) - else: - seq[i] = seq[i].upper() - mod_seq.append(seq[i]) - return mod_seq - -def non_aminoacid_converter(seq, amino_con): - """ Converts Non amino acid characters from protein sequence """ - - if amino_con == '':sys.stderr.write("usage: Provide a choice for replacing non amino acid characters\n");sys.exit(-1) - flag = 0 - if len(amino_con)>1: - if amino_con != 'random':flag = 1 - else: - if re.match(r'[^GPAVLIMCFYWHKRQNEDST]', amino_con, re.IGNORECASE):flag = 1 - if flag == 1:sys.stderr.write("usage: Replace aminoacid chioce -"+ amino_con +"- failed. Pick a valid aminoacid single letter code/random\n");sys.exit(-1) - - amino_con = amino_con.upper() - opt_seq = [] - for i in range(len(seq)): - if re.search(r'[^GPAVLIMCFYWHKRQNEDST]', seq[i], re.IGNORECASE): - if amino_con == 'RANDOM': - aminoacid = 'GPAVLIMCFYWHKRQNEDST' - line = '' - for single_amino in seq[i]: - if re.match(r'[^GPAVLIMCFYWHKRQNEDST]', single_amino, re.IGNORECASE): - r_amino = random.choice(aminoacid) - line += r_amino - else: - single_amino = single_amino.upper() - line += single_amino - opt_seq.append(line) - else: - seq[i] = re.sub(r'[^GPAVLIMCFYWHKRQNEDST|gpavlimcfywhkrqnedst]', amino_con, seq[i]) - seq[i] = seq[i].upper() - opt_seq.append(seq[i]) - else: - seq[i] = seq[i].upper() - opt_seq.append(seq[i]) - return opt_seq -# helper functions - -def create_features(kname, examples, kparam, train_mode, preproc, seq_source, nuc_con): - """Converts numpy arrays or sequences into shogun features""" - - if kname == 'gauss' or kname == 'linear' or kname == 'poly': - examples = numpy.array(examples) - feats = RealFeatures(examples) - - elif kname == 'wd' or kname == 'localalign' or kname == 'localimprove': - if seq_source == 'dna': - examples = non_atcg_convert(examples, nuc_con) - feats = StringCharFeatures(examples, DNA) - elif seq_source == 'protein': - examples = non_aminoacid_converter(examples, nuc_con) - feats = StringCharFeatures(examples, PROTEIN) - else: - sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n") - sys.exit(-1) - - elif kname == 'spec' or kname == 'cumspec': - if seq_source == 'dna': - examples = non_atcg_convert(examples, nuc_con) - feats = StringCharFeatures(examples, DNA) - elif seq_source == 'protein': - examples = non_aminoacid_converter(examples, nuc_con) - feats = StringCharFeatures(examples, PROTEIN) - else: - sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n") - sys.exit(-1) - - wf = StringUlongFeatures( feats.get_alphabet() ) - wf.obtain_from_char(feats, kparam['degree']-1, kparam['degree'], 0, kname=='cumspec') - del feats - - if train_mode: - preproc = SortUlongString() - preproc.init(wf) - wf.add_preprocessor(preproc) - ret = wf.apply_preprocessor() - #assert(ret) - - feats = wf - elif kname == 'spec2' or kname == 'cumspec2': - # spectrum kernel on two sequences - feats = {} - feats['combined'] = CombinedFeatures() - - reversed = kname=='cumspec2' - - (ex0,ex1) = zip(*examples) - - f0 = StringCharFeatures(list(ex0), DNA) - wf = StringWordFeatures(f0.get_alphabet()) - wf.obtain_from_char(f0, kparam['degree']-1, kparam['degree'], 0, reversed) - del f0 - - if train_mode: - preproc = SortWordString() - preproc.init(wf) - wf.add_preprocessor(preproc) - ret = wf.apply_preprocessor() - assert(ret) - feats['combined'].append_feature_obj(wf) - feats['f0'] = wf - - f1 = StringCharFeatures(list(ex1), DNA) - wf = StringWordFeatures( f1.get_alphabet() ) - wf.obtain_from_char(f1, kparam['degree']-1, kparam['degree'], 0, reversed) - del f1 - - if train_mode: - preproc = SortWordString() - preproc.init(wf) - wf.add_preprocessor(preproc) - ret = wf.apply_preprocessor() - assert(ret) - feats['combined'].append_feature_obj(wf) - feats['f1'] = wf - - else: - print 'Unknown kernel %s' % kname - - return (feats,preproc) - -def create_kernel(kname,kparam,feats_train): - """Call the corresponding constructor for the kernel""" - - if kname == 'gauss': - kernel = GaussianKernel(feats_train, feats_train, kparam['width']) - elif kname == 'linear': - kernel = LinearKernel(feats_train, feats_train) - kernel.set_normalizer(AvgDiagKernelNormalizer(kparam['scale'])) - elif kname == 'poly': - kernel = PolyKernel(feats_train, feats_train, kparam['degree'], kparam['inhomogene'], kparam['normal']) - elif kname == 'wd': - kernel=WeightedDegreePositionStringKernel(feats_train, feats_train, kparam['degree']) - kernel.set_normalizer(AvgDiagKernelNormalizer(float(kparam['seqlength']))) - kernel.set_shifts(kparam['shift']*numpy.ones(kparam['seqlength'],dtype=numpy.int32)) - #kernel=WeightedDegreeStringKernel(feats_train, feats_train, kparam['degree']) - elif kname == 'spec': - kernel = CommUlongStringKernel(feats_train, feats_train) - elif kname == 'cumspec': - kernel = WeightedCommWordStringKernel(feats_train, feats_train) - kernel.set_weights(numpy.ones(kparam['degree'])) - elif kname == 'spec2': - kernel = CombinedKernel() - k0 = CommWordStringKernel(feats_train['f0'], feats_train['f0']) - k0.io.disable_progress() - kernel.append_kernel(k0) - k1 = CommWordStringKernel(feats_train['f1'], feats_train['f1']) - k1.io.disable_progress() - kernel.append_kernel(k1) - elif kname == 'cumspec2': - kernel = CombinedKernel() - k0 = WeightedCommWordStringKernel(feats_train['f0'], feats_train['f0']) - k0.set_weights(numpy.ones(kparam['degree'])) - k0.io.disable_progress() - kernel.append_kernel(k0) - k1 = WeightedCommWordStringKernel(feats_train['f1'], feats_train['f1']) - k1.set_weights(numpy.ones(kparam['degree'])) - k1.io.disable_progress() - kernel.append_kernel(k1) - elif kname == 'localalign': - kernel = LocalAlignmentStringKernel(feats_train, feats_train) - elif kname == 'localimprove': - kernel = LocalityImprovedStringKernel(feats_train, feats_train, kparam['length'],\ - kparam['indeg'], kparam['outdeg']) - else: - print 'Unknown kernel %s' % kname - - kernel.set_cache_size(32) - return kernel - -def create_combined_kernel(kname, kparam, examples, train_mode, preproc): - """A wrapper for creating combined kernels. - - kname, kparam and examples are lists. - - """ - num_kernels = len(kname) - feats['combined'] = CombinedFeatures() - kernel = CombinedKernel() - - for kix in xrange(num_kernels): - cur_kname = '%s%d' % (kname[kix],kix) - (cur_feats, cur_preproc) = create_features(kname[kix], examples[kix], kparam[kix], train_mode, preproc) - feats[cur_kname] = cur_feats - cur_kernel = create_kernel(kname[kix], kparam[kix], cur_feats) - kernel.append_kernel(cur_kernel) - - return (feats,kernel) - -def model2str(kparam,C,kp,shownames=True): - """Generates a string describing the model parameters""" - - if kparam["modelsel_name"]==None or len(kparam["modelsel_params"])==1: - if shownames: - str="\tC=%1.1f" % C - else: - str="\t%1.2f" % C - else: - if type(kp)==type(int(0)): - if shownames: - str="\tC=%1.1f\t%s=%i" %(C, kparam["modelsel_name"], kp) - else: - str="\t%1.1f\t%i" %(C, kp) - else: - if shownames: - str="\tC=%1.1f\t%s=%1.2f" %(C, kparam["modelsel_name"], kp) - else: - str="\t%1.1f\t%1.2f" %(C, kp) - return str - - - -def train(trainex,trainlab,C,kname,kparam,seq_source,nuc_con): - """Trains a SVM with the given kernel""" - - (feats_train, preproc) = create_features(kname,trainex, kparam, True, None, seq_source, nuc_con) - - if kname == 'wd': - kparam['seqlength'] = len(trainex[0]) - kernel = create_kernel(kname,kparam,feats_train) - - if kname == 'spec2' or kname == 'cumspec2': - kernel.init(feats_train['combined'], feats_train['combined']) - else: - kernel.init(feats_train, feats_train) - kernel.io.disable_progress() - kernel.set_optimization_type(SLOWBUTMEMEFFICIENT) - labels = BinaryLabels(numpy.array(trainlab,numpy.double)) - - # libsvm is fine for most kernels - if kname in ('wd', 'spec', 'cumspec', 'spec2', 'cumspec2'): - # for the string kernels there exist specific optimizations that are only effective when using - # a LinAdd SVM implementation (e.g. SVM-light or GPBT-SVM) - SVMClass = LinAddSVM - elif kname == 'linear': - SVMClass = LinearSVM - else: - SVMClass=DefaultSVM - - svm = SVMClass(C, kernel, labels) - - svm.io.disable_progress() - svm.set_batch_computation_enabled(True) - svm.set_linadd_enabled(True) - svm.set_epsilon(1e-5) - svm.parallel.set_num_threads(svm.parallel.get_num_cpus()) - - svm.train() - - return (svm, kernel, feats_train, preproc) - -def train_and_test(trainex,trainlab,testex,C,kname,kparam, seq_source, nuc_con): - """Trains a SVM with the given kernel, and predict on the test examples""" - - (svm, kernel, feats_train, preproc) = train(trainex,trainlab,C,kname,kparam,seq_source,nuc_con) - (feats_test, preproc) = create_features(kname, testex, kparam, False, preproc, seq_source, nuc_con) - if kname == 'spec2' or kname == 'cumspec2': - for feats in feats_train.values(): - feats.io.disable_progress() - for feats in feats_test.values(): - feats.io.disable_progress() - kernel.init(feats_train['combined'], feats_test['combined']) - else: - feats_train.io.disable_progress() - feats_test.io.disable_progress() - kernel.init(feats_train, feats_test) - - kernel.set_optimization_type(SLOWBUTMEMEFFICIENT) - output = svm.apply().get_labels() - - return output - -def crossvalidation(cv, kname, kparam, C, all_examples, all_labels, seq_source, nuc_con): - """Perform cross validation using an SVM - - cv -- the number of folds - kernel -- the kernel used - data -- the dataset, assumed to be compatible to kernel, label is in the first column - - """ - print 'Using %i-fold crossvalidation' % cv - partitions = getPartitionedSet(len(all_labels), cv) - error = [] - sum_accuracy = 0.0 - sum_roc = 0.0 - all_outputs=[0.0] * len(all_labels) - all_split=[-1] * len(all_labels) - - for repetition in xrange(cv): - XT, LT, XTE, LTE = getCurrentSplit(repetition, partitions, all_labels, all_examples) - numpos = len(where(array(LTE)>0)[0]) - svmout = train_and_test(XT, LT, XTE, C, kname, kparam, seq_source, nuc_con) - - for i in xrange(len(svmout)): - all_outputs[partitions[repetition][i]] = svmout[i] - all_split[partitions[repetition][i]] = repetition ; - - return (all_outputs, all_split) - -def evaluate(predictions, splitassignments, labels, roc_fname=None, prc_fname=None): - """Evaluate prediction results - """ - - res_str = "" - - cv = 1 - if splitassignments!=None: - for split in splitassignments: - if split+1>cv: - cv=int(split+1) - if cv>1: - res_str = "Evaluating on %i examples in %i splits\n" % (len(labels),cv) - else: - res_str = "Evaluating on %i examples\n" % len(labels) - - output_splits = cv* [[]] - label_splits = cv* [[]] - for i in xrange(cv): - label_splits[i]=[] - output_splits[i]=[] - - for i in xrange(0,len(labels)): - if cv>1: - split=int(splitassignments[i]) - else: - split=0 - output_splits[split].append(predictions[i]) - label_splits[split].append(labels[i]) - - error = [] - sum_accuracy = 0.0 - sum_roc = 0.0 - sum_prc = 0.0 - - for split in xrange(cv): - res_str += 'Split %d\n' % (split+1) - - LTE = label_splits[split] ; - svmout = output_splits[split] - - numpos=0 - for l in LTE: - if l==1: - numpos+=1 - istwoclass = numpos>0 and numpos3:sys.stderr.write("Too many arguments\n");sys.exit(-1) - seq_source = argv_rest[1] - nuc_con = argv_rest[2] - - if kernelname == 'linear' or kernelname == 'gauss' or kernelname == 'poly': - if len(argv_rest)<1:sys.stderr.write("outputfile misssing\n");sys.exit(-1) - if len(argv_rest)>1:sys.stderr.write("Too many arguments\n");sys.exit(-1) - outfilename = argv_rest[0] - - utils.check_params(kparam, C, len(examples[0])) - - # run cross-validation - (all_outputs, all_split) = crossvalidation(cv, kernelname, kparam, C, examples, labels, seq_source, nuc_con) - try: - f = open(outfilename, 'w+') - except: - sys.stderr.write('Fails to open the outputfile at ' + outfilename + ' Cannot continue.\n') - sys.exit(-1) - res_str = '#example\toutput\tsplit\n' - f.write(res_str) - for ix in xrange(len(all_outputs)): - res_str = '%d\t%2.7f\t%d\n' % (ix,all_outputs[ix],all_split[ix]) - f.write(res_str) - f.close() - -def svm_modelsel(argv): - """A top level script to parse input parameters and run model selection""" - - assert(argv[1]=='modelsel') - if len(argv)<5:sys.stderr.write("usage: %s modelsel repeat Cs kernelname [kernelparameters] [arff|fasta] inputfiles outputfile [dna|protein] non(nucleotide|amino)converter\n" % argv[0]);sys.exit(-1) - - # parse input parameters - cv = int(argv[2]) - Cs = parse.parse_float_list(argv[3]) - (kernelname,kparam,argv_rest) = parse.parse_kernel_param(argv[4:], True) - (examples,labels,argv_rest) = parse.parse_input_file_train(kernelname, argv_rest) - - (seq_source, nuc_con) = ('', '') - if kernelname == 'spec' or kernelname == 'wd': - if len(argv_rest)<1:sys.stderr.write("outputfile [dna|protein] non(nucleotide|amino)converter are missing\n");sys.exit(-1) - if len(argv_rest)<2:sys.stderr.write("[dna|protein] non(nucleotide|amino)converter are missing\n");sys.exit(-1) - if len(argv_rest)<3: - if argv_rest[-1] == 'dna': - sys.stderr.write("non-nucleotide converter like [A|T|C|G|R|Y|N] is missing. Cannot continue.\n") - sys.exit(-1) - elif argv_rest[-1] == 'protein': - sys.stderr.write("non-amino acid converter like [G|P|A|V|L|I|M|C|F|Y|W|H|K|R|Q|N|E|D|S|T|random] is missing. Cannot continue.\n") - sys.exit(-1) - else: - sys.stderr.write("Here expect FASTA sequence type as [dna|protein] instead of -"+ argv_rest[-1] +"- Cannot continue.\n") - sys.exit(-1) - if len(argv_rest)>3:sys.stderr.write("Too many arguments\n");sys.exit(-1) - seq_source = argv_rest[1] - nuc_con = argv_rest[2] - - if kernelname == 'linear' or kernelname == 'gauss' or kernelname== 'poly': - if len(argv_rest)<1:sys.stderr.write("outputfile missing\n");sys.exit(-1) - if len(argv_rest)>1:sys.stderr.write("Too many arguments\n");sys.exit(-1) - - outfilename = argv_rest[0] - - # run cross-validation - mean_rocs=[] ; - mean_prcs=[] ; - mean_accs=[] ; - all_Cs = [] ; - all_kparam=[] ; - - if kparam["modelsel_name"]==None: - for C in Cs: - utils.check_params(kparam, C, len(examples[0])) - - (all_outputs, all_split) = crossvalidation(cv, kernelname, kparam, C, examples, labels, seq_source, nuc_con) - (res_str, mean_roc, mean_prc, mean_acc) = evaluate(all_outputs, all_split, labels) - mean_rocs.append(mean_roc) - mean_prcs.append(mean_prc) - mean_accs.append(mean_acc) - all_Cs.append(C) - all_kparam.append(None) - else: # also optimize one kernel parameter - for C in Cs: - for kp in kparam["modelsel_params"]: - kparam[kparam["modelsel_name"]] = kp - utils.check_params(kparam, C, len(examples[0])) - - (all_outputs, all_split) = crossvalidation(cv, kernelname, kparam, C, examples, labels, seq_source, nuc_con) - (res_str, mean_roc, mean_prc, mean_acc) = evaluate(all_outputs, all_split, labels) - mean_rocs.append(mean_roc) - mean_prcs.append(mean_prc) - mean_accs.append(mean_acc) - all_Cs.append(C) - all_kparam.append(kp) - - max_roc=numpy.max(numpy.array(mean_rocs)) - max_prc=numpy.max(numpy.array(mean_prcs)) - max_acc=numpy.max(numpy.array(mean_accs)) - try: - f = open(outfilename, 'w+') - except: - sys.stderr.write('Fails to open the outputfile at ' + outfilename + ' Cannot continue.\n') - sys.exit(-1) - if kparam["modelsel_name"]==None or len(kparam["modelsel_params"])==1: - detail_str = "\tC\tROC\tPRC\tAccuracy (at threshold 0)\n" - else: - detail_str = "\tC\t%s\tROC\tPRC\tAccuracy (at threshold 0)\n" % kparam["modelsel_name"] - - best_roc_str='' - best_prc_str='' - best_acc_str='' - for i in xrange(len(all_Cs)): - # determine the best parameter combinations - if mean_rocs[i]==max_roc: - rocsym='+' - best_roc_str+=model2str(kparam, all_Cs[i], all_kparam[i])+'\n' - else: - rocsym=' ' - if mean_prcs[i]==max_prc: - prcsym='+' - best_prc_str+=model2str(kparam, all_Cs[i], all_kparam[i])+'\n' - else: - prcsym=' ' - if mean_accs[i]==max_acc: - accsym='+' - best_acc_str+=model2str(kparam, all_Cs[i], all_kparam[i])+'\n' - else: - accsym=' ' - - detail_str+=model2str(kparam, all_Cs[i], all_kparam[i], False)+'\t' - if kparam["modelsel_name"]==None or len(kparam["modelsel_params"])==1: - detail_str += '%c%2.1f%%\t%c%2.1f%%\t%c%2.1f%%\n' % (rocsym, 100*mean_rocs[i], prcsym, 100*mean_prcs[i], accsym, 100*mean_accs[i]) - else: - detail_str += '%c%2.1f%%\t%c%2.1f%%\t%c%2.1f%%\n' % (rocsym, 100*mean_rocs[i], prcsym, 100*mean_prcs[i], accsym, 100*mean_accs[i]) - - f.write('Best model(s) according to ROC measure:\n%s' % best_roc_str) - f.write('\nBest model(s) according to PRC measure:\n%s' % best_prc_str) - f.write('\nBest model(s) according to accuracy measure:\n%s' % best_acc_str) - - f.write('\nDetailed results:\n') - f.write(detail_str) - f.close() - -def svm_pred(argv): - """A top level script to parse input parameters and train and predict""" - - assert(argv[1]=='pred') - if len(argv)<6:sys.stderr.write("usage: %s pred C kernelname kernelparameters [arff|fasta] inputfiles outputfile [dna|protein] non(nucleotide|amino)converter\n" % argv[0]);sys.exit(-1) - - # parse input parameters - C = float(argv[2]) - (kernelname,kparam,argv_rest) = parse.parse_kernel_param(argv[3:],False) - (trainex, trainlab, testex, argv_rest) = parse.parse_input_file_train_test(kernelname, argv_rest) - - (seq_source, nuc_con) = ('', '') - if kernelname == 'spec' or kernelname == 'wd': - if len(argv_rest)<1:sys.stderr.write("outputfile [dna|protein] non(nucleotide|amino)converter are missing\n");sys.exit(-1) - if len(argv_rest)<2:sys.stderr.write("[dna|protein] non(nucleotide|amino)converter are missing\n");sys.exit(-1) - if len(argv_rest)<3: - if argv_rest[-1] == 'dna': - sys.stderr.write("non-nucleotide converter like [A|T|C|G|R|Y|N] is missing. Cannot continue.\n") - sys.exit(-1) - elif argv_rest[-1] == 'protein': - sys.stderr.write("non-amino acid converter like [G|P|A|V|L|I|M|C|F|Y|W|H|K|R|Q|N|E|D|S|T|random] is missing. Cannot continue.\n") - sys.exit(-1) - else: - sys.stderr.write("Here expect FASTA sequence type as [dna|protein] instead of -"+ argv_rest[-1] +"- Cannot continue.\n") - sys.exit(-1) - if len(argv_rest)>3:sys.stderr.write("Too many arguments\n");sys.exit(-1) - seq_source = argv_rest[1] - nuc_con = argv_rest[2] - - if kernelname == 'linear' or kernelname== 'poly' or kernelname == 'gauss': - if len(argv_rest)<1:sys.stderr.write("outputfile missing\n");sys.exit(-1) - if len(argv_rest)>1:sys.stderr.write("Too many arguments\n");sys.exit(-1) - - outfilename = argv_rest[0] - - utils.check_params(kparam, C, len(trainex[0])) - - # run training and testing - svmout = train_and_test(trainex, trainlab, testex, C, kernelname, kparam, seq_source, nuc_con) - - # write output file - try: - f = open(outfilename,'w') - except: - sys.stderr.write('Fails to open the outputfile at ' + outfilename + ' Cannot continue.\n') - sys.exit(-1) - - res_str = '#example\toutput\n' - f.write(res_str) - for ix in xrange(len(svmout)): - res_str = str(ix)+'\t'+str(svmout[ix])+'\n' - f.write(res_str) - f.close() - -def svm_eval(argv): - """A top level script to parse input parameters and evaluate""" - - assert(argv[1]=='eval') - if len(argv)<6:sys.stderr.write("usage: %s eval predictionfile [arff|fasta] inputfiles outputfile [roc|prc figure.png]\n" % argv[0]);sys.exit(-1) - - # parse input parameters - (predictions, splitassignments) = parse.parse_prediction_file(argv[2]) - (trainex, trainlab, argv_rest) = parse.parse_input_file_train(None, argv[3:]) - if len(argv_rest)<1:sys.stderr.write("Output file missing\n");sys.exit(-1) - if len(argv_rest)>3:sys.stderr.write("Too many arguments\n");sys.exit(-1) - outfilename = argv_rest[0] - roc_fname = None - prc_fname = None - - if len(argv_rest)>2: - if argv_rest[1]=='roc': - roc_fname=argv_rest[2] - elif argv_rest[1]=='prc': - prc_fname=argv_rest[2] - else: - sys.stderr.write('Usage: [roc|prc]') - sys.exit(-1) - - # run training and testing - (res_str,mean_roc,mean_prc,mean_acc) = evaluate(predictions, splitassignments, trainlab, roc_fname, prc_fname) - - # write output file - try: - f = open(outfilename,'w') - except: - sys.stderr.write('Fails to open the outputfile at ' + outfilename + ' Cannot continue.\n') - sys.exit(-1) - - f.write(res_str) - f.close() - - -def svm_poim(argv): - """A top level script to parse input parameters and plot poims""" - - assert(argv[1]=='poim') - if len(argv)<7:sys.stderr.write("usage: %s poim C poimdegree wd [kernelparameters] [arff|fasta] inputfiles poim.png [dna|protein] non(nucleotide|amino)converter\n" % argv[0]);sys.exit(-1) - - # parse input parameters - C = float(argv[2]) - poimdegree = int(argv[3]) - (kernelname,kparam,argv_rest) = parse.parse_kernel_param(argv[4:], False) - (examples,labels,argv_rest) = parse.parse_input_file_train(kernelname, argv_rest) - - if len(argv_rest)<1:sys.stderr.write("poim.png [dna|protein] non(nucleotide|amino)converter are missing\n");sys.exit(-1) - if len(argv_rest)<2:sys.stderr.write("[dna|protein] non(nucleotide|amino)converter are missing\n");sys.exit(-1) - if len(argv_rest)<3: - if argv_rest[-1] == 'dna': - sys.stderr.write("non-nucleotide converter like [A|T|C|G|R|Y|N] is missing. Cannot continue.\n") - sys.exit(-1) - elif argv_rest[-1] == 'protein': - sys.stderr.write("non-amino acid converter like [G|P|A|V|L|I|M|C|F|Y|W|H|K|R|Q|N|E|D|S|T|random] is missing. Cannot continue.\n") - sys.exit(-1) - else: - sys.stderr.write("Here expect FASTA sequence type as [dna|protein] instead of -"+ argv_rest[-1] +"- Cannot continue.\n") - sys.exit(-1) - if len(argv_rest)>3:sys.stderr.write("Too many arguments\n");sys.exit(-1) - poimfilename = argv_rest[0] - seq_source = argv_rest[1] - nuc_con = argv_rest[2] - - utils.check_params(kparam, C, len(examples[0])) - - # train svm and compute POIMs - (svm, kernel, feats_train, preproc) = train(examples,labels,C,kernelname,kparam,seq_source,nuc_con) - (poim, max_poim, diff_poim, poim_totalmass) = compute_poims(svm, kernel, poimdegree, len(examples[0])) - - # plot poims - plots.plot_poims(poimfilename, poim, max_poim, diff_poim, poim_totalmass, poimdegree, len(examples[0])) - diff --git a/applications/easysvm/esvm/mldata.py b/applications/easysvm/esvm/mldata.py deleted file mode 100644 index 0e75a1ddde6..00000000000 --- a/applications/easysvm/esvm/mldata.py +++ /dev/null @@ -1,300 +0,0 @@ -#!/usr/bin/env python - -"""Classes to encapsulate the idea of a dataset in machine learning, - including file access. Currently this focuses on reading and writing - transparently to different file formats. - - A dataset is modeled as an (example,label) tuple, each of which is an array. - The base class doesn't know how to split, so just returns one array. - - The three classes currently implemented use three - different ways of iterating through files: - - CSV uses the python module csv's iterator - - ARFF always reads the whole file, and does a slice - - FASTA uses a hand crafted while loop that behaves like a generator - - The class DatasetFileARFF is in mldata-arff.py. -""" - - -############################################################################################# -# # -# This program is free software; you can redistribute it and/or modify # -# it under the terms of the GNU General Public License as published by # -# the Free Software Foundation; either version 3 of the License, or # -# (at your option) any later version. # -# # -# This program is distributed in the hope that it will be useful, # -# but WITHOUT ANY WARRANTY; without even the implied warranty of # -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # -# GNU General Public License for more details. # -# # -# You should have received a copy of the GNU General Public License # -# along with this program; if not, see http://www.gnu.org/licenses # -# or write to the Free Software Foundation, Inc., 51 Franklin Street, # -# Fifth Floor, Boston, MA 02110-1301 USA # -# # -############################################################################################# - -import sys -from numpy import array, concatenate -import csv -import re - -try: - import arff - have_arff = True -except ImportError: - have_arff = False - - -class DatasetFileBase(file): - """A Base class defining barebones and common behaviour - """ - - def __init__(self,filename,extype): - """Just the normal file __init__, - followed by the specific class corresponding to the file extension. - - """ - self.extype = extype - self.filename = filename - - - def readlines(self,idx=None): - """Read the lines defined by idx (a numpy array). - Default is read all lines. - - """ - if idx is None: - data = self.readlines() - else: - data = self.readlines()[idx] - #itertools.islice(open('tempx.txt'), 11, 12).next() - #file("filename").readlines()[11] - #linecache.getline( filename, lineno[, module_globals]) - return data - - def writelines(self,data,idx=None): - """Write the lines defined by idx (a numpy array). - Default is write all lines. - - data is assumed to be a numpy array. - - """ - if idx is None: - self.writelines(data) - else: - self.writelines(data[idx]) - - - -class DatasetFileCSV(DatasetFileBase): - """Comma Seperated Values file. - - Labels are in the first column. - - """ - def __init__(self,filename,extype): - DatasetFileBase.__init__(self,filename,extype) - - def readlines(self,idx=None): - """Read from file and split data into examples and labels""" - reader = csv.reader(open(self.filename,'r'), delimiter=',', quoting=csv.QUOTE_NONE) - labels = [] - examples = [] - for ix,line in enumerate(reader): - if idx is None or ix in idx: - labels.append(float(line[0])) - if self.extype == 'vec': - examples.append(array(map(float,line[1:]))) - elif self.extype == 'seq': - examples.append(line[1:][0]) - elif self.extype == 'mseq': - examples.append(array(line[1:])) - - if self.extype == 'vec': - examples = array(examples).T - print '%d features, %d examples' % examples.shape - elif self.extype == 'seq': - print 'sequence length = %d, %d examples' % (len(examples[0]),len(examples)) - elif self.extype == 'mseq': - printstr = 'sequence lengths = ' - for seq in examples[0]: - printstr += '%d, ' % len(seq) - printstr += '%d examples' % len(examples) - print printstr - - return (examples,array(labels)) - - - def writelines(self,examples,labels,idx=None): - """Merge the examples and labels and write to file""" - if idx==None: - idx = range(len(labels)) - if self.extype == 'seq': - data = zip(labels[idx],list(array(examples)[idx])) - if self.extype == 'mseq': - data = [] - for ix,curlab in enumerate(labels): - data.append([curlab]+list(examples[ix])) - elif self.extype == 'vec': - data = [] - for ix,curlab in enumerate(labels): - data.append(concatenate((array([curlab]),examples[:,ix].T))) - - fp = open(self.filename,'w') - writer = csv.writer(fp,delimiter=',',quoting=csv.QUOTE_NONE) - for ix in idx: - writer.writerow(data[ix]) - fp.close() - - - - -class DatasetFileFASTA(DatasetFileBase): - """Fasta format file, labels are in the comment after keyword 'label'. - label=1 - label=-1 - - """ - def __init__(self,filename,extype): - if extype != 'seq': - print 'Can only write fasta file for sequences!' - raise IOError - DatasetFileBase.__init__(self,filename,extype) - self.fp = None - - def readlines(self,idx=None): - """Read from file and split data into examples and labels""" - self.fp = open(self.filename,'r') - line = self.fp.readline() - - examples = [] - labels = [] - ix = 0 - while True: - if not line : break - (ex,lab,line) = self.readline(line) - if idx is None or ix in idx: - examples.append(ex) - labels.append(lab) - ix += 1 - - print 'sequence length = %d, %d examples' % (len(examples[0]),len(examples)) - return (examples,array(labels)) - - def writelines(self,examples,labels,idx=None,linelen=60): - """Write the examples and labels and write to file""" - if idx==None: - idx = range(len(labels)) - - fp = open(self.filename,'w') - for ix in idx: - fp.write('> %d label=%d\n'%(ix,round(labels[ix]))) - for lineidx in xrange(0, len(examples[ix]), linelen): - fp.write(examples[ix][lineidx:lineidx+linelen] + '\n') - fp.close() - - - def readline(self,line): - """Reads a fasta entry and returns the label and the sequence""" - if line[0] == '' : return - - assert(line[0] == '>') - # Use list comprehension to get the integer that comes after label= - a = line.split() - label = float([b.split('=')[1] for b in a if b.split('=')[0]=='label'][0]) - - lines = [] - line = self.fp.readline() - while True: - if not line : break - if line[0] == ">": break - #Remove trailing whitespace, and any internal spaces - lines.append(line.rstrip().replace(" ","")) - line = self.fp.readline() - - return (''.join(lines),label,line) - - -def init_datasetfile(filename,extype): - """A factory that returns the appropriate class based on the file extension. - - recognised file extensions - - .csv : Comma Separated Values - - .arff : Attribute-Relation File Format (weka) - - .fa : Fasta file format (seq only) - - .fasta: same as above. - - Since the file type does not determine what type of data is actually being used, - the user has to supply the example type. - - extype can be ('vec','seq','mseq') - vec - array of floats - seq - single sequence - mseq - multiple sequences - - """ - allowedtypes = ('vec','seq','mseq') - assert(extype in allowedtypes) - # map the file extensions to the relevant classes - _format2dataset = {'csv' : DatasetFileCSV, - 'fa' : DatasetFileFASTA, - 'fasta' : DatasetFileFASTA, - } - if have_arff: - from esvm.mldata_arff import DatasetFileARFF - _format2dataset['arff'] = DatasetFileARFF - - extension = detect_extension(filename) - return _format2dataset[extension](filename,extype) - - -def detect_extension(filename): - """Get the file extension""" - if filename.count('.') > 1: - print 'WARNING: %s has more than one . using last one' % filename - detect_ext = filename.split('.')[-1] - if have_arff: - known_ext = ('csv','arff','fasta','fa') - else: - known_ext = ('csv','fasta','fa') - - if detect_ext not in known_ext: - print 'WARNING: %s is an unknown file extension, defaulting to csv' % detect_ext - detect_ext = 'csv' - - if detect_ext == 'csv': - fasta_flag = 0 - arff_flag = 0 - run_c = 0 - f = open(filename,'r') - for line in f: - line = line.strip() - if re.match(r'^>',line): - fasta_flag = 1 - break - if re.match(r'^@',line): - arff_flag = 1 - break - if run_c == 5: - break - f.close() - if fasta_flag == 1: - detect_ext = 'fasta' - elif arff_flag == 1: - detect_ext = 'arff' - else: - detect_ext = 'csv' - - return detect_ext - - -def convert(infile,outfile,extype): - """Copy data from infile to outfile, possibly converting the file format.""" - fp1 = init_datasetfile(infile,extype) - (examples,labels) = fp1.readlines() - fp2 = init_datasetfile(outfile,extype) - fp2.writelines(examples,labels) - diff --git a/applications/easysvm/esvm/mldata_arff.py b/applications/easysvm/esvm/mldata_arff.py deleted file mode 100644 index ce92a560357..00000000000 --- a/applications/easysvm/esvm/mldata_arff.py +++ /dev/null @@ -1,115 +0,0 @@ -#!/usr/bin/env python - -"""Classes to encapsulate the idea of a dataset in machine learning, - including file access. - - This file contains the ARFF class for people who have arff installed. -""" - - -############################################################################################# -# # -# This program is free software; you can redistribute it and/or modify # -# it under the terms of the GNU General Public License as published by # -# the Free Software Foundation; either version 3 of the License, or # -# (at your option) any later version. # -# # -# This program is distributed in the hope that it will be useful, # -# but WITHOUT ANY WARRANTY; without even the implied warranty of # -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # -# GNU General Public License for more details. # -# # -# You should have received a copy of the GNU General Public License # -# along with this program; if not, see http://www.gnu.org/licenses # -# or write to the Free Software Foundation, Inc., 51 Franklin Street, # -# Fifth Floor, Boston, MA 02110-1301 USA # -# # -############################################################################################# - -try: - import arff - have_arff = True -except ImportError: - have_arff = False - - -import sys -from numpy import array, concatenate -import csv -from esvm.mldata import DatasetFileBase - -class DatasetFileARFF(DatasetFileBase): - """Attribute-Relation File Format file, uses module arff. - - Labels are in the first column. - """ - def __init__(self,filename,extype,dataname='ARFFdata',comment=''): - """Do the base class init, then add some arff specific metadata""" - if not have_arff: - print 'import arff failed, currently cannot support ARFF file format' - return - DatasetFileBase.__init__(self,filename,extype) - self.dataname = dataname - self.comment = comment - - def readlines(self,idx=None): - """Read from file and split data into examples and labels""" - fp = open(self.filename,'r') - (dataname,issparse,alist,data) = arff.arffread(fp) - fp.close() - self.dataname = dataname - - #if (alist[0][0]!='label'): - # sys.stderr.write('First column of ARFF file needs to be the label\n') - # sys.exit(-1) - - if idx is None: - idx = range(len(data)) - - labels = [data[ix][0] for ix in idx] - labels = array(labels) - if self.extype == 'vec': - examples = [data[ix][1:] for ix in idx] - examples = array(examples).T - print '%d features, %d examples' % examples.shape - elif self.extype == 'seq': - examples = [data[ix][1] for ix in idx] - print 'sequence length = %d, %d examples' % (len(examples[0]),len(examples)) - elif self.extype == 'mseq': - examples = [data[ix][1:] for ix in idx] - printstr = 'sequence lengths = ' - for seq in examples[0]: - printstr += '%d, ' % len(seq) - printstr += '%d examples' % len(examples) - print printstr - - return (examples, labels) - - def writelines(self,examples,labels,idx=None): - """Merge the examples and labels and write to file""" - alist = [('label',1,[])] - - if idx is not None: - examples = examples[idx] - labels = labels[idx] - - if self.extype == 'vec': - data = list(concatenate((labels.reshape(len(labels),1),examples.T),axis=1)) - for ix in xrange(examples.shape[0]): - attname = 'att%d' % ix - alist.append((attname,1,[])) - elif self.extype == 'seq': - data = zip(labels,examples) - alist.append(('sequence',0,[])) - elif self.extype == 'mseq': - data = [] - for ix,curlab in enumerate(labels): - data.append([curlab]+list(examples[ix])) - alist.append(('upstream sequence',0,[])) - alist.append(('downstream sequence',0,[])) - - fp = open(self.filename,'w') - arff.arffwrite(fp,alist,data,name=self.dataname,comment=self.comment) - fp.close() - - diff --git a/applications/easysvm/esvm/parse.py b/applications/easysvm/esvm/parse.py deleted file mode 100644 index 34823255001..00000000000 --- a/applications/easysvm/esvm/parse.py +++ /dev/null @@ -1,232 +0,0 @@ -""" -This module contains code to parse the input arguments to the command line: -- easysvm.py -- datagen.py -""" - -############################################################################################# -# # -# This program is free software; you can redistribute it and/or modify # -# it under the terms of the GNU General Public License as published by # -# the Free Software Foundation; either version 3 of the License, or # -# (at your option) any later version. # -# # -# This program is distributed in the hope that it will be useful, # -# but WITHOUT ANY WARRANTY; without even the implied warranty of # -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # -# GNU General Public License for more details. # -# # -# You should have received a copy of the GNU General Public License # -# along with this program; if not, see http://www.gnu.org/licenses # -# or write to the Free Software Foundation, Inc., 51 Franklin Street, # -# Fifth Floor, Boston, MA 02110-1301 USA # -# # -############################################################################################# - -import datafuncs -import sys - -################################################################################ -# basic types - -def parse_range(str): - list=str.split("-") - - if len(list)==1: - return (int(list[0]), int(list[0])) - if len(list)==2: - return (int(list[0]), int(list[1])) - sys.stderr.write("Cannot parse range '%s'\n" %str) - sys.exit(-1) - - -def parse_float_list(str): - list=str.split(",") - float_list=[] ; - for elem in list: - float_list.append(float(elem)) - return float_list - - -def parse_int_list(str): - list=str.split(",") - int_list=[] ; - for elem in list: - int_list.append(int(elem)) - return int_list - - -################################################################################ -# input files - -def parse_input_file_train(kernelname, argv): - """Parse the input and output file names""" - - if len(argv)<2 or (argv[0]=="fasta" and len(argv)<3) or (argv[0]!='fasta' and argv[0]!='arff'): - sys.stderr.write("data usage: arff \n or: fasta \n") - sys.exit(-1) - - if argv[0] == 'fasta': - datafilenamepos = argv[1] - datafilenameneg = argv[2] - (examples, labels) = datafuncs.fastaread(datafilenamepos, datafilenameneg) - argv_rest=argv[3:] - elif argv[0] == 'arff': - datafilename = argv[1] - (examples, labels) = datafuncs.arffread(kernelname, datafilename) - argv_rest=argv[2:] - else: - print 'Error in parse_input_file' - - return (examples,labels,argv_rest) - - -def parse_input_file_train_test(kernelname, argv): - """Parse the input and output file names""" - - if len(argv)<3 or (argv[0]=="fasta" and len(argv)<4) or (argv[0]!='fasta' and argv[0]!='arff'): - sys.stderr.write("data usage: arff \n or: fasta \n") - sys.exit(-1) - - if argv[0] == 'fasta': - datafilenamepos = argv[1] - datafilenameneg = argv[2] - datafilenametest = argv[3] - (trainex, trainlab) = datafuncs.fastaread(datafilenamepos, datafilenameneg) - (testex, testlab) = datafuncs.fastaread(datafilenametest) - argv_rest=argv[4:] - elif argv[0] == 'arff': - datafilename = argv[1] - datafilenametest = argv[2] - (trainex, trainlab) = datafuncs.arffread(kernelname, datafilename) - (testex, testlab) = datafuncs.arffread(kernelname, datafilenametest) - argv_rest=argv[3:] - else: - print 'Error in parse_input_file' - - return (trainex,trainlab,testex,argv_rest) - -################################################################################ -# prediction file - -def parse_prediction_file(fname): - outputs=[] - splitassignments=[] - - f = open(fname) - str=f.read() - lines = str.split('\n') - num=0 - for line in lines: - if len(line)>0 and line[0] != '#': - elems=line.split('\t') - assert(len(elems)>1) - assert(int(elems[0]) == num) - num+=1 - if len(elems)==2: - outputs.append(float(elems[1])) - else: - assert(len(elems)==3) - outputs.append(float(elems[1])) - splitassignments.append(float(elems[2])) - f.close() - if len(splitassignments)==0: - splitassignments = None - - return (outputs, splitassignments) - -################################################################################ -# kernel parameters - -def parse_kernel_param(argv, allow_modelsel_params): - """Parse the arguments for a particular kernel""" - - if len(argv)<1: - sys.stderr.write("kernel usage: []\n") - sys.exit(-1) - - kernelname = argv[0] - kparam = {} - kparam["name"]=kernelname - kparam["modelsel_name"]=None - kparam["modelsel_params"]=None - - if kernelname == 'gauss': - if len(argv)<2: - sys.stderr.write("kernel usage: gauss \n") - sys.exit(-1) - if allow_modelsel_params: - kparam['width'] = None - kparam["modelsel_name"]="width" - kparam["modelsel_params"]=parse_float_list(argv[1]) - else: - kparam['width'] = float(argv[1]) - argv_rest=argv[2:] - elif kernelname == 'linear': - kparam['scale']=1 - # no parameters - argv_rest=argv[1:] - elif kernelname == 'poly': - if len(argv)<4: - sys.stderr.write("kernel usage: poly \n") - sys.exit(-1) - if allow_modelsel_params: - kparam['degree'] = None - kparam["modelsel_name"]="degree" - kparam["modelsel_params"]=parse_int_list(argv[1]) - else: - kparam['degree'] = int(argv[1]) - kparam['inhomogene'] = (argv[2] == 'true') - kparam['normal'] = (argv[3] == 'true') - argv_rest=argv[4:] - elif kernelname == 'wd': - if len(argv)<3: - sys.stderr.write("kernel usage: wd \n") - sys.exit(-1) - if allow_modelsel_params: - kparam['degree'] = None - kparam["modelsel_name"]="degree" - kparam["modelsel_params"]=parse_int_list(argv[1]) - else: - kparam['degree'] = int(argv[1]) - if allow_modelsel_params and len(kparam["modelsel_params"])==1: - kparam['degree'] = kparam["modelsel_params"][0] - kparam['shift'] = None - kparam["modelsel_name"] = "shift" - kparam["modelsel_params"]=parse_int_list(argv[2]) - else: - kparam['shift'] = int(argv[2]) - argv_rest=argv[3:] - elif kernelname == 'spec': - if len(argv)<2: - sys.stderr.write("kernel usage: spec \n") - sys.exit(-1) - if allow_modelsel_params: - kparam['degree'] = None - kparam["modelsel_name"]="degree" - kparam["modelsel_params"]=parse_int_list(argv[1]) - else: - kparam['degree'] = int(argv[1]) - argv_rest=argv[2:] - elif kernelname == 'localalign': - # no parameters - argv_rest=argv[1:] - elif kernelname == 'localimprove': - if len(argv)<4: - sys.stderr.write("kernel usage: localimprove \n") - sys.exit(-1) - kparam['length'] = int(argv[1]) - if allow_modelsel_params: - kparam['width'] = None - kparam["modelsel_name"]="indeg" - kparam["modelsel_params"]=parse_int_list(argv[2]) - else: - kparam['indeg'] = int(argv[2]) - kparam['outdeg'] = int(argv[3]) - argv_rest=argv[4:] - else: - sys.stderr.write( 'Unknown kernel name %s in parse_kernel_param\n' % kernelname ) - sys.exit(-1) - - return kernelname,kparam,argv_rest - diff --git a/applications/easysvm/esvm/plots.py b/applications/easysvm/esvm/plots.py deleted file mode 100644 index 98e7322e234..00000000000 --- a/applications/easysvm/esvm/plots.py +++ /dev/null @@ -1,226 +0,0 @@ -""" -This module contains code for commonly used plots -""" - -############################################################################################# -# # -# This program is free software; you can redistribute it and/or modify # -# it under the terms of the GNU General Public License as published by # -# the Free Software Foundation; either version 3 of the License, or # -# (at your option) any later version. # -# # -# This program is distributed in the hope that it will be useful, # -# but WITHOUT ANY WARRANTY; without even the implied warranty of # -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # -# GNU General Public License for more details. # -# # -# You should have received a copy of the GNU General Public License # -# along with this program; if not, see http://www.gnu.org/licenses # -# or write to the Free Software Foundation, Inc., 51 Franklin Street, # -# Fifth Floor, Boston, MA 02110-1301 USA # -# # -############################################################################################# - -import sys -import random -import numpy -import warnings -import shutil - -from shogun import Labels -from shogun import * - -def plotroc(output, LTE, draw_random=False, figure_fname="", roc_label='ROC'): - """Plot the receiver operating characteristic curve""" - import pylab - import matplotlib - - pylab.figure(1,dpi=150,figsize=(4,4)) - fontdict=dict(family="cursive",weight="bold",size=7,y=1.05) ; - - pm=PerformanceMeasures(Labels(numpy.array(LTE)), Labels(numpy.array(output))) - - points=pm.get_ROC() - points=numpy.array(points).T # for pylab.plot - pylab.plot(points[0], points[1], 'b-', label=roc_label) - if draw_random: - pylab.plot([0, 1], [0, 1], 'r-', label='random guessing') - pylab.axis([0, 1, 0, 1]) - ticks=numpy.arange(0., 1., .1, dtype=numpy.float64) - pylab.xticks(ticks,size=10) - pylab.yticks(ticks,size=10) - pylab.xlabel('1 - specificity (false positive rate)',size=10) - pylab.ylabel('sensitivity (true positive rate)',size=10) - pylab.legend(loc='lower right', prop = matplotlib.font_manager.FontProperties('tiny')) - - if figure_fname!=None: - warnings.filterwarnings('ignore','Could not match*') - tempfname = figure_fname + '.png' - pylab.savefig(tempfname) - shutil.move(tempfname,figure_fname) - - auROC=pm.get_auROC() - return auROC ; - -def plotprc(output, LTE, figure_fname="", prc_label='PRC'): - """Plot the precision recall curve""" - import pylab - import matplotlib - - pylab.figure(2,dpi=150,figsize=(4,4)) - - pm=PerformanceMeasures(Labels(numpy.array(LTE)), Labels(numpy.array(output))) - - points=pm.get_PRC() - points=numpy.array(points).T # for pylab.plot - pylab.plot(points[0], points[1], 'b-', label=prc_label) - pylab.axis([0, 1, 0, 1]) - ticks=numpy.arange(0., 1., .1, dtype=numpy.float64) - pylab.xticks(ticks,size=10) - pylab.yticks(ticks,size=10) - pylab.xlabel('sensitivity (true positive rate)',size=10) - pylab.ylabel('precision (1 - false discovery rate)',size=10) - pylab.legend(loc='lower right') - - if figure_fname!=None: - warnings.filterwarnings('ignore','Could not match*') - tempfname = figure_fname + '.png' - pylab.savefig(tempfname) - shutil.move(tempfname,figure_fname) - - auPRC=pm.get_auPRC() - return auPRC ; - -def plotcloud(cloud, figure_fname="", label='cloud'): - """Plot the cloud of points (the first two dimensions only)""" - import pylab - import matplotlib - - pylab.figure(1,dpi=150,figsize=(4,4)) - - pos = [] - neg = [] - for i in xrange(len(cloud)): - if cloud[i][0]==1: - pos.append(cloud[i][1:]) - elif cloud[i][0]==-1: - neg.append(cloud[i][1:]) - - fontdict=dict(family="cursive",weight="bold",size=10,y=1.05) ; - pylab.title(label, fontdict) - points=numpy.array(pos).T # for pylab.plot - pylab.plot(points[0], points[1], 'b+', label='positive') - points=numpy.array(neg).T # for pylab.plot - pylab.plot(points[0], points[1], 'rx', label='negative') - #pylab.axis([0, 1, 0, 1]) - #ticks=numpy.arange(0., 1., .1, dtype=numpy.float64) - #pylab.xticks(ticks,size=10) - #pylab.yticks(ticks,size=10) - pylab.xlabel('dimension 1',size=10) - pylab.ylabel('dimension 2',size=10) - pylab.legend(loc='lower right') - - if figure_fname!=None: - warnings.filterwarnings('ignore','Could not match*') - tempfname = figure_fname + '.png' - pylab.savefig(tempfname) - shutil.move(tempfname,figure_fname) - -def plot_poims(poimfilename, poim, max_poim, diff_poim, poim_totalmass, poimdegree, max_len): - """Plot a summary of the information in poims""" - import pylab - import matplotlib - - pylab.figure(3, dpi=150, figsize=(4,5)) - - # summary figures - fontdict=dict(family="cursive",weight="bold",size=7,y=1.05) ; - pylab.subplot(3,2,1) - pylab.title('Total POIM Mass', fontdict) - pylab.plot(poim_totalmass) ; - pylab.ylabel('weight mass', size=5) - - pylab.subplot(3,2,3) - pylab.title('POIMs', fontdict) - pylab.pcolor(max_poim, shading='flat') ; - - pylab.subplot(3,2,5) - pylab.title('Differential POIMs', fontdict) - pylab.pcolor(diff_poim, shading='flat') ; - - for plot in [3, 5]: - pylab.subplot(3,2,plot) - ticks=numpy.arange(1., poimdegree+1, 1, dtype=numpy.float64) - ticks_str = [] - for i in xrange(0, poimdegree): - ticks_str.append("%i" % (i+1)) - ticks[i] = i + 0.5 - pylab.yticks(ticks, ticks_str) - pylab.ylabel('degree', size=5) - - # per k-mer figures - fontdict=dict(family="cursive",weight="bold",size=7,y=1.04) ; - - # 1-mers - pylab.subplot(3,2,2) - pylab.title('1-mer Positional Importance', fontdict) - pylab.pcolor(poim[0], shading='flat') ; - ticks_str = ['A', 'C', 'G', 'T'] - ticks = [0.5, 1.5, 2.5, 3.5] - pylab.yticks(ticks, ticks_str, size=5) - pylab.axis([0, max_len, 0, 4]) - - # 2-mers - pylab.subplot(3,2,4) - pylab.title('2-mer Positional Importance', fontdict) - pylab.pcolor(poim[1], shading='flat') ; - i=0 ; - ticks=[] ; - ticks_str=[] ; - for l1 in ['A', 'C', 'G', 'T']: - for l2 in ['A', 'C', 'G', 'T']: - ticks_str.append(l1+l2) - ticks.append(0.5+i) ; - i+=1 ; - pylab.yticks(ticks, ticks_str, fontsize=5) - pylab.axis([0, max_len, 0, 16]) - - # 3-mers - pylab.subplot(3,2,6) - pylab.title('3-mer Positional Importance', fontdict) - pylab.pcolor(poim[2], shading='flat') ; - i=0 ; - ticks=[] ; - ticks_str=[] ; - for l1 in ['A', 'C', 'G', 'T']: - for l2 in ['A', 'C', 'G', 'T']: - for l3 in ['A', 'C', 'G', 'T']: - if numpy.mod(i,4)==0: - ticks_str.append(l1+l2+l3) - ticks.append(0.5+i) ; - i+=1 ; - pylab.yticks(ticks, ticks_str, fontsize=5) - pylab.axis([0, max_len, 0, 64]) - - # x-axis on last two figures - for plot in [5, 6]: - pylab.subplot(3,2,plot) - pylab.xlabel('sequence position', size=5) - - - # finishing up - for plot in xrange(0,6): - pylab.subplot(3,2,plot+1) - pylab.xticks(fontsize=5) - - for plot in [1,3,5]: - pylab.subplot(3,2,plot) - pylab.yticks(fontsize=5) - - pylab.subplots_adjust(hspace=0.35) ; - - # write to file - warnings.filterwarnings('ignore','Could not match*') - pylab.savefig('/tmp/temppylabfig.png') - shutil.move('/tmp/temppylabfig.png',poimfilename) - diff --git a/applications/easysvm/esvm/poim.py b/applications/easysvm/esvm/poim.py deleted file mode 100644 index f50fd6a426d..00000000000 --- a/applications/easysvm/esvm/poim.py +++ /dev/null @@ -1,128 +0,0 @@ -""" -This module contains code for computing -Position Oligomer Importance Matrices -""" - -############################################################################################# -# # -# This program is free software; you can redistribute it and/or modify # -# it under the terms of the GNU General Public License as published by # -# the Free Software Foundation; either version 3 of the License, or # -# (at your option) any later version. # -# # -# This program is distributed in the hope that it will be useful, # -# but WITHOUT ANY WARRANTY; without even the implied warranty of # -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # -# GNU General Public License for more details. # -# # -# You should have received a copy of the GNU General Public License # -# along with this program; if not, see http://www.gnu.org/licenses # -# or write to the Free Software Foundation, Inc., 51 Franklin Street, # -# Fifth Floor, Boston, MA 02110-1301 USA # -# # -############################################################################################# - -import numpy -from numpy import ones - - -def compute_poims(svm, kernel, poimdegree, max_len): - """For a trained SVM, compute Position Oligomer Importance Matrices""" - - distr = ones((max_len,4))/4 ; - kernel.prepare_POIM2(distr) - - kernel.compute_POIM2(poimdegree, svm) ; - poim = kernel.get_POIM2() - kernel.cleanup_POIM2() - - (poim, max_poim, diff_poim) = reshape_normalize_contribs(poim, poimdegree, max_len) - (poim_weightmass, poim_totalmass) = compute_weight_mass(poim, poimdegree, max_len) - - poim_totalmass=poim_totalmass/numpy.sum(poim_totalmass) - - return (poim, max_poim, diff_poim, poim_totalmass) - - -def compute_weight_mass(C, maxOrder, seqLen): - - mass=numpy.zeros((maxOrder, seqLen), numpy.double); - total=numpy.zeros((1, seqLen), numpy.double); - for i in xrange(0,maxOrder): - mass[i,:] = sum(numpy.abs(C[i])) - total = sum(mass); - - return (mass,total) - -def getstringprobsMC(maxOrder,distrib,length, abcSize): - - pmatrix = [] - for k in xrange(0,maxOrder): - pmatrix.append(ones(4^k,len)) - - for l in xrange(0,len): - for sigma in xrange(0, abcSize): - prob = distrib(sigma,l); - for k in xrange(0, maxOrder): - for relpos in xrange(0, min(k,l)): - vi = genindexvector_spos(k,sigma-1,relpos,abcSize); - pmatrix[k][vi,l-relpos+1] = pmatrix[k][vi,l-relpos+1]*prob; - - return pmatrix - -def getV2_poimMC(u, strprobs, abcSize): - VV = []; - for k in xrange(0, len(u)): - m = abcSize^k; - VV.append( numpy.ones(4**(k+1),1)*mean(u[k]*strprobs[k] ) ) - - return VV - -def reshape_normalize_contribs(C, maxOrder, seqLen, opts={}): - - alphabetSize = 4; - Contribs = [] ; - l=0; - for i in xrange(0, maxOrder): - L = l + (alphabetSize**(i+1)) * seqLen; - vec=C[l:L].copy() ; - Contribs.append(vec.reshape( seqLen, alphabetSize**(i+1) ).T) ; - l = L; - - assert( l == len(C) ); - - if opts.has_key("distribution"): - strprobs = getstringprobsMC(length(Contribs), opts["distribution"], seqLen, 4); - MyV2 = getV2_poimMC(Contribs, strprobs, seqLen, 4); - - for i in xrange(0, maxOrder ): - Contribs[i] = Contribs[i] -MyV2[i]; - - if opts.has_key("background"): - for i in xrange(0, maxOrder ): - Contribs[i] = Contribs[i]*(opts["background"][i]!=0); - - maxContribs = numpy.zeros( (maxOrder, seqLen), numpy.double ); - maxp_org = numpy.zeros( (maxOrder, seqLen), numpy.double ); - maxp_str= numpy.zeros( (maxOrder, seqLen), numpy.int ); - for i in xrange(0, maxOrder ): - con=numpy.abs(Contribs[i]) ; - maxContribs[i,:] = numpy.max(con, axis=0) - maxp_str[i,:] = numpy.argmax(con, axis=0) - - diffmaxContribs = numpy.zeros( (maxOrder, seqLen), numpy.double ); - - for k in xrange(1, maxOrder ): - numsy=4**(k+1); - for l in xrange(0, seqLen-k): - km=maxp_str[k,l] ; - A=numpy.abs(Contribs[k-1][numpy.floor(km/4),l]); - B=numpy.abs(Contribs[k-1][numpy.mod(km,numsy/4),l+1]); - #zA=numpy.mod(km,4)+1; - #zB=numpy.floor(km/(numsy/4))+1; - #correction=sum([A/distribution(zA, l+k-1), B/distribution(zB, l)]); - correction=numpy.max([A, B]); - diffmaxContribs[k,l] = maxContribs[k,l] - correction; - - return (Contribs, maxContribs, diffmaxContribs) - diff --git a/applications/easysvm/esvm/utils.py b/applications/easysvm/esvm/utils.py deleted file mode 100644 index dfdefa24456..00000000000 --- a/applications/easysvm/esvm/utils.py +++ /dev/null @@ -1,180 +0,0 @@ -############################################################################################# -# # -# This program is free software; you can redistribute it and/or modify # -# it under the terms of the GNU General Public License as published by # -# the Free Software Foundation; either version 3 of the License, or # -# (at your option) any later version. # -# # -# This program is distributed in the hope that it will be useful, # -# but WITHOUT ANY WARRANTY; without even the implied warranty of # -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # -# GNU General Public License for more details. # -# # -# You should have received a copy of the GNU General Public License # -# along with this program; if not, see http://www.gnu.org/licenses # -# or write to the Free Software Foundation, Inc., 51 Franklin Street, # -# Fifth Floor, Boston, MA 02110-1301 USA # -# # -############################################################################################# - -import sys -import random -import numpy -import warnings -import shutil - -from shogun import Labels -from shogun import * - -################################################################################ -# evaluation functions - -def confusionMatrix(labels_test, labels_predicted): - """Compute the matrix of predictions versus labels""" - if len(labels_test) != len(labels_predicted): - return 0 - TP = 0; FP = 0; TN = 0; FN = 0 - for i in range(0, len(labels_test)): - if labels_test[i] == 0 or labels_predicted[i] == 0: - return 0 - if labels_test[i] > 0: - if labels_predicted[i] > 0: TP += 1 - else: FN +=1 - else: - if labels_predicted[i] > 0: FP += 1 - else: TN += 1 - return (TP, TN, FP, FN) - -def accuracy(output, labels_test): - """How many correct predictions?""" - TP, TN, FP, FN = confusionMatrix(labels_test, numpy.sign(output)) - return float(TP + TN) / (TP + TN + FP + FN) - -def calcroc(output, LTE): - """The area under the receiver operating characteristic curve""" - pm=ROCEvaluation() - pm.evaluate(Labels(numpy.array(output)), Labels(numpy.array(LTE))) - - auROC=pm.get_auROC() - return auROC - -def calcprc(output, LTE): - """The area under the precision recall curve""" - pm=PRCEvaluation() - pm.evaluate(Labels(numpy.array(output)), Labels(numpy.array(LTE))) - - auPRC=pm.get_auPRC() - return auPRC - - -def calcperf(output, LTE, perflist): - """Compute all the performance measures in perflist""" - resperf = [] - for perf in perflist: - resperf.append(apply(perf,(output,LTE))) - - return resperf - - -################################################################################ -# splitting functions - -def getPartitionedSet(total, crossval_repeat, seed=None): - """Generate a list of indices, splitting the dataset""" - if seed==None: - random.seed(123456789) - else: - random.seed(seed) - - size = int(total / crossval_repeat) - mod = total % crossval_repeat - - splits = [] - for i in range(0, crossval_repeat): - if i < mod: - splits.append(size + 1) - else: - splits.append(size) - - ipartition = random.sample(xrange(0,total), total) # random sampling - - index = 0 - partitions = [] - - for size in splits: - partitions.append(ipartition[index:index+size]) - index += size - - return partitions - - -def getCurrentSplit(repetition, partitions, labels, seqs): - """Split the data into training and test sets""" - X = []; Y = []; XT = []; YT = [] - for i in range(0, len(partitions)): - if type(seqs) == type(list([])): - for j in range(0, len(partitions[i])): - if repetition != i: - X.append(seqs[partitions[i][j]]) - Y.append(labels[partitions[i][j]]) - else: - XT.append(seqs[partitions[i][j]]) - YT.append(labels[partitions[i][j]]) - else: - if repetition != i: - if len(X) == 0: - X = seqs.take(partitions[i],axis=1) - Y = labels.take(partitions[i]) - else: - X = numpy.concatenate((X,seqs.take(partitions[i],axis=1)),axis=1) - Y = numpy.concatenate((Y,labels.take(partitions[i]))) - else: - XT = seqs.take(partitions[i],axis=1) - YT = labels.take(partitions[i]) - - return X, Y, XT, YT - -################################################################################ - -def check_params(params, C, max_len): - """Check for validity of parameters""" - if (C<=0): - sys.stderr.write( "\nerror: the parameter 'C' has to be larger than 0\n" ) - assert(C>0) - - if params.has_key("degree"): - if (params["degree"]<=0): - sys.stderr.write( "\nerror: the parameter 'degree' has to be larger than 0\n" ) - assert(params["degree"]>0) - - if params.has_key("width"): - print params["width"] - if (params["width"]<=0): - sys.stderr.write( "\nerror: the parameter 'width' has to be larger than 0\n" ) - assert(params["width"]>0) - - if params.has_key("shift"): - if (params["shift"]<0) or (params["shift"]>max_len): - sys.stderr.write( "\nerror: the parameter 'shift' has to be larger than 0 and smaller than %i\n" % max_len ) - assert((params["shift"]>=0) and (params["shift"]<=max_len)) - - if params.has_key("poim_degree"): - if params["poim_degree"]>8: - sys.stderr.write( "\nerror: the parameter 'poim_degree' has to be smaller than 8\n" ) - assert(params["poim_degree"]<=8) - - if params.has_key("crossval_repeat"): - if params["crossval_repeat"]<1: - sys.stderr.write( "\nerror: number of cross-validation repeats has to be larger than one\n" ) - assert(params["crossval_repeat"]>1) - - if params.has_key("inhomogene"): - if params["inhomogene"]!=True and params["inhomogene"]!=False: - sys.stderr.write( "\nerror: the parameter 'inhomogene' has to be True or False\n" ) - assert(params["inhomogene"]==True or params["inhomogene"]==False) - - if params.has_key("normal"): - if params["normal"]!=True and params["normal"]!=False: - sys.stderr.write( "\nerror: the parameter 'normal' has to be True or False\n" ) - assert(params["normal"]==True or params["normal"]==False) - diff --git a/applications/easysvm/galaxy/CloudGen.xml b/applications/easysvm/galaxy/CloudGen.xml deleted file mode 100644 index fe064b132fe..00000000000 --- a/applications/easysvm/galaxy/CloudGen.xml +++ /dev/null @@ -1,53 +0,0 @@ - - Generation of a toy data set - datagen.py cloud $number_of_examples - $number_of_features $fraction_positive $spread $outfile_arff $outfile_png - - - - - - - - - - - - - - - - - - - - - - This tool is part of the MLB Galaxy package, adding some machine - learning functionality to PSU's Galaxy framework. Copyright (C) - 2008 Sebastian J. Schultheiss (sebi@umich.edu), Gunnar Raetsch - (raetsch@tuebingen.mpg.de) and Cheng Soon Ong (chengsoon.ong@tuebingen.mpg.de) - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, see http://www.gnu.org/licenses - or write to the Free Software Foundation, Inc., 51 Franklin Street, - Fifth Floor, Boston, MA 02110-1301 USA - - - - - diff --git a/applications/easysvm/galaxy/FastaGen.xml b/applications/easysvm/galaxy/FastaGen.xml deleted file mode 100644 index 8189e31cc82..00000000000 --- a/applications/easysvm/galaxy/FastaGen.xml +++ /dev/null @@ -1,52 +0,0 @@ - - Generation of a sequence toy data set for motif discovery in FASTA format. - datagen.py motif fasta $p_motif - $p_number_of_sequences $p_length $p_position - $p_mutation_rate $outfile_fasta - - - - - - - - - - - - - - - - - - - - - - - - This tool is part of the MLB Galaxy package, adding some machine - learning functionality to PSU's Galaxy framework. Copyright (C) - 2008 Sebastian J. Schultheiss (sebi@umich.edu), Gunnar Raetsch - (raetsch@tuebingen.mpg.de) and Cheng Soon Ong (chengsoon.ong@tuebingen.mpg.de) - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, see http://www.gnu.org/licenses - or write to the Free Software Foundation, Inc., 51 Franklin Street, - Fifth Floor, Boston, MA 02110-1301 USA - - - - - diff --git a/applications/easysvm/galaxy/MotifGen.xml b/applications/easysvm/galaxy/MotifGen.xml deleted file mode 100644 index 43934ed4a8e..00000000000 --- a/applications/easysvm/galaxy/MotifGen.xml +++ /dev/null @@ -1,91 +0,0 @@ - - Generation of a sequence toy data set for motif discovery in ARFF format. - datagen.py motif arff $p_motif - $p_number_of_sequences $p_length $p_position - $p_mutation_rate - #if $negative_model.negseq=="1" - T $negative_model.n_number_of_sequences $negative_model.n_length 1 1 - #else - $negative_model.n_motif - $negative_model.n_number_of_sequences $negative_model.n_length $negative_model.n_position - $negative_model.n_mutation_rate - #end if - $outfile_arff - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - This tool is part of the MLB Galaxy package, adding some machine - learning functionality to PSU's Galaxy framework. Copyright (C) - 2008 Sebastian J. Schultheiss (sebi@umich.edu), Gunnar Raetsch - (raetsch@tuebingen.mpg.de) and Cheng Soon Ong (chengsoon.ong@tuebingen.mpg.de) - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, see http://www.gnu.org/licenses - or write to the Free Software Foundation, Inc., 51 Franklin Street, - Fifth Floor, Boston, MA 02110-1301 USA - - - - - diff --git a/applications/easysvm/galaxy/README b/applications/easysvm/galaxy/README deleted file mode 100644 index 1e7b2566082..00000000000 --- a/applications/easysvm/galaxy/README +++ /dev/null @@ -1,3 +0,0 @@ -The files in this directory are a copy of -svn/projects/galaxy/tools/agr. If you edit them, make sure the -changes are also integrated into the main version. diff --git a/applications/easysvm/galaxy/easysvm.xml b/applications/easysvm/galaxy/easysvm.xml deleted file mode 100644 index cf147c8e711..00000000000 --- a/applications/easysvm/galaxy/easysvm.xml +++ /dev/null @@ -1,316 +0,0 @@ - - -Train SVMs and predict for given hyper-parameters - easysvm.py - #if $protocol.expt_type=="1" - cv $protocol.crossval - #elif $protocol.expt_type=="2" - modelsel $protocol.crossval - #elif $protocol.expt_type=="3" - pred - #end if - $C - #if $kernel.kname == "1" - linear - #elif $kernel.kname == "2" - poly ${kernel.degree} ${kernel.inhomogene} true - #elif $kernel.kname == "3" - gauss ${kernel.width} - #elif $kernel.kname == "4" - spec ${kernel.degree} - #elif $kernel.kname == "5" - wd ${kernel.degree} ${kernel.shift} - #end if - #if $protocol.expt_type=="1" - #if $protocol.datatype.inputdatatype=="1" - fasta $protocol.datatype.dataset_pos $protocol.datatype.dataset_neg - #else - arff $protocol.datatype.dataset - #end if - #elif $protocol.expt_type=="2" - #if $protocol.datatype.inputdatatype=="1" - fasta $protocol.datatype.dataset_pos $protocol.datatype.dataset_neg - #else - arff $protocol.datatype.dataset - #end if - #else - #if $protocol.datatype.inputdatatype=="1" - fasta $protocol.datatype.dataset_pos $protocol.datatype.dataset_neg - #else - arff $protocol.datatype.dataset - #end if - $protocol.datatype.dataset_test - #end if - $outfile - #if $kernel.kname=="4" or $kernel.kname=="5" - #if $kernel.seq.stype=="dna" - dna $kernel.seq.con - #elif $kernel.seq.stype=="protein" - protein $kernel.seq.con - #end if - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - - The SVM will classify examples according to their features and - report the classes to which they belong to for every entry in the - training set or the test set. The cross validation (CV) procedure - splits the data in a number of (approximately) equal sized sets - (controlled by the CV repeat parameter). Then, it holds out each - of those sets in turn for validation (prediction with the SVM), - while using all the other sets for training. - -.. class:: warningmark - - Please note that the - total computation time is proportional to the number of - cross-validation rounds. - -.. class:: infomark - - **TIP:** The result table with the individual classification for every - example. To find optimal settings for C and the kernel - parameters, check the prediction performance with *SVM Toolbox->Evaluate Predictions* - and adjust in either direction to find a good setting. Repeat - several times if necessary. Alternatively, use *SVM Toolbox->Model Selection*. - ------ - - This tool is part of the MLB Galaxy package, adding some machine - learning functionality to PSU's Galaxy framework. Copyright (C) - 2008 Sebastian J. Schultheiss (sebi@umich.edu), Gunnar Raetsch - (raetsch@tuebingen.mpg.de) and Cheng Soon Ong (chengsoon.ong@tuebingen.mpg.de) - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, see http://www.gnu.org/licenses - or write to the Free Software Foundation, Inc., 51 Franklin Street, - Fifth Floor, Boston, MA 02110-1301 USA - - - - - diff --git a/applications/easysvm/galaxy/eval.xml b/applications/easysvm/galaxy/eval.xml deleted file mode 100644 index b7d7d8bc0e1..00000000000 --- a/applications/easysvm/galaxy/eval.xml +++ /dev/null @@ -1,85 +0,0 @@ - - -Use predictions and labeled examples to measure prediction performance - easysvm.py - eval - $predfile - #if $datatype.inputdatatype=="1" - fasta $datatype.dataset_pos $datatype.dataset_neg - #else - arff $datatype.dataset - #end if - $outfile $figuretype $figurefile - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - - This tool takes a set of predictions and the a labeled dataset as input - and computes several performance measures. Only the labels are taken - into account for evaluation. Two output files are generated: a text - summary and either an ROC or PRC curve. - -.. class:: warningmark - - Make sure the prediction file matches the dataset. - -.. class:: infomark - - **TIP:** To optimize the SVM hyper-parameters, you may also use *SVM Toolbox->Model Selection*. - ----- - - This tool is part of the MLB Galaxy package, adding some machine - learning functionality to PSU's Galaxy framework. Copyright (C) - 2008 Sebastian J. Schultheiss (sebi@umich.edu), Gunnar Raetsch - (raetsch@tuebingen.mpg.de) and Cheng Soon Ong (chengsoon.ong@tuebingen.mpg.de) - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, see http://www.gnu.org/licenses - or write to the Free Software Foundation, Inc., 51 Franklin Street, - Fifth Floor, Boston, MA 02110-1301 USA - - - - - diff --git a/applications/easysvm/galaxy/modelsel.xml b/applications/easysvm/galaxy/modelsel.xml deleted file mode 100644 index 96b07a3963e..00000000000 --- a/applications/easysvm/galaxy/modelsel.xml +++ /dev/null @@ -1,251 +0,0 @@ - - -Find the best combination of SVM hyper-parameters - easysvm.py - modelsel $crossval - $C - #if $kernel.kname == "1" - linear - #elif $kernel.kname == "2" - poly ${kernel.degree} ${kernel.inhomogene} true - #elif $kernel.kname == "3" - gauss ${kernel.width} - #elif $kernel.kname == "4" - spec ${kernel.degree} - #elif $kernel.kname == "5" - wd ${kernel.degree} ${kernel.shift} - #end if - #if $datatype.inputdatatype=="1" - fasta $datatype.dataset_pos $datatype.dataset_neg - #else - arff $datatype.dataset - #end if - $outfile - #if $kernel.kname=="4" or $kernel.kname=="5" - #if $kernel.seq.stype=="dna" - dna $kernel.seq.con - #elif $kernel.seq.stype=="protein" - protein $kernel.seq.con - #end if - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - - For every parameter combination of C and kernel parameters - the performance is estimated using cross validation. The result is a list of - optimal parameter combinations for several performance measures. - -.. class:: warningmark - - Please note that the - total computation time scales with number of - cross-validation rounds and parameter combinations. - ----- - - This tool is part of the MLB Galaxy package, adding some machine - learning functionality to PSU's Galaxy framework. Copyright (C) - 2008 Sebastian J. Schultheiss (sebi@umich.edu), Gunnar Raetsch - (raetsch@tuebingen.mpg.de) and Cheng Soon Ong (chengsoon.ong@tuebingen.mpg.de) - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, see http://www.gnu.org/licenses - or write to the Free Software Foundation, Inc., 51 Franklin Street, - Fifth Floor, Boston, MA 02110-1301 USA - - - - - diff --git a/applications/easysvm/galaxy/poim.xml b/applications/easysvm/galaxy/poim.xml deleted file mode 100644 index 840af860ad0..00000000000 --- a/applications/easysvm/galaxy/poim.xml +++ /dev/null @@ -1,106 +0,0 @@ - - -Visualize SVMs with WD kernel - easysvm.py - poim - $C - ${poimdegree} - wd ${degree} ${shift} - #if $datatype.inputdatatype=="1" - fasta $datatype.dataset_pos $datatype.dataset_neg - #else - arff $datatype.dataset - #end if - $poimfile - #if $seq.stype=="dna" - dna $seq.con - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - - This tool trains an SVM with Weighted Degree kernel and computes - the so-called Positional Oligomer Importance Matrices that can be used - visualize features describing the decision boundary of the learned classifier. - ----- - - This tool is part of the MLB Galaxy package, adding some machine - learning functionality to PSU's Galaxy framework. Copyright (C) - 2008 Sebastian J. Schultheiss (sebi@umich.edu), Gunnar Raetsch - (raetsch@tuebingen.mpg.de) and Cheng Soon Ong (chengsoon.ong@tuebingen.mpg.de) - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, see http://www.gnu.org/licenses - or write to the Free Software Foundation, Inc., 51 Franklin Street, - Fifth Floor, Boston, MA 02110-1301 USA - - - - - diff --git a/applications/easysvm/scripts/datagen.py b/applications/easysvm/scripts/datagen.py deleted file mode 100644 index d65c6257ad0..00000000000 --- a/applications/easysvm/scripts/datagen.py +++ /dev/null @@ -1,103 +0,0 @@ -#!/usr/bin/env python - -############################################################################################# -# # -# This program is free software; you can redistribute it and/or modify # -# it under the terms of the GNU General Public License as published by # -# the Free Software Foundation; either version 3 of the License, or # -# (at your option) any later version. # -# # -# This program is distributed in the hope that it will be useful, # -# but WITHOUT ANY WARRANTY; without even the implied warranty of # -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # -# GNU General Public License for more details. # -# # -# You should have received a copy of the GNU General Public License # -# along with this program; if not, see http://www.gnu.org/licenses # -# or write to the Free Software Foundation, Inc., 51 Franklin Street, # -# Fifth Floor, Boston, MA 02110-1301 USA # -# # -############################################################################################# - -import sys -import random -from numpy import array -import esvm.parse -import esvm.plots -from esvm.datafuncs import MotifDataDef, fastawrite_sequence, arffwrite_sequence, arffwrite_real -from esvm.mldata import init_datasetfile - -if __name__ == '__main__': - - if len(sys.argv)<3 or (sys.argv[1]=='motif' and sys.argv[2]!='arff' and sys.argv[2]!='fasta') \ - or (sys.argv[1]=='motif' and sys.argv[2]=='fasta' and len(sys.argv)<9) \ - or (sys.argv[1]=='motif' and sys.argv[2]=='arff' and len(sys.argv)<14) \ - or (sys.argv[1]=='cloud' and len(sys.argv)<7) or (sys.argv[1]!='motif') \ - and (sys.argv[1]!='cloud'): - sys.stderr.write( "usage: %s motif fasta MOTIF numSeq seqLenRange"+\ - "positionRange mutationRate output.fa\n"+\ - "or: %s motif arff MOTIFPOS numSeq-pos seqLenRange-pos "+\ - "positionRange-pos mutationRate-pos \\\n"+\ - "motif-neg numSeq-neg seqLenRange-neg positionRange-neg "+\ - "mutationRange-neg output.arff\n"+\ - "or: %s cloud numpoints dimensions fractionOfPositives "+\ - "cloudWidth output.arff\n" % (sys.argv[0],sys.argv[0],sys.argv[0]) ) - sys.exit(-1) - - random.seed() - - if sys.argv[1] == 'motif': - if sys.argv[2]=='fasta': - # generate sequences in FASTA format - p = MotifDataDef() - p.motif = sys.argv[3] - p.numseq = int(sys.argv[4]) - (p.seqlenmin,p.seqlenmax) = esvm.parse.parse_range(sys.argv[5]) - (p.posstart,p.posend) = esvm.parse.parse_range(sys.argv[6]) - p.mutrate = float(sys.argv[7]) - - filename = sys.argv[8] - fastawrite_sequence(filename, p) - - else: - # generate sequences in ARFF format - assert(sys.argv[2]=='arff') - p = MotifDataDef() - p.motif = sys.argv[3] - p.numseq = int(sys.argv[4]) - (p.seqlenmin,p.seqlenmax) = esvm.parse.parse_range(sys.argv[5]) - (p.posstart,p.posend) = esvm.parse.parse_range(sys.argv[6]) - p.mutrate = float(sys.argv[7]) - - n = MotifDataDef() - n.motif = sys.argv[8] - n.numseq = int(sys.argv[9]) - (n.seqlenmin,n.seqlenmax) = esvm.parse.parse_range(sys.argv[10]) - (n.posstart,n.posend) = esvm.parse.parse_range(sys.argv[11]) - n.mutrate = float(sys.argv[12]) - - filename = sys.argv[13] - arffwrite_sequence(filename, p, n) - - elif sys.argv[1] == 'cloud': - # generate a data cloud in ARFF format - numpoint = int(sys.argv[2]) - numfeat = int(sys.argv[3]) - fracpos = float(sys.argv[4]) - width = float(sys.argv[5]) - - filename = sys.argv[6] - arffwrite_real(filename, numpoint, numfeat, fracpos, width) - if len(sys.argv)>=8: - fp = init_datasetfile(filename,'vec') - (examples,labels) = fp.readlines() - pointcloud = [] - for ix in xrange(numpoint): - pointcloud.append(array([labels[ix],examples[0,ix],examples[1,ix]])) - esvm.plots.plotcloud(pointcloud,sys.argv[7],'Pointcloud') - - #(examples,labels,metadata)=arffwrite_real(filename, numpoint, numfeat, fracpos, width) - #if len(sys.argv)>=8: - # plots.plotcloud(pointcloud,sys.argv[7],metadata) - else: - print 'Unknown option %s\n' % sys.argv[1] diff --git a/applications/easysvm/scripts/easysvm.py b/applications/easysvm/scripts/easysvm.py deleted file mode 100644 index 7566201d125..00000000000 --- a/applications/easysvm/scripts/easysvm.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python - -############################################################################################# -# # -# This program is free software; you can redistribute it and/or modify # -# it under the terms of the GNU General Public License as published by # -# the Free Software Foundation; either version 3 of the License, or # -# (at your option) any later version. # -# # -# This program is distributed in the hope that it will be useful, # -# but WITHOUT ANY WARRANTY; without even the implied warranty of # -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # -# GNU General Public License for more details. # -# # -# You should have received a copy of the GNU General Public License # -# along with this program; if not, see http://www.gnu.org/licenses # -# or write to the Free Software Foundation, Inc., 51 Franklin Street, # -# Fifth Floor, Boston, MA 02110-1301 USA # -# # -############################################################################################# - -import sys -import random -from esvm.experiment import svm_cv, svm_pred, svm_poim, svm_eval, svm_modelsel - -if __name__ == '__main__': - - if len(sys.argv)<2: - sys.stderr.write("usage: %s [cv|pred|modelsel|eval|poim] parameters\n" % sys.argv[0]) - sys.exit(-1) - - random.seed() - - topmode = sys.argv[1] - - if topmode == 'cv': - svm_cv(sys.argv) - elif topmode == 'pred': - svm_pred(sys.argv) - elif topmode == 'poim': - svm_poim(sys.argv) - elif topmode == 'eval': - svm_eval(sys.argv) - elif topmode == 'modelsel': - svm_modelsel(sys.argv) - else: - sys.stderr.write( "unknown mode %s (use: cv, pred, poim, eval)\n" % topmode) - sys.exit(-1) - - sys.exit(0) - diff --git a/applications/easysvm/setup.py b/applications/easysvm/setup.py deleted file mode 100755 index 774c111707b..00000000000 --- a/applications/easysvm/setup.py +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env python - -############################################################################################# -# # -# This program is free software; you can redistribute it and/or modify # -# it under the terms of the GNU General Public License as published by # -# the Free Software Foundation; either version 3 of the License, or # -# (at your option) any later version. # -# # -# This program is distributed in the hope that it will be useful, # -# but WITHOUT ANY WARRANTY; without even the implied warranty of # -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # -# GNU General Public License for more details. # -# # -# You should have received a copy of the GNU General Public License # -# along with this program; if not, see http://www.gnu.org/licenses # -# or write to the Free Software Foundation, Inc., 51 Franklin Street, # -# Fifth Floor, Boston, MA 02110-1301 USA # -# # -############################################################################################# - -import sys -from distutils.core import setup, Extension - - -sys.stdout.write('-------------------------------------------\n') -sys.stdout.write('easysvm - A front end to the shogun toolbox\n') -sys.stdout.write('-------------------------------------------\n\n') - - -try: - import numpy -except: - sys.stderr.write("WARNING: did not find 'numpy'\n") - -try: - import pylab -except: - sys.stderr.write("WARNING: did not find 'pylab'\n") - -try: - import shogun -except: - sys.stderr.write("WARNING: did not find 'shogun'\n") - sys.stderr.write(" shogun is a required back end.\n") - sys.stderr.write(" See shogun website: http://www.shogun-toolbox.org)\n") - -try: - import arff -except: - sys.stderr.write("WARNING: did not find 'arff'\n") - sys.stderr.write(" arff is required for reading and writing ARFF data files\n") - sys.stderr.write(" See arff website: http://www.mit.edu/~sav/arff/\n\n") - sys.stderr.write(" All other functionality should be ok.\n\n") - - - -setup (name = 'easysvm', - version = '0.3.3', - description = 'easysvm - A front end to the shogun toolbox', - author = ['Cheng Soon Ong', 'Gunnar Raetsch' ], - author_email = ['chengsoon.ong@tuebingen.mpg.de','gunnar.raetsch@tuebingen.mpg.de'], - license='GPLv3', - url = 'http://www.fml.tuebingen.mpg.de/raetsch/projects/easysvm', - py_modules=['esvm.datafuncs','esvm.mldata','esvm.mldata_arff','esvm.experiment','esvm.parse',\ - 'esvm.plots','esvm.poim','esvm.utils','splicesites.utils'], - packages=['esvm','splicesites'], - scripts=['scripts/easysvm.py','scripts/datagen.py'], - long_description=""" - easysvm is a front end to the shogun toolbox. It aims to be a - 'quick start' tutorial for users interested in shogun. As such, - it covers only the absolute basics of machine learning. - More advanced users should directly use the interfaces provided - by shogun. - - This setup.py installs two modules: - - esvm (which contains some basic tools for machine learning with SVMs) - - splicesites (which contains tools for splice site prediction) - - It also installs two scripts (easysvm.py and datagen.py) in the relevant binary directory. - The two scripts provide a command line interface to the toolkit. - """ - ) - diff --git a/applications/easysvm/splicesites/__init__.py b/applications/easysvm/splicesites/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/applications/easysvm/splicesites/test_gc.py b/applications/easysvm/splicesites/test_gc.py deleted file mode 100644 index 2f8a23efa09..00000000000 --- a/applications/easysvm/splicesites/test_gc.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python - -import bz2 -import time -import sys -import numpy -import numpy.matlib -from splicesites.utils import create_dataset -from esvm.utils import calcroc -from esvm.experiment import crossvalidation -from esvm.mldata import init_datasetfile - -def test_gc(gcfilename): - """ - Check the gc content files for conflicting labels - """ - fp = init_datasetfile(gcfilename,'vec') - (examples,labels) = fp.readlines() - print '%d positive and %d negative examples' % (sum(labels>0.0),sum(labels<0.0)) - - distance = sqr_dist(numpy.matrix(examples),numpy.matrix(examples)) - labdist = numpy.matrix(labels).T*numpy.matrix(labels) - #difflab = numpy.where(labdist.A<0,distance,numpy.matlib.ones((len(labels),len(labels)))) - contracount = 0 - for ix in xrange(len(labels)): - for iy in xrange(ix+1,len(labels)): - if labdist[ix,iy]<0 and distance[ix,iy]<0.01: - contracount += 1 - print distance.shape, labdist.shape - #print '%d identical examples with opposing labels' %len(numpy.unique(numpy.where(difflab==0)[0])) - print '%d identical examples with opposing labels' % contracount - - -def sqr_dist(a,b): - """Compute the square distance between vectors""" - dot_a = numpy.sum(numpy.multiply(a,a),axis=0).T - dot_b = numpy.sum(numpy.multiply(b,b),axis=0).T - unitvec = numpy.matlib.ones(dot_a.shape) - D = 2.0*a.T*b - - for ix,bval in enumerate(dot_b): - D[:,ix] = dot_a - D[:,ix] + numpy.kron(bval,unitvec) - - return D - - -if __name__ == '__main__': - test_gc('C_elegans_don_freq.csv') - test_gc('C_elegans_acc_freq.csv') - diff --git a/applications/easysvm/splicesites/tutorial_example.py b/applications/easysvm/splicesites/tutorial_example.py deleted file mode 100644 index 8cb7aa9e103..00000000000 --- a/applications/easysvm/splicesites/tutorial_example.py +++ /dev/null @@ -1,242 +0,0 @@ -#!/usr/bin/env python - -############################################################################################# -# # -# This program is free software; you can redistribute it and/or modify # -# it under the terms of the GNU General Public License as published by # -# the Free Software Foundation; either version 3 of the License, or # -# (at your option) any later version. # -# # -# This program is distributed in the hope that it will be useful, # -# but WITHOUT ANY WARRANTY; without even the implied warranty of # -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # -# GNU General Public License for more details. # -# # -# You should have received a copy of the GNU General Public License # -# along with this program; if not, see http://www.gnu.org/licenses # -# or write to the Free Software Foundation, Inc., 51 Franklin Street, # -# Fifth Floor, Boston, MA 02110-1301 USA # -# # -############################################################################################# - -import bz2 -import time -import sys -from splicesites.utils import create_dataset, create_modsel -from esvm.utils import calcroc -from esvm.experiment import crossvalidation -from esvm.mldata import init_datasetfile -from numpy.linalg import norm -import numpy - -def write_results(f, results): - """ - Write out results - """ - - f.write('Kernel\tParameters\tC\tauROC\n'); - for i in xrange(len(results)): - C=results[i][0] - k_param=results[i][1] - param_name=k_param[1]['name'] - kernel=k_param[0] - if kernel.endswith('2'): - kernel=kernel[:-1] - kernel_parameters= param_name + '=' + `k_param[1][param_name]` - perf = 100*results[i][2] - - f.write('%s\t' % kernel) - f.write('%s\t' % kernel_parameters) - f.write('C=%2.2f\t' % C) - f.write('%2.1f%%\n' % perf) - -def normalize(examples, subtract_mean=False, divide_std=False, rescale=False, norm_one=False): - """ - Scale GC data to ... (be on a ball? just const? 0 mean, std 1?) - """ - - if subtract_mean: - # mean = 0.0 - mean=numpy.mean(examples, axis=1) - for i in xrange(examples.shape[1]): - examples[:,i]-=mean - - if divide_std: - # std = 1.0 - std=numpy.std(examples, axis=1) - for i in xrange(examples.shape[1]): - examples[:,i]/=(std+1e-10) - - if rescale: - # scale to have on average 1 on linear kernel diagonal - scale=numpy.sqrt(numpy.mean(numpy.diag(numpy.mat(examples).T*numpy.mat(examples)))) - examples/=scale - - if norm_one: - # ball/circle - for i in xrange(examples.shape[1]): - examples[:,i]/=norm(examples[:,i]) - - return examples ; - -def run_single_experiment(results, num_fold_cv, kernelname, kparam, C, examples, labels): - """ - Run a single experiment, i.e. for a fixed kernel and parameters - do num_fold cross-validation - """ - - param_name=kparam['name'] - print 'Running C =', C, kernelname.title(), 'Kernel with', param_name, '=', kparam[param_name] - (all_outputs, all_split) = crossvalidation(num_fold_cv, kernelname, kparam, C, examples, labels, 'dna', 'A') - results.append( (C, (kernelname, kparam), calcroc(all_outputs,labels)) ) - -def splice_example(Cs, gcfilename,seqfilename,seq2filename, plot=False): - """ - For the data files, apply the set of kernels - """ - # hyperparameters - num_fold_cv = 5 - - # The area under the receiver operating characteristic - results=[] - - # Read datasets - - # GC features - fp = init_datasetfile(gcfilename,'vec') - (gc_examples,gc_labels) = fp.readlines() - gc_examples = normalize(gc_examples, subtract_mean=True) - - if plot: - from pylab import scatter,show - color=['b','r'] - scatter(gc_examples[0,], gc_examples[1,], s=400*(gc_labels+2), c=''.join([ color[(int(i)+1)/2] for i in gc_labels]), alpha=0.1) - show() - - # 2 sequence features - fp = init_datasetfile(seq2filename,'mseq') - (dna2_examples,dna2_labels) = fp.readlines() - - # DNA sequences - fp = init_datasetfile(seqfilename,'seq') - (dna_examples,dna_labels) = fp.readlines() - - - #Define experiments to carry out - - experiments=( - # Linear kernel on GC content - ('linear', {'scale':1.0, 'name':'scale'}, (gc_examples, gc_labels)), - - # Polynomial kernel on GC content - ( 'poly', {'degree':3, 'name':'degree', 'inhomogene':True, 'normal':True}, (gc_examples, gc_labels)), - ( 'poly', {'degree':5, 'name':'degree', 'inhomogene':True, 'normal':True}, (gc_examples, gc_labels)), - - # Gaussian kernel on GC content - ('gauss', {'width':100.0, 'name':'width'}, (gc_examples, gc_labels)), - ('gauss', {'width':1.0, 'name':'width'}, (gc_examples, gc_labels)), - ('gauss', {'width':0.01, 'name':'width'}, (gc_examples, gc_labels)), - - # Spectrum kernel on 2 dna sequences - ('spec2', {'degree':1, 'name':'degree'}, (dna2_examples, dna2_labels)), - ('spec2', {'degree':3, 'name':'degree'}, (dna2_examples, dna2_labels)), - ('spec2', {'degree':5, 'name':'degree'}, (dna2_examples, dna2_labels)), - - # Cumulative Spectrum kernel on 2 dna sequences - ('cumspec2', {'degree':1, 'name':'degree'}, (dna2_examples, dna2_labels)), - ('cumspec2', {'degree':3, 'name':'degree'}, (dna2_examples, dna2_labels)), - ('cumspec2', {'degree':5, 'name':'degree'}, (dna2_examples, dna2_labels)), - - # Weighted degree kernel on dna sequences - ('wd', {'degree':1,'shift':0, 'name':'degree'}, (dna_examples, dna_labels)), - ('wd', {'degree':3,'shift':0, 'name':'degree'}, (dna_examples, dna_labels)), - ('wd', {'degree':5,'shift':0, 'name':'degree'}, (dna_examples, dna_labels)) - ) - - - if Cs is None: - for C in (0.01, 0.1, 1, 2, 5, 10): - for e in experiments: - run_single_experiment(results, num_fold_cv, e[0], e[1], C, e[2][0], e[2][1]) - else: - for i in xrange(len(experiments)): - e=experiments[i] - run_single_experiment(results, num_fold_cv, e[0], e[1], Cs[i], e[2][0], e[2][1]) - - return results - -def get_best_results(results): - methods=('linear', 'poly', 'gauss', 'spec2', 'cumspec2', 'wd') - best_result=[] - for m in methods: - params=set() - for r in results: - if r[1][0]==m: - params.add(tuple(zip(r[1][1].keys(),r[1][1].values()))) - - for p in params: - m_result=0.0 - m_best=None - for r in results: - if r[1][0]==m and r[1][1]==dict(p) and r[2]>m_result: - m_result=r[2] - m_best=r - best_result.append(m_best) - return best_result - - -if __name__ == '__main__': - if len(sys.argv) > 1: - if sys.argv[1] == 'create_data': - create_dataset() - sys.exit(0) - elif sys.argv[1] == 'mselect': - results = splice_example(None, 'C_elegans_acc_modsel_gc.csv','C_elegans_acc_modsel_seq.csv','C_elegans_acc_modsel_seq2.csv') - #results = splice_example(None, 'C_elegans_acc_gc.csv','C_elegans_acc_seq.csv','C_elegans_acc_seq2.csv') - import pickle - pickle.dump(results, file('mselect_result.pickle','w')) - sys.exit(0) - elif sys.argv[1] == 'get_best': - import pickle - results=pickle.load(file('mselect_result.pickle')) - best_result=get_best_results(results) - write_results(sys.stdout, best_result) - - #print 'Cs=[', - #for e in best_result: - # print e[0], ",", - #print ']' - sys.exit(0) - else: - print "unknown argument" - sys.exit(1) - - # without any argument - starttime = time.time() - - Cs = [ 5, 10, 10, \ - 5, 0.01, 10, \ - 10, 10, 0.01, \ - 10, 10, 10, \ - 1, 1, 2 ] - - # run the experiment - results = splice_example(Cs, 'C_elegans_acc_gc.csv','C_elegans_acc_seq.csv','C_elegans_acc_seq2.csv', False) - - stoptime = time.time() - elapsedtime = time.strftime('Elapsed time (HH.MM:SS): %H.%M:%S',time.gmtime(stoptime-starttime)) - print elapsedtime - - write_results(file('results.txt','w'), results) - for curline in file('results.txt').readlines(): - print curline.strip() - -if __name__ == '__main__': - if len(sys.argv) > 1: - if sys.argv[1] == 'create_data': - create_dataset() - elif sys.argv[1] == 'create_modsel': - create_modsel() - sys.exit() - #main() - print 'results in results.txt' diff --git a/applications/easysvm/splicesites/utils.py b/applications/easysvm/splicesites/utils.py deleted file mode 100644 index 78322522e6b..00000000000 --- a/applications/easysvm/splicesites/utils.py +++ /dev/null @@ -1,288 +0,0 @@ -import random -import bz2 -import numpy -from numpy import array, where, concatenate -from numpy import kron, ones, sqrt, sum -from os.path import exists -from esvm.mldata import convert -try: - import arff - have_arff = True -except ImportError: - have_arff = False - - - -def create_dataset(): - """Read the file with first 100k sequences from C. elegans - and generate some easier datasets. - """ - - if not have_arff: - print 'import arff failed, currently cannot create data' - return - - # convert data to arff format - gen_arff('C_elegans_acc_100000.fasta.bz2','C_elegans_acc_gc.arff','C_elegans_acc_seq.arff',\ - 'C_elegans_acc_seq2.arff','C_elegans_acc_freq.arff',\ - num_seqs=100000,subset=True,overwrite=True,normalise=False,\ - max_pos=200,max_neg=2000) - - print 'Convert from arff to csv and fasta' - convert('C_elegans_acc_gc.arff','C_elegans_acc_gc.csv','vec') - convert('C_elegans_acc_seq.arff','C_elegans_acc_seq.csv','seq') - convert('C_elegans_acc_freq.arff','C_elegans_acc_freq.csv','vec') - convert('C_elegans_acc_seq2.arff','C_elegans_acc_seq2.csv','mseq') - convert('C_elegans_acc_seq.arff','C_elegans_acc_seq.fa','seq') - - -def create_modsel(): - """Read the file with last 100k sequences from C. elegans - and generate some easier datasets. - """ - - if not have_arff: - print 'import arff failed, currently cannot create data' - return - - # convert data to arff format - gen_arff('C_elegans_acc_modsel.fasta.bz2','C_elegans_acc_modsel_gc.arff','C_elegans_acc_modsel_seq.arff',\ - 'C_elegans_acc_modsel_seq2.arff','C_elegans_acc_modsel_freq.arff',\ - num_seqs=100000,subset=True,overwrite=True,normalise=False,\ - max_pos=200,max_neg=2000) - - print 'Convert from arff to csv and fasta' - convert('C_elegans_acc_modsel_gc.arff','C_elegans_acc_modsel_gc.csv','vec') - convert('C_elegans_acc_modsel_seq.arff','C_elegans_acc_modsel_seq.csv','seq') - convert('C_elegans_acc_modsel_freq.arff','C_elegans_acc_modsel_freq.csv','vec') - convert('C_elegans_acc_modsel_seq2.arff','C_elegans_acc_modsel_seq2.csv','mseq') - convert('C_elegans_acc_modsel_seq.arff','C_elegans_acc_modsel_seq.fa','seq') - - -def gen_arff(fastafilename,gcfilename,seqfilename,seq2filename,specfilename,\ - num_seqs=100000,subset=False,max_pos=200,max_neg=2000,\ - overwrite=False,normalise=True): - """If data not yet created, generate 2 arff files - - containing the two dimensional GC content before and after splice site - - containing the sequence around the splice site. - """ - if (exists(gcfilename) and exists(seqfilename)) and not overwrite: - return - - print 'Creating %s and %s from %s' % (gcfilename,seqfilename,fastafilename) - - if fastafilename.find('acc')!= -1: - # acceptor, AG at [40:42] - window = (-40, 197, 42) - elif fastafilename.find('don')!= -1: - # donor, GT or GC at [40:42] - window = (-40, 200, 42) - else: - print "Error: Cannot determine whether donor or acceptor" - - [strings, lab]=read_data(bz2.BZ2File(fastafilename), num_seqs, window) - # Only a subset of the examples are used. - if subset: - [strings, lab] = take_subset(strings, lab, max_pos, max_neg) - - gcs=count_gs_and_cs(strings, (0, -window[0]), (-window[0]+2, -window[0]+2+window[2])) - - seq_upstream = [] - seq_downstream = [] - for curstr in strings: - seq_upstream.append(curstr[0:-window[0]]) - seq_downstream.append(curstr[(-window[0]+2):(-window[0]+2+window[2])]) - seq_upstream = array(seq_upstream) - seq_downstream = array(seq_downstream) - - spec_up = count_nt_freq(seq_upstream) - spec_down = count_nt_freq(seq_downstream) - - if normalise: - gcs = normalise_features(gcs) - spec_up = normalise_features(spec_up) - spec_down = normalise_features(spec_down) - - # sequence file - alist = [('label',1,[]),('sequence',0,[])] - f = open(seqfilename,'w') - arff.arffwrite(f,alist,zip(lab,strings),name=fastafilename,comment='Converted from '+fastafilename) - f.close() - - # 2 sequence file - alist = [('label',1,[]),('upstream sequence',0,[]),('downstream sequence',0,[])] - f = open(seq2filename,'w') - arff.arffwrite(f,alist,zip(lab,seq_upstream,seq_downstream),\ - name=fastafilename,comment='Converted from '+fastafilename) - f.close() - - # gc contents - alist = [('label',1,[]),('upstream',1,[]),('downstream',1,[])] - data = [] - for ix,curlab in enumerate(lab): - data.append((curlab,gcs[0,ix],gcs[1,ix])) - f = open(gcfilename,'w') - arff.arffwrite(f,alist,data,name=fastafilename,comment='Converted from '+fastafilename) - f.close() - - # spectrum - alist = [('label',1,[]),\ - ('upA',1,[]),('upC',1,[]),('upG',1,[]),('upT',1,[]),\ - ('downA',1,[]),('downC',1,[]),('downG',1,[]),('downT',1,[])] - data = [] - for ix,curlab in enumerate(lab): - data.append((curlab,spec_up[0,ix],spec_up[1,ix],spec_up[2,ix],spec_up[3,ix],\ - spec_down[0,ix],spec_down[1,ix],spec_down[2,ix],spec_down[3,ix])) - if len(specfilename)>0: - f = open(specfilename,'w') - arff.arffwrite(f,alist,data,name=fastafilename,comment='Converted from '+fastafilename) - f.close() - - -def take_subset(strings, lab, max_pos=200, max_neg=2000): - """Take a subset of the classes to the maximum numbers determined by - max_pos and max_neg - """ - random.seed(123456789) - - pos_idx = where(lab>0)[0] - neg_idx = where(lab<0)[0] - num_pos = len(pos_idx) - num_neg = len(neg_idx) - - assert(num_pos < num_neg) - assert(max_pos < max_neg) - - max_pos = min(max_pos,num_pos) - max_neg = min(max_neg,num_neg) - - neg_sub_idx = array(random.sample(neg_idx,max_neg)) - assert(all(lab[neg_sub_idx]<0)) - pos_sub_idx = array(random.sample(pos_idx,max_pos)) - assert(all(lab[pos_sub_idx]>0)) - - strings = concatenate((strings[pos_sub_idx],strings[neg_sub_idx])) - lab = concatenate((lab[pos_sub_idx],lab[neg_sub_idx])) - - return (strings,lab) - -def balance_classes(strings, lab, max_examples=1200,ratio=5.0): - """Take a subset of negative examples such that - the number of examples in the negative class are limited to ratio. - - Also limit the maximum number of examples. - """ - random.seed(123456789) - - pos_idx = where(lab>0)[0] - neg_idx = where(lab<0)[0] - num_pos = len(pos_idx) - num_neg = len(neg_idx) - assert(num_pos < num_neg) - - max_pos = int(float(max_examples)/(ratio+1.0)) - - if num_pos < max_pos: - max_pos = num_pos - - pos_idx = pos_idx[:max_pos] - num_pos = len(pos_idx) - max_neg = int(num_pos*ratio) - if num_neg < max_neg: - max_neg = num_neg - - sub_idx = array(random.sample(neg_idx,max_neg)) - assert(all(lab[sub_idx]<0)) - - strings = concatenate((strings[pos_idx],strings[sub_idx])) - lab = concatenate((lab[pos_idx],lab[sub_idx])) - - return (strings,lab) - -def normalise_features(feats): - """Normalise each feature to zero mean and unit variance. - Assume features are column wise matrix. - - """ - (numdim,numex) = feats.shape - - M = sum(feats,axis=1)/numex - M = M.reshape(numdim,1) - - M2 = sum(feats**2,axis=1)/numex - M2 = M2.reshape(numdim,1) - SD = sqrt(M2-M**2) - onevec = ones((1,numex)) - feats = (feats - kron(onevec,M))/(kron(onevec,SD)) - - return feats - -def read_data(f, num, window): - """Read the fasta file containing splice sites.""" - labels=num*[0] - strings=num*[0] - - l1 = f.readline() - l2 = f.readline() - line = 0 - num_alt_consensus = 0 - while l1 and l2 and line 1 : - extension = sys.argv[1] - pylab.ioff() - create_figures(extension) diff --git a/applications/msplicer/LICENSE b/applications/msplicer/LICENSE deleted file mode 100644 index 5b6e7c66c27..00000000000 --- a/applications/msplicer/LICENSE +++ /dev/null @@ -1,340 +0,0 @@ - GNU GENERAL PUBLIC LICENSE - Version 2, June 1991 - - Copyright (C) 1989, 1991 Free Software Foundation, Inc. - 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -License is intended to guarantee your freedom to share and change free -software--to make sure the software is free for all its users. This -General Public License applies to most of the Free Software -Foundation's software and to any other program whose authors commit to -using it. (Some other Free Software Foundation software is covered by -the GNU Library General Public License instead.) You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -this service if you wish), that you receive source code or can get it -if you want it, that you can change the software or use pieces of it -in new free programs; and that you know you can do these things. - - To protect your rights, we need to make restrictions that forbid -anyone to deny you these rights or to ask you to surrender the rights. -These restrictions translate to certain responsibilities for you if you -distribute copies of the software, or if you modify it. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must give the recipients all the rights that -you have. You must make sure that they, too, receive or can get the -source code. And you must show them these terms so they know their -rights. - - We protect your rights with two steps: (1) copyright the software, and -(2) offer you this license which gives you legal permission to copy, -distribute and/or modify the software. - - Also, for each author's protection and ours, we want to make certain -that everyone understands that there is no warranty for this free -software. If the software is modified by someone else and passed on, we -want its recipients to know that what they have is not the original, so -that any problems introduced by others will not reflect on the original -authors' reputations. - - Finally, any free program is threatened constantly by software -patents. We wish to avoid the danger that redistributors of a free -program will individually obtain patent licenses, in effect making the -program proprietary. To prevent this, we have made it clear that any -patent must be licensed for everyone's free use or not licensed at all. - - The precise terms and conditions for copying, distribution and -modification follow. - - GNU GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License applies to any program or other work which contains -a notice placed by the copyright holder saying it may be distributed -under the terms of this General Public License. The "Program", below, -refers to any such program or work, and a "work based on the Program" -means either the Program or any derivative work under copyright law: -that is to say, a work containing the Program or a portion of it, -either verbatim or with modifications and/or translated into another -language. (Hereinafter, translation is included without limitation in -the term "modification".) Each licensee is addressed as "you". - -Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running the Program is not restricted, and the output from the Program -is covered only if its contents constitute a work based on the -Program (independent of having been made by running the Program). -Whether that is true depends on what the Program does. - - 1. You may copy and distribute verbatim copies of the Program's -source code as you receive it, in any medium, provided that you -conspicuously and appropriately publish on each copy an appropriate -copyright notice and disclaimer of warranty; keep intact all the -notices that refer to this License and to the absence of any warranty; -and give any other recipients of the Program a copy of this License -along with the Program. - -You may charge a fee for the physical act of transferring a copy, and -you may at your option offer warranty protection in exchange for a fee. - - 2. You may modify your copy or copies of the Program or any portion -of it, thus forming a work based on the Program, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) You must cause the modified files to carry prominent notices - stating that you changed the files and the date of any change. - - b) You must cause any work that you distribute or publish, that in - whole or in part contains or is derived from the Program or any - part thereof, to be licensed as a whole at no charge to all third - parties under the terms of this License. - - c) If the modified program normally reads commands interactively - when run, you must cause it, when started running for such - interactive use in the most ordinary way, to print or display an - announcement including an appropriate copyright notice and a - notice that there is no warranty (or else, saying that you provide - a warranty) and that users may redistribute the program under - these conditions, and telling the user how to view a copy of this - License. (Exception: if the Program itself is interactive but - does not normally print such an announcement, your work based on - the Program is not required to print an announcement.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Program, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Program, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Program. - -In addition, mere aggregation of another work not based on the Program -with the Program (or with a work based on the Program) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may copy and distribute the Program (or a work based on it, -under Section 2) in object code or executable form under the terms of -Sections 1 and 2 above provided that you also do one of the following: - - a) Accompany it with the complete corresponding machine-readable - source code, which must be distributed under the terms of Sections - 1 and 2 above on a medium customarily used for software interchange; or, - - b) Accompany it with a written offer, valid for at least three - years, to give any third party, for a charge no more than your - cost of physically performing source distribution, a complete - machine-readable copy of the corresponding source code, to be - distributed under the terms of Sections 1 and 2 above on a medium - customarily used for software interchange; or, - - c) Accompany it with the information you received as to the offer - to distribute corresponding source code. (This alternative is - allowed only for noncommercial distribution and only if you - received the program in object code or executable form with such - an offer, in accord with Subsection b above.) - -The source code for a work means the preferred form of the work for -making modifications to it. For an executable work, complete source -code means all the source code for all modules it contains, plus any -associated interface definition files, plus the scripts used to -control compilation and installation of the executable. However, as a -special exception, the source code distributed need not include -anything that is normally distributed (in either source or binary -form) with the major components (compiler, kernel, and so on) of the -operating system on which the executable runs, unless that component -itself accompanies the executable. - -If distribution of executable or object code is made by offering -access to copy from a designated place, then offering equivalent -access to copy the source code from the same place counts as -distribution of the source code, even though third parties are not -compelled to copy the source along with the object code. - - 4. You may not copy, modify, sublicense, or distribute the Program -except as expressly provided under this License. Any attempt -otherwise to copy, modify, sublicense or distribute the Program is -void, and will automatically terminate your rights under this License. -However, parties who have received copies, or rights, from you under -this License will not have their licenses terminated so long as such -parties remain in full compliance. - - 5. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Program or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Program (or any work based on the -Program), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Program or works based on it. - - 6. Each time you redistribute the Program (or any work based on the -Program), the recipient automatically receives a license from the -original licensor to copy, distribute or modify the Program subject to -these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties to -this License. - - 7. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Program at all. For example, if a patent -license would not permit royalty-free redistribution of the Program by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Program. - -If any portion of this section is held invalid or unenforceable under -any particular circumstance, the balance of the section is intended to -apply and the section as a whole is intended to apply in other -circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system, which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 8. If the distribution and/or use of the Program is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Program under this License -may add an explicit geographical distribution limitation excluding -those countries, so that distribution is permitted only in or among -countries not thus excluded. In such case, this License incorporates -the limitation as if written in the body of this License. - - 9. The Free Software Foundation may publish revised and/or new versions -of the General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - -Each version is given a distinguishing version number. If the Program -specifies a version number of this License which applies to it and "any -later version", you have the option of following the terms and conditions -either of that version or of any later version published by the Free -Software Foundation. If the Program does not specify a version number of -this License, you may choose any version ever published by the Free Software -Foundation. - - 10. If you wish to incorporate parts of the Program into other free -programs whose distribution conditions are different, write to the author -to ask for permission. For software which is copyrighted by the Free -Software Foundation, write to the Free Software Foundation; we sometimes -make exceptions for this. Our decision will be guided by the two goals -of preserving the free status of all derivatives of our free software and -of promoting the sharing and reuse of software generally. - - NO WARRANTY - - 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY -FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN -OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES -PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED -OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS -TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE -PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, -REPAIR OR CORRECTION. - - 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR -REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, -INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING -OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED -TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY -YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER -PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE -POSSIBILITY OF SUCH DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - -Also add information on how to contact you by electronic and paper mail. - -If the program is interactive, make it output a short notice like this -when it starts in an interactive mode: - - Gnomovision version 69, Copyright (C) year name of author - Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, the commands you use may -be called something other than `show w' and `show c'; they could even be -mouse-clicks or menu items--whatever suits your program. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the program, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the program - `Gnomovision' (which makes passes at compilers) written by James Hacker. - - , 1 April 1989 - Ty Coon, President of Vice - -This General Public License does not permit incorporating your program into -proprietary programs. If your program is a subroutine library, you may -consider it more useful to permit linking proprietary applications with the -library. If this is what you want to do, use the GNU Library General -Public License instead of this License. diff --git a/applications/msplicer/Makefile b/applications/msplicer/Makefile deleted file mode 100644 index 91104e2868c..00000000000 --- a/applications/msplicer/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -release-dir := msplicer-0.3 - -release: - ( cd .. ; mkdir -p $(release-dir)/data ; \ - cp python/msplicer python/LICENSE python/README python/NEWS python/*.py $(release-dir) ; \ - tar cjvf $(release-dir).tar.bz2 $(release-dir) ) -clean: - rm -f *.pyc diff --git a/applications/msplicer/NEWS b/applications/msplicer/NEWS deleted file mode 100644 index be08d8d7af1..00000000000 --- a/applications/msplicer/NEWS +++ /dev/null @@ -1,6 +0,0 @@ -2008-05-18 Soeren Sonnenburg - - * mSplicer version 0.3 - - Adjust code to work with newer shogun versions. - - Require shogun version at least 0.6.2 as splice site prediction in - the new version is now orders of magnitude faster. diff --git a/applications/msplicer/README b/applications/msplicer/README deleted file mode 100644 index c2757148a4c..00000000000 --- a/applications/msplicer/README +++ /dev/null @@ -1,110 +0,0 @@ -This is the mSplicer program accompanying the PLoS "Improving the C. elegans -genome annotation using machine learning" submission. by Gunnar Rätsch, Sören -Sonnenburg, Jagan Srinivasan, Hanh Witte, Klaus-Robert Müller, Ralf Sommer and -Bernhard Schölkopf. Published in PLoS Computational Biology, February, 2007. - -ABSTRACT: - -For modern biology, precise genome annotations are of prime importance as they -allow the accurate definition of genic regions. We employ state of the art -machine learning methods to assay and improve the accuracy of the genome -annotation of the nematode Caenorhabditis elegans. The proposed machine -learning system is trained to recognize exons and introns on the unspliced mRNA -utilizing recent advances in support vector machines and label sequence -learning. In 87% (coding and untranslated regions) and 95% (coding regions -only) of all genes tested in several out-of-sample evaluations, our method -correctly identified all exons and introns. Notably, only 37% and 50%, -respectively, of the presently unconfirmed genes in the C. elegans genome -annotation agree with our predictions, thus we hypothesize that a sizable -fraction of those genes are not correctly annotated. A retrospective evaluation -of the Wormbase WS120 annotation [1] of C. elegans reveals that splice form -predictions on unconfirmed genes in WS120 are inaccurate in about 18% of the -considered cases, while our predictions deviate from the truth only in 10 − -13%. We experimentally analyzed 20 controversial genes on which our system and -the annotation disagree, confirming the superiority of our predictions. While -our method correctly predicted 75% of those cases, the standard annotation was -never completely correct. The accuracy of our system is further corroborated by -a comparison with two other recently proposed systems that can be used for -splice form prediction: SNAP and ExonHunter. We conclude that the genome -annotation of C. elegans and other organisms can be greatly enhanced using -modern machine learning technology. Availabibility: - -Training the mSplicer involves solving a relatively large linear optimization -problem, which we have implemented in MATLAB using the CPLEX optimization -package. Additionally we have developed a standalone tool for predicting the -splice form for C. elegans sequences implemented in PYTHON and C++ available -under the General Public License. It is based on python scripts that call -methods implemented in C++ for predicting splice sites using Support Vector -Machines [2] and Dynamic Programming for splice form prediction. These routines -are part of the freely available Shogun toolbox for large scale kernel learning -[3] which is available under http://www.shogun-toolbox.org. - -If you have questions regarding the results in [4], please consult -http://www.msplicer.org or contact Gunnar Rätsch. In case you have difficulties -using the provided software, please contact Sören Sonnenburg or Gunnar Rätsch. - -Following a statistical setup common in machine learning, we trained our -system on 60% of the available cDNA sequences currently known for C. elegans -(based on Wormbase [5], version WS120). The remaining 40% of the cDNA sequences -were used to generate an independent set for out-of-sample testing. -Additionally, we used available EST sequences (dbEST [6], as of 19/02/2004) to -maximally extend the cDNA sequences at the 5’ and 3’ ends. For training, we did -not use any EST sequences overlapping with the 40% of the cDNA sequences for -out-of-sample prediction. - -MSPLICER PROGRAM REQUIREMENTS: - -The stand alone linux binary does not need further compilation/libraries and -should run out of the box (tested on Debian sarge and Debian etch). - -For the python version you need a working python 2.4 installation with numpy -(version 1.0 or later) and the shogun toolbox (version 0.6.2 or later) -- which is available from http://www.shogun-toolbox.org for Linux, MacOSX, -cygwin/win32. If you are running Debian GNU Linux, shogun 0.6.2 is available in -debian unstable http://packages.debian.org/unstable/science/shogun-python-modular. - -MSPLICER PROGRAM RUNNING TIME AND MEMORY REQUIREMENTS: - -mSplicer requires about 100M of memory for short sequences. Memory requirements -don't grow much (a additional linear term w.r.t. the length of the input -sequence). On first run with a new model (see --model option below), -msplicer will load and decompress the .bz2 compressed model file and store it -as a python native pickle dump, which increases startup times a lot. -Due to the optimizations in [3] splice form prediction (layer 1) times -won't change much for many/long sequences. Otherwise mSplicer running times are -dominated by computing the viterby path (layer 2). For example computing -the output of the 708 sequences (2.3Mb) of elegans_WS160_mSplicer_val.fa takes -on a 2GHz machine about 15 minutes and 170M of memory. - -MSPLICER PROGRAM USAGE: - -./msplicer fasta_file.fa - -This will read all entries in the .fa file and print a .gff file with the -predictions for each of the entries to stdout. One may optionally specify the -start and stop of the transcript via --start / --stop and -the model via --model one of WS120, WS120gc, WS150, WS160, WS160gc. Note that - is zero based. - - -REFERENCES: - -[1] Harris T, Chen N, Cunningham F, et al. (2004) Wormbase, a multi-species - resource for nematode biology and genomics. Nucl Acids Res 32. D411-7. - -[2] Cortes, C, Vapnik, VN. Support-vector networks. Machine Learning, - 20(3):273--297, 1995. - -[3] Sonnenburg, S, Rätsch, G, Schäfer, C, Schölkopf, B. Large Scale Multiple - Kernel Learning. Journal of Machine Learning Research,7:1531-1565, - July 2006, K.Bennett and E.P.-Hernandez Editors. - -[4] Rätsch, G, Sonnenburg, S, Srinivasan, J, Witte, H, Müller, KR, Sommer, R, - and Schölkopf, B (2007). Improving the C. elegans genome annotation using - machine learning. PLoS Computational Biology 3(2):e20. - -[5] Schwarz E, Antoshechkin I, Bastiani C, et al (2006) Wormbase, better - software, richer content. Nucleic Acids Res 34:D475–8. - -[6] Boguski M, Tolstoshev TLC (1993). dbEST–database for expressed sequence - tags. Nat Genet 4,332–3. diff --git a/applications/msplicer/content_sensors.py b/applications/msplicer/content_sensors.py deleted file mode 100644 index ee0bc39ad7e..00000000000 --- a/applications/msplicer/content_sensors.py +++ /dev/null @@ -1,56 +0,0 @@ -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# Written (W) 2006-2007 Soeren Sonnenburg -# Written (W) 2007 Gunnar Raetsch -# Copyright (C) 2007-2008 Fraunhofer Institute FIRST and Max-Planck-Society -# - -import numpy - -class content_sensors: - def __init__(self, model): - self.dict_weights_intron=numpy.array(model.dict_weights_intron, dtype=numpy.float64) - self.dict_weights_coding=numpy.array(model.dict_weights_coding, dtype=numpy.float64) - - self.dicts=numpy.concatenate((self.dict_weights_coding,self.dict_weights_intron, self.dict_weights_coding, self.dict_weights_intron, self.dict_weights_coding,self.dict_weights_intron, self.dict_weights_coding, self.dict_weights_intron), axis=0) - - self.dicts[0, 64:] = 0 # only order 3 info - self.dicts[1, 64:] = 0 # only order 3 info - self.dicts[2, 0:64] = 0 # only order 4 info - self.dicts[2, 320:] = 0 - self.dicts[3, 0:64] = 0 # only order 4 info - self.dicts[3, 320:] = 0 - self.dicts[4, 0:320] = 0 # only order 5 info - self.dicts[4, 1344:] = 0 - self.dicts[5, 0:320] = 0 # only order 5 info - self.dicts[5, 1344:] = 0 - self.dicts[6, 0:1344] = 0 # only order 6 info - self.dicts[7, 0:1344] = 0 # only order 6 info - - self.model = model - - def get_dict_weights(self): - return self.dicts.T - - def initialize_content(self, dyn): - dyn.init_svm_arrays(len(self.model.word_degree), len(self.model.mod_words)) - - word_degree = numpy.array(self.model.word_degree, numpy.int32) - dyn.init_word_degree_array(word_degree) - - mod_words = numpy.array(4**word_degree, numpy.int32) - dyn.init_num_words_array(mod_words) - - cum_mod_words=numpy.zeros(len(mod_words)+1, numpy.int32) - cum_mod_words[1:] = numpy.cumsum(mod_words) - dyn.init_cum_num_words_array(cum_mod_words) - - dyn.init_mod_words_array(numpy.array(self.model.mod_words, numpy.int32)) - dyn.init_sign_words_array(numpy.array(self.model.sign_words, numpy.bool)) - dyn.init_string_words_array(numpy.zeros(len(self.model.sign_words), numpy.int32)) - - assert(dyn.check_svm_arrays()) diff --git a/applications/msplicer/convert_mat.m b/applications/msplicer/convert_mat.m deleted file mode 100644 index 6c30878e5d4..00000000000 --- a/applications/msplicer/convert_mat.m +++ /dev/null @@ -1,213 +0,0 @@ -function convert_mat() - -addpath ../matlab -fnames={'../matlab/msplicer_elegansWS120_gc=0_orf=0.mat', '../matlab/msplicer_elegansWS120_gc=0_orf=1.mat', '../matlab/msplicer_elegansWS120_gc=1_orf=0.mat', '../matlab/msplicer_elegansWS150_gc=0_orf=0.mat', '../matlab/msplicer_elegansWS160_gc=0_orf=0.mat', '../matlab/msplicer_elegansWS160_gc=1_orf=0.mat', '../matlab/msplicer_elegansWS160_gc=1_orf=1.mat'}; - -for i=1:length(fnames), - clear L; - L=load(fnames{i}); - targetname=[ './data', fnames{i}(10:end-3), 'dat' ]; - - fid=fopen(targetname,'wb'); - - fprintf(fid, '%%msplicer definition file version: 1.0\n\n'); - fprintf(fid, 'bins=%d\n', L.bins); - fprintf(fid, 'dict_weights_intron='); - write_mat(fid, L.dict_weights_train.intron); - fprintf(fid, 'dict_weights_coding='); - write_mat(fid, L.dict_weights_train.coding); - fprintf(fid,'\n'); - - % has to fit to the python code (order of array in plif.py) - penids.acceptor = 0 ; - penids.donor = 1 ; - penids.first_coding_len = 2 ; - penids.last_coding_len = 3 ; - penids.coding_len = 4 ; - penids.single_coding_len = 5 ; - penids.intron_len = 6 ; - - if ~isempty(findstr(targetname,'_orf=1')) - make_a_trans_orf(fid, L, penids); - else - make_a_trans_noorf(fid, L, penids); - end - - %penalties - fprintf(fid,'%%penalties\n'); - write_penalty(fid, 'penalty_acceptor', L.penalty.acceptor); - write_penalty(fid, 'penalty_donor', L.penalty.donor); - write_penalty(fid, 'penalty_coding_len', L.penalty.coding_len); - write_penalty(fid, 'penalty_first_coding_len', L.penalty.first_coding_len); - write_penalty(fid, 'penalty_last_coding_len', L.penalty.last_coding_len); - write_penalty(fid, 'penalty_single_coding_len', L.penalty.single_coding_len); - write_penalty(fid, 'penalty_intron_len', L.penalty.intron_len); - write_penalty(fid, 'penalty_coding', L.penalty.coding); - write_penalty(fid, 'penalty_coding2', L.penalty.coding2); - write_penalty(fid, 'penalty_coding3', L.penalty.coding3); - write_penalty(fid, 'penalty_coding4', L.penalty.coding4); - write_penalty(fid, 'penalty_intron', L.penalty.intron); - write_penalty(fid, 'penalty_intron2', L.penalty.intron2); - write_penalty(fid, 'penalty_intron3', L.penalty.intron3); - write_penalty(fid, 'penalty_intron4', L.penalty.intron4); - write_penalty(fid, 'penalty_transitions', L.penalty.transitions); - fprintf(fid,'\n'); - - acc=load(L.accfname); - fprintf(fid,'%%acceptor splice\n'); - fprintf(fid, 'acc_splice_b=%e\n', acc.b); - fprintf(fid, 'acc_splice_order=%d\n', acc.PAR.order); - fprintf(fid, 'acc_splice_window_left=%d\n', 60); - fprintf(fid, 'acc_splice_window_right=%d\n', 79); - fprintf(fid, 'acc_splice_alphas='); - write_mat(fid, acc.alphas); - fprintf(fid, 'acc_splice_svs='); - write_string(fid, acc.XT); - fprintf(fid,'\n'); - - don=load(L.donfname); - fprintf(fid,'%%donor splice\n'); - fprintf(fid, 'don_splice_b=%e\n', don.b); - fprintf(fid, 'don_splice_use_gc=%d\n', don.PAR.use_gc); - fprintf(fid, 'don_splice_order=%d\n', don.PAR.order); - fprintf(fid, 'don_splice_window_left=%d\n', 80); - fprintf(fid, 'don_splice_window_right=%d\n', 59); - fprintf(fid, 'don_splice_alphas='); - write_mat(fid, don.alphas); - fprintf(fid, 'don_splice_svs='); - write_string(fid, don.XT); - - fclose(fid); - - system(sprintf('bzip2 -9 "%s"\n', targetname)); -end - -function make_a_trans_orf(fid, L, penids) - [A,p,q,info,penalties,orf_info]=gen_splice_model_orf(penids); - write_model(fid, L, A, p, q, info, penalties, orf_info) - -function make_a_trans_noorf(fid, L, penids) - [A, p, q, info, penalties, orf_info]=gen_splice_model_noorf(penids); - write_model(fid, L, A,p,q, info, penalties, orf_info) - -function write_model(fid, L, A,p,q, info, penalties, orf_info) - A(~isinf(A))=L.penalty.transitions.penalty; - %idx=[]; - %fieldns=fieldnames(info); - %for i=1:length(fieldns) - % if isequal(fieldns{i}, 'cnt') - % continue - % end - % idx=[idx getfield(info, fieldns{i})]; - %end - %A=A(idx,idx); - - a_trans = zeros(3,sum(~isinf(A(:)))) ; - k=0 ; - for i=1:size(A,1) - idx = find(~isinf(A(i,:))) ; - val = A(i,idx) ; - a_trans(1,k+1:k+length(idx))=i-1 ; - a_trans(2,k+1:k+length(idx))=idx-1 ; - a_trans(3,k+1:k+length(idx))=val ; - k=k+length(idx) ; - end ; - a_trans=a_trans' ; - [tmp,idx]=sort(a_trans(:,2)) ; - a_trans = a_trans(idx,:)' ; - - fprintf(fid, 'msplicer_a_trans='); - write_mat(fid, a_trans); - fprintf(fid, 'msplicer_p='); - p(isinf(p))=32768; - write_mat(fid, p(:)); - fprintf(fid, 'msplicer_q='); - q(isinf(q))=32768; - write_mat(fid, q(:)); - fprintf(fid,'\n'); - - % start-state: 0 - % exon-start-state: 1 - % donor-state: 2 - % acceptor-state: 3 - % exon-end-state: 4 - % stop-state: 5 - statedescr = zeros(1,info.cnt) ; - statedescr(info.start) = 0 ; - statedescr(info.atg) = 1 ; - statedescr(info.don) = 2 ; - statedescr(info.acc) = 3 ; - statedescr(info.stop) = 4 ; - statedescr(info.final) = 5 ; - - fprintf(fid, 'statedescr='); - write_mat(fid, statedescr); - fprintf(fid,'\n'); - - plifidmat = penalties ; - plifidmat(plifidmat==0)=-1 ; - - fprintf(fid, 'plifidmat='); - write_mat(fid, plifidmat); - fprintf(fid,'\n'); - - fprintf(fid, 'orf_info='); - write_mat(fid, orf_info); - fprintf(fid,'\n'); - - word_degree = [3,4,5,6] ; - mod_words = [1,1,1,1,1,1,1,1; - 0,0,0,0,0,0,0,0] ; - sign_words = [1,1,1,1,1,1,1,1] ; - - fprintf(fid, 'word_degree='); - write_mat(fid, word_degree); - fprintf(fid,'\n'); - - fprintf(fid, 'mod_words='); - write_mat(fid, mod_words); - fprintf(fid,'\n'); - - fprintf(fid, 'sign_words='); - write_mat(fid, sign_words); - fprintf(fid,'\n'); - - info - -function write_penalty(fid, name, x) - - if isfield(x, 'boundaries') - fprintf(fid, '%s_boundaries=', name); - write_mat(fid, x.boundaries(:,1:(end-1))); - else - warning('boundaries field does not exist!') - end - fprintf(fid, '%s_penalty=', name); - write_mat(fid, x.penalty'); - -function write_string(fid, x) - fprintf(fid, '[\n'); - for i=1:size(x,2), - fprintf(fid, '%c', x(1:(size(x,1)-1),i)); - fprintf(fid, '%c\n', x(size(x,1),i)); - end - fprintf(fid, ']\n'); - -function write_mat(fid, x) - if size(x,1)==1, - fprintf(fid, '['); - fprintf(fid, '%e, ', x(1:(length(x)-1))); - fprintf(fid, '%e', x(end)); - else - fprintf(fid, '['); - for i=1:size(x,2), - fprintf(fid, '%e, ', x(1:(size(x,1)-1),i)); - - if iC01F1.1_DNA_-400:+400 -agctcctcactacaagaaaaacgataagattatgcaattataggatactctgtaaaaaaa -accattcagaccgtttttggacgagcaaaatgaaaaattcgaaaatttagcggaatttgg -cttttctgagacaattttttaaagaaaaatatttacaaaagtcttaaattcaggaaatcc -acaaaaaaaagcacgaaaaataatcgcaaatgaaaaaaaattcaaataaaaacttcaaaa -accgtgatttctcaattttagccaaattccgagggaatttgtggttttcttgaattttag -acttttttgaaatttacctcgaaagaattcagttttttcaggattttttcttattttaat -gcgtaaaacatcccatttttaacccaaattatttccagaaatgtctggcctgaaacctgt -caaaccagaaggagttcaaagcgagttcagtgtccgcgttgcaaaacgaagcgatgatat -ccgttactctgtaatgatgttcaacggaatggacaaagtggacacatcaaaatggacaat -agacagtggtgttacaatggagagagaggataatcaacgtgtaattctatcaacacagac -agttcaagaatacggagaaggatccgagtatggaaaagctgcgagggaagaagctcgccg -aaagaaatatggaagacaatcaaaaaaatatcgacttgataatcagccatggaagatggc -attcactgagccagaaggacggcagaggcaaatgagaggaattcgagaaggtggtgcaaa -tgagcatgctgattattgggtttttctgaaaccaaatcaatcttctgagtttaaagctta -taaagtcgatgaatggcataaattcctgccagcgattactcataaaactcttgatattga -tcaagccgaggagcaattctctcagagatataaagttatgaatcaattcgctttgaaagc -agcgatacagaaccaattgagtgcgacggatgaatcggaaatgacagagcagcagaaacg -tctactgaaaattaaggatgaggcgagctctgatgattcggatggtgatgatgagggaga -gggtggtgatgatggaaaaaaggcgaagaataagaaaaagaagaagaagaatgcgaaacc -ggcgaaagagaagaggcagagggttgaggataaggatgatgttgctagtgagtaatagct -ctggcggattttttgaaatatcgaaggaataattcggaaaaattccgattaaaaacatcg -atttttgtttgaaatgccgaaaaataggaattttttcgaccttttcaaagaaaatcgaaa -aattaaacacaaattgaaaaaaaaattcgccattaatttttccgatacttcaaaaaaatc -gaatttttgctgttttttttttgaaaaaaccgattttccgaaaattcaacaaaaaaaaag -aaattttgaaaaatctaattttttgtttgtttaaaatgtgaaaaatcgaaaacttaacga -aaattgaaacaaaaattcgacatttcgtaaaaaaatggatttttccattaatatcgaaaa -taaataaaaaatttcggatttttttgtttgaagtatcggaaaatatgaaaattttctagt -ttttcagaaaaaaatcgtttttctatcttttttttttgaggaaaaatcaaaattttcaca -catttccggcattttaaacgaaaaaatatagatttttgatttcagaaaaatagaaaatgt -tcaagattatggtttctcattttacaaaaaaaaaaaattttaagaatacgagtcatcgga -cggagaggacgagggccgcgagtacgactacatttctgacagtggaactgattctgagtt -ggtttctttttttgaaaaaatttttaattttattttggaaaagaaaaaataacaaatctt -ttaaaaattcgtttttgattttctgaaaaaaattaaaaaatttcgaatttctctttttaa -aaaattcgttttgcgattcttattctaaaaaataaaaaaatcgggaaaaaatttattttt -cgatttattatgaaaaaaaatatttaaacaaaatataattgcaaaaataaaaataaaaaa -atttactaattagtcggaaaaattgggaaaaaatttaaaataccgaaaaaaatcaaacat -tttcgatgtttgatttttttttttctgaaaaaattataaaatagaatattttcgattttc -gctgggaaaaatttaaaaaatcgaagtttttcggttttttttctagaaaatcgataataa -atttccattttctaattaaaaattgttattttccaattttcttagtaaaattgaaaacta -aaaaaaagtccgaattttctgaaaaaaaatttgaaaaattgaaaatcttttccatttttc -gtttttctctgaaaaaaatgtaataaatcgaaaaaaaaattaatttttcgattttccctt -caaaaaaaattgaaaaatcaatgaattacccattttcagccgtgaacaagttccatcaga -cgaaaaaatcgagaaacaactagtcggagttgctgaagaagaaggagcacgtgaatctga -tagcagtgaatctgaagatgatttaacgaagaaattaatgaaaccatatggtgataagaa -aaaaggaaatgatattgaagaacgtgattcatctggaacagattctgatgtttccgacac -tgaaaaactcgattctgtagtttttatgaaggctaacaaggatggagaaggcggctcggg -aggaaccggaaaaaaacgtccaccaactgaagattctgatcttaaaatggataatcttgg -tccgagtgatgcgaaaaaagcgaaaccggctgttaaatttgaggaaggactcaatgagga -aactgttcgcagatatttgcgtcgtaaaccgcatacaacgaaggtgaattggaaaattcg -aaaaataggaaaaatttcagaaaaaaaaatcgattttctcaggataaaaaaaacaacaaa -aaaacgaaaaaaaaaatgtttaaaaagttaacaaaaaaaggataataaactttttattta -tttttaaaattaaaattatgaaacttgtgataattgtaagaaggaaattcaatttttgag -aaaattgcacgaaaactttaaaaatatttaaaatataggtataattttttcgaacgaaaa -aaacagaaaaatatataagttaatgcagaaaataccaagaaagttattttccgcgaaaat -ttcgatttttcggagaaaaattaaaaattaaaaaaaaatcaattttttggtttatattga -gaagaaaacacatgagaaaaaaattcaattttatgctaaaaataagaaaaacagccaaat -tgaaatttttcgatttttcaaaacctaaaaatcaaaaaactggtaacaaaatagattttt -tgttttcaaaaaatttaatactatttttttttataataaatttaaatctaatcaatttaa -cctatgagcatttttaaaaatttttaattaaaaaaaatagaaaatttttaatttttcaca -aaattcggaataaaaaactttagaaaaatgaattatgctagaaattgcatcatttgttca -taaaaaatgttaaaaatttgacaaaaaagcaagaaaactctaatataaattttcaattaa -aaaaaattataatttaaataaaattgaaagctgaaaaacctattcaaaaatcgctcctgt -accaaaaatttcagcgaaatctgaaaaaatccaaattttctcataataaaattccatttt -ttgcaggaactcctccacaaaatgaacggaaaatgcggaaatatgagcaaatctgaaatg -gtaacccaactggcttcaattctgaaagcaatcgaaccgaatcaatctcgacaattgaag -ggaaagaaggaagtactcttcttctcacttgtcaacactatcgcctataattttaactat -tttttcttctcttaaaaattatacaatccttcgcatttttattttaaattcaaattttcc -cgcccaaaattctgtaaatgaaccaaaagtttcgcgattaaaaaaattttatcgaaatat -ttaagtgcaaaatattctaaaagctaggaattatagatttttcaaaaaaattcaaataat -tatgcaagaatcacttgatcaaagccatccactcagcccacaggccttcaacattctctc -ccttattctgaatttcagtccgagcatcctgaacaatcacttctccctgatcatttacaa -ttctcatggctggaattgtcttgacttcgtatttttgcatcagggacctggaaaaaaaat -attattcaatcgtagaaaattgtgattt diff --git a/applications/msplicer/genomic.py b/applications/msplicer/genomic.py deleted file mode 100644 index cb49e82b7a0..00000000000 --- a/applications/msplicer/genomic.py +++ /dev/null @@ -1,152 +0,0 @@ -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# Written (W) 2006-2007 Soeren Sonnenburg -# Written (W) 2006-2007 Mikio Braun -# Copyright (C) 2007 Fraunhofer Institute FIRST and Max-Planck-Society -# - -import time -from string import maketrans - -""" this function is 100% compatible to the matlab function, thus it is one based (!) - use one_based=False if needed, then however the interval is [start,stop) (excluding stop) -""" -def load_genomic(chromosome, strand, start, stop, genome, one_based=True): - fname = '/fml/ag-raetsch/share/databases/genomes/' + genome + '/' + chromosome[3:] + '.flat' - f=file(fname) - if one_based: - f.seek(start-1) - str=f.read(stop-start+1) - else: - f.seek(start) - str=f.read(stop-start) - - if strand=='-': - return reverse_complement(str) - elif strand=='+': - return str - else: - print 'strand must be + or -' - raise KeyError - -""" read a table browser ascii output file (http://genome.ucsc.edu/cgi-bin/hgTables) """ -def read_table_browser(f): - table=dict(); - for l in f.readlines(): - if not l.startswith('#'): - (name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,proteinID,alignID)=l.split('\t') - exonStarts=[ int(i) for i in exonStarts.split(',')[:-1] ] - exonEnds=[ int(i) for i in exonEnds.split(',')[:-1] ] - - table[name]={ 'chrom': chrom, 'strand': strand, 'txStart': int(txStart), 'txEnd': int(txEnd), - 'cdsStart': int(cdsStart), 'cdsEnd': int(cdsEnd), 'exonCount': int(exonCount), 'exonStarts': exonStarts, - 'exonEnds': exonEnds, 'proteinID': proteinID, 'alignID': alignID[:-1] } - - return table - -""" get promoter region """ -def get_promoter_region(chromosome, strand, gene_start, gene_end, genome, length): - - if strand == '+': - return load_genomic(chromosome, strand, gene_start, gene_start+length, genome, one_based=False) - elif strand == '-': - return load_genomic(chromosome, strand, gene_end, gene_end+length, genome, one_based=False) - else: - print 'unknown strand' - return None - -""" reverse + complement a DNA sequence (only letters ACGT are translated!) - FIXME won't work with all the rest like y... """ -def reverse_complement(str): - t=maketrans('acgtACGT','tgcaTGCA') - return str[len(str)::-1].translate(t) - -""" works only with .fa files that contain a single entry """ -def read_single_fasta(fname): - str=file(fname).read() - str=str[str.index('\n')+1:].replace('\n','') - return str - -""" writes only single enty .fa files """ -def write_single_fasta(fname, name, str, linelen=60): - header= '>' + name + '\n' - f=file(fname,'a') - f.write(header) - for i in xrange(0,len(str),linelen): - f.write(str[i:i+linelen]+'\n') - f.close() - -""" read fasta as dictionary """ -def read_fasta(f): - fasta=dict() - - for s in f.readlines(): - if s.startswith('>'): - key=s[1:-1] - fasta[key]="" - else: - fasta[key]+=s[:-1] - - return fasta - -""" write dictionary fasta """ -def write_fasta(f, d, linelen=60): - for k in sorted(d): - f.write('>%s\n' % k); - s = d[k] - for i in xrange(0, len(s), linelen): - f.write(s[i:i+linelen] + '\n') - -def write_gff(f, (source, version), (seqtype, seqname), descrlist, skipheader=False): - """ writes a gff version 2 file - descrlist is a list of dictionaries, each of which contain these fields: - [attributes] [comments] - """ - - if not skipheader: - f.write('##gff-version 2\n') - f.write('##source-version %s %s\n' % (source, version) ) - - t=time.localtime() - f.write("##date %d-%d-%d %d:%d:%d\n" % t[0:6]) - - f.write('##Type %s %s\n' % (seqtype, seqname) ) - - for d in descrlist: - f.write('%s\t%s\t%s\t%d\t%d\t%f\t%s\t%d' % (d['seqname'], d['source'], - d['feature'], d['start'], d['end'], - d['score'], d['strand'], d['frame'])) - if d.has_key('attributes'): - f.write('\t' + d['attributes']) - if d.has_key('comments'): - f.write('\t' + d['comments']) - f.write('\n') - - -if __name__ == '__main__': - import sys,os - - table=read_table_browser(file('/fml/ag-raetsch/home/sonne/addnet/tfbs/share/data/wt1_bibliosphere_table_browser_hg17.txt')) - print table.keys() - print table[table.keys()[0]] - d = { 'ahoernchen' : 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT', - 'bhoernchen' : 'GATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACA' } - - write_fasta(sys.stdout, d) - write_fasta(file('/tmp/test.fa','w'), d) - - d2 = read_fasta(file('/tmp/test.fa')) - os.unlink('/tmp/test.fa') - - print d - print d2 - print d == d2 - - p=load_genomic('chr5', '+', 100000, 100100,'hg17') - n=load_genomic('chr1', '-', 3000000, 3001000,'mm7') - write_single_fasta('bla.fa','bla', 'ACGT') - n2=read_single_fasta('bla.fa') diff --git a/applications/msplicer/model.py b/applications/msplicer/model.py deleted file mode 100644 index 6c71b86cad0..00000000000 --- a/applications/msplicer/model.py +++ /dev/null @@ -1,307 +0,0 @@ -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# Written (W) 2006-2008 Soeren Sonnenburg -# Written (W) 2007 Gunnar Raetsch -# Copyright (C) 2006-2008 Fraunhofer Institute FIRST and Max-Planck-Society -# - -import sys -from numpy import mat,array,inf,any,reshape,int32 - -class model(object): - #model matrices - bins=None - dict_weights_intron=None - dict_weights_coding=None - a_trans=None - p=None - q=None - - statedescr = None - plifidmat = None - orf_info = None - use_orf = None - - word_degree = None - mod_words = None - sign_words = None - - #penalties - penalty_acceptor_boundaries=None - penalty_acceptor_penalty=None - penalty_donor_boundaries=None - penalty_donor_penalty=None - penalty_coding_len_boundaries=None - penalty_coding_len_penalty=None - penalty_first_coding_len_boundaries=None - penalty_first_coding_len_penalty=None - penalty_last_coding_len_boundaries=None - penalty_last_coding_len_penalty=None - penalty_single_coding_len_boundaries=None - penalty_single_coding_len_penalty=None - penalty_intron_len_boundaries=None - penalty_intron_len_penalty=None - penalty_coding_boundaries=None - penalty_coding_penalty=None - penalty_coding2_boundaries=None - penalty_coding2_penalty=None - penalty_coding3_boundaries=None - penalty_coding3_penalty=None - penalty_coding4_boundaries=None - penalty_coding4_penalty=None - penalty_intron_boundaries=None - penalty_intron_penalty=None - penalty_intron2_boundaries=None - penalty_intron2_penalty=None - penalty_intron3_boundaries=None - penalty_intron3_penalty=None - penalty_intron4_boundaries=None - penalty_intron4_penalty=None - penalty_transitions_penalty=None - - #acceptor - acc_splice_b=None - acc_splice_order=None - acc_splice_window_left=None - acc_splice_window_right=None - acc_splice_alphas=None - acc_splice_svs=None - - #donor - don_splice_b=None - don_splice_order=None - don_splice_use_gc=None - don_splice_window_left=None - don_splice_window_right=None - don_splice_alphas=None - don_splice_svs=None - - - -def parse_file(file): - m=model() - - l=file.readline(); - - if l != '%msplicer definition file version: 1.0\n': - sys.stderr.write("\nfile not a msplicer definition file\n") - return None - - while l: - if not ( l.startswith('%') or l.startswith('\n') ): # comment - if m.bins is None: m.bins=parse_value(l, 'bins') - if m.dict_weights_intron is None: m.dict_weights_intron=parse_matrix(l, file, 'dict_weights_intron') - if m.dict_weights_coding is None: m.dict_weights_coding=parse_matrix(l, file, 'dict_weights_coding') - if m.a_trans is None: m.a_trans=parse_matrix(l, file, 'msplicer_a_trans') - if m.p is None: - m.p=parse_vector(l, file, 'msplicer_p') - if m.p is not None: - m.p[m.p==32768]=-inf - if m.q is None: - m.q=parse_vector(l, file, 'msplicer_q') - if m.q is not None: - m.q[m.q==32768]=-inf - - if m.statedescr is None: - m.statedescr=parse_vector(l, file, 'statedescr') - if m.statedescr is not None: - m.statedescr=array(m.statedescr, int32) - - if m.plifidmat is None: - m.plifidmat=parse_matrix(l, file, 'plifidmat') - if m.plifidmat is not None: - m.plifidmat = array(m.plifidmat, int32) - - if m.orf_info is None: - m.orf_info=parse_matrix(l, file, 'orf_info') - if m.orf_info is not None: - m.orf_info=array(m.orf_info, int32).T - if any(m.orf_info != -1): - m.use_orf = True - else: - m.use_orf = False - - if m.word_degree is None: m.word_degree=parse_vector(l, file, 'word_degree') - if m.mod_words is None: m.mod_words=parse_matrix(l, file, 'mod_words') - if m.sign_words is None: m.sign_words=parse_vector(l, file, 'sign_words') - - #penalties - if m.penalty_acceptor_boundaries is None: m.penalty_acceptor_boundaries=parse_vector(l, file, 'penalty_acceptor_boundaries') - if m.penalty_acceptor_penalty is None: m.penalty_acceptor_penalty=parse_vector(l, file, 'penalty_acceptor_penalty') - if m.penalty_donor_boundaries is None: m.penalty_donor_boundaries=parse_vector(l, file, 'penalty_donor_boundaries') - if m.penalty_donor_penalty is None: m.penalty_donor_penalty=parse_vector(l, file, 'penalty_donor_penalty') - if m.penalty_coding_len_boundaries is None: m.penalty_coding_len_boundaries=parse_vector(l, file, 'penalty_coding_len_boundaries') - if m.penalty_coding_len_penalty is None: m.penalty_coding_len_penalty=parse_vector(l, file, 'penalty_coding_len_penalty') - if m.penalty_first_coding_len_boundaries is None: m.penalty_first_coding_len_boundaries=parse_vector(l, file, 'penalty_first_coding_len_boundaries') - if m.penalty_first_coding_len_penalty is None: m.penalty_first_coding_len_penalty=parse_vector(l, file, 'penalty_first_coding_len_penalty') - if m.penalty_last_coding_len_boundaries is None: m.penalty_last_coding_len_boundaries=parse_vector(l, file, 'penalty_last_coding_len_boundaries') - if m.penalty_last_coding_len_penalty is None: m.penalty_last_coding_len_penalty=parse_vector(l, file, 'penalty_last_coding_len_penalty') - if m.penalty_single_coding_len_boundaries is None: m.penalty_single_coding_len_boundaries=parse_vector(l, file, 'penalty_single_coding_len_boundaries') - if m.penalty_single_coding_len_penalty is None: m.penalty_single_coding_len_penalty=parse_vector(l, file, 'penalty_single_coding_len_penalty') - if m.penalty_intron_len_boundaries is None: m.penalty_intron_len_boundaries=parse_vector(l, file, 'penalty_intron_len_boundaries') - if m.penalty_intron_len_penalty is None: m.penalty_intron_len_penalty=parse_vector(l, file, 'penalty_intron_len_penalty') - if m.penalty_coding_boundaries is None: m.penalty_coding_boundaries=parse_vector(l, file, 'penalty_coding_boundaries') - if m.penalty_coding_penalty is None: m.penalty_coding_penalty=parse_vector(l, file, 'penalty_coding_penalty') - if m.penalty_coding2_boundaries is None: m.penalty_coding2_boundaries=parse_vector(l, file, 'penalty_coding2_boundaries') - if m.penalty_coding2_penalty is None: m.penalty_coding2_penalty=parse_vector(l, file, 'penalty_coding2_penalty') - if m.penalty_coding3_boundaries is None: m.penalty_coding3_boundaries=parse_vector(l, file, 'penalty_coding3_boundaries') - if m.penalty_coding3_penalty is None: m.penalty_coding3_penalty=parse_vector(l, file, 'penalty_coding3_penalty') - if m.penalty_coding4_boundaries is None: m.penalty_coding4_boundaries=parse_vector(l, file, 'penalty_coding4_boundaries') - if m.penalty_coding4_penalty is None: m.penalty_coding4_penalty=parse_vector(l, file, 'penalty_coding4_penalty') - if m.penalty_intron_boundaries is None: m.penalty_intron_boundaries=parse_vector(l, file, 'penalty_intron_boundaries') - if m.penalty_intron_penalty is None: m.penalty_intron_penalty=parse_vector(l, file, 'penalty_intron_penalty') - if m.penalty_intron2_boundaries is None: m.penalty_intron2_boundaries=parse_vector(l, file, 'penalty_intron2_boundaries') - if m.penalty_intron2_penalty is None: m.penalty_intron2_penalty=parse_vector(l, file, 'penalty_intron2_penalty') - if m.penalty_intron3_boundaries is None: m.penalty_intron3_boundaries=parse_vector(l, file, 'penalty_intron3_boundaries') - if m.penalty_intron3_penalty is None: m.penalty_intron3_penalty=parse_vector(l, file, 'penalty_intron3_penalty') - if m.penalty_intron4_boundaries is None: m.penalty_intron4_boundaries=parse_vector(l, file, 'penalty_intron4_boundaries') - if m.penalty_intron4_penalty is None: m.penalty_intron4_penalty=parse_vector(l, file, 'penalty_intron4_penalty') - if m.penalty_transitions_penalty is None: m.penalty_transitions_penalty=parse_vector(l, file, 'penalty_transitions_penalty') - - #acceptor - if m.acc_splice_b is None: m.acc_splice_b=parse_value(l, 'acc_splice_b') - if m.acc_splice_order is None: m.acc_splice_order=parse_value(l, 'acc_splice_order') - if m.acc_splice_window_left is None: m.acc_splice_window_left=parse_value(l, 'acc_splice_window_left') - if m.acc_splice_window_right is None: m.acc_splice_window_right=parse_value(l, 'acc_splice_window_right') - if m.acc_splice_alphas is None: m.acc_splice_alphas=parse_vector(l, file, 'acc_splice_alphas') - if m.acc_splice_svs is None: m.acc_splice_svs=parse_string(l, file, 'acc_splice_svs') - - #donor - if m.don_splice_b is None: m.don_splice_b=parse_value(l, 'don_splice_b') - if m.don_splice_order is None: m.don_splice_order=parse_value(l, 'don_splice_order') - if m.don_splice_use_gc is None: m.don_splice_use_gc=parse_value(l, 'don_splice_use_gc') - if m.don_splice_window_left is None: m.don_splice_window_left=parse_value(l, 'don_splice_window_left') - if m.don_splice_window_right is None: m.don_splice_window_right=parse_value(l, 'don_splice_window_right') - if m.don_splice_alphas is None: m.don_splice_alphas=parse_vector(l, file, 'don_splice_alphas') - if m.don_splice_svs is None: m.don_splice_svs=parse_string(l, file, 'don_splice_svs') - - l=file.readline() - - sys.stderr.write('done\n') - return m - -def parse_value(line, name): - if (line.startswith(name)): - sys.stdout.write('.'); sys.stdout.flush() - return float(line[line.find('=')+1:-1]) - else: - return None - -def parse_vector(line, file, name): - mat = parse_matrix(line, file, name) - if mat is None: - return mat - else: - mat = array(mat).flatten() - return mat - -def parse_matrix(line, file, name): - if (line.startswith(name)): - sys.stdout.write('.'); sys.stdout.flush() - if line.find(']') < 0: - l='' - while l is not None and l.find(']') < 0: - line+=l - l=file.readline() - if l is not None and l.find(']') >= 0: - line+=l - - if line.find(']') < 0: - sys.stderr.write("matrix `" + name + "' ended without ']'\n") - return None - else: - mm = mat(line[line.find('['):line.find(']')+1]) - if len(mm.shape)==1: - mm = reshape(mm.shape[0],1) - return mm - else: - return None - -def parse_string(line, file, name): - if (line.startswith(name)): - sys.stdout.write('.'); sys.stdout.flush() - l='' - lines=[] - while l is not None and l.find(']') < 0: - if l: - lines.append(l[:-1]) - l=file.readline() - - if l.find(']') < 0: - sys.stderr.write("string ended without ']'\n") - return None - else: - return lines - else: - return None - -if __name__ == '__main__': - import bz2 - import sys - import hotshot, hotshot.stats - - def load(): - #f=bz2.BZ2File('data/msplicer_arabidopsis10_gc=1_orf=0.dat.bz2'); - f=file('data/msplicer_arabidopsis10_gc=1_orf=0.dat'); - m=parse_file(f); - - print m.penalty_acceptor_boundaries is None - print m.penalty_acceptor_penalty is None - print m.penalty_donor_boundaries is None - print m.penalty_donor_penalty is None - print m.penalty_coding_len_boundaries is None - print m.penalty_coding_len_penalty is None - print m.penalty_first_coding_len_boundaries is None - print m.penalty_first_coding_len_penalty is None - print m.penalty_last_coding_len_boundaries is None - print m.penalty_last_coding_len_penalty is None - print m.penalty_single_coding_len_boundaries is None - print m.penalty_single_coding_len_penalty is None - print m.penalty_intron_len_boundaries is None - print m.penalty_intron_len_penalty is None - print m.penalty_coding_boundaries is None - print m.penalty_coding_penalty is None - print m.penalty_coding2_boundaries is None - print m.penalty_coding2_penalty is None - print m.penalty_coding3_boundaries is None - print m.penalty_coding3_penalty is None - print m.penalty_coding4_boundaries is None - print m.penalty_coding4_penalty is None - print m.penalty_intron_boundaries is None - print m.penalty_intron_penalty is None - print m.penalty_intron2_boundaries is None - print m.penalty_intron2_penalty is None - print m.penalty_intron3_boundaries is None - print m.penalty_intron3_penalty is None - print m.penalty_intron4_boundaries is None - print m.penalty_intron4_penalty is None - print m.penalty_transitions_penalty is None - - print m.acc_splice_b is None - print m.acc_splice_order is None - print m.acc_splice_window_left is None - print m.acc_splice_window_right is None - print m.acc_splice_alphas is None - print m.acc_splice_svs is None - - print m.don_splice_b is None - print m.don_splice_order is None - print m.don_splice_use_gc is None - print m.don_splice_window_left is None - print m.don_splice_window_right is None - print m.don_splice_alphas is None - print m.don_splice_svs is None - - load() - - #prof = hotshot.Profile("model.prof") - #benchtime = prof.runcall(load) - #prof.close() - #stats = hotshot.stats.load("model.prof") - #stats.strip_dirs() - #stats.sort_stats('time', 'calls') - #stats.print_stats(20) diff --git a/applications/msplicer/msplicer b/applications/msplicer/msplicer deleted file mode 100755 index 03101521cb3..00000000000 --- a/applications/msplicer/msplicer +++ /dev/null @@ -1,355 +0,0 @@ -#!/usr/bin/env python -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# Written (W) 2007 Gunnar Raetsch -# Written (W) 2006-2008 Soeren Sonnenburg -# Copyright (C) 2006-2008 Fraunhofer Institute FIRST and Max-Planck-Society -# - -try: - import os - import os.path - import sys - import pickle - import bz2 - import numpy - import optparse - - import genomic - import model - import seqdict - import shogun - - d=shogun.DynProg() - if (d.version.get_version_revision() < 2997): - print - print "ERROR: SHOGUN VERSION 0.6.2 or later required" - print - sys.exit(1) - from content_sensors import content_sensors - from signal_detectors import signal_detectors - from plif import plif -except ImportError, e: - print e - print - print "ERROR IMPORTING MODULES, MAKE SURE YOU HAVE SHOGUN INSTALLED" - print - sys.exit(1) - - -msplicer_version='v0.3' - -class msplicer: - def __init__(self): - self.model = None - self.plif = None - self.signal = None - self.content = None - self.model_name = None - - def load_model(self, filename): - self.model_name = filename - sys.stderr.write('loading model file\n') - f=None - picklefile=filename+'.pickle' - if os.path.isfile(picklefile): - self.model=pickle.load(file(picklefile)) - else: - if filename.endswith('.bz2'): - f=bz2.BZ2File(filename); - else: - f=file(filename); - - self.model=model.parse_file(f) - f.close() - - f=file(picklefile,'w') - pickle.dump(self.model, f) - f.close() - - self.plif=plif(self.model) - self.signal=signal_detectors(self.model) - self.content=content_sensors(self.model) - - def compute_seqmatrix(self, seq): - # start-state: 0 - # exon-start-state: 1 - # donor-state: 2 - # acceptor-state: 3 - # exon-end-state: 4 - # stop-state: 5 - - start_idx = numpy.where(self.model.statedescr == 0)[0] - exon_start_idx = numpy.where(self.model.statedescr == 1)[0] - don_idx = numpy.where(self.model.statedescr == 2)[0] - acc_idx = numpy.where(self.model.statedescr == 3)[0] - exon_stop_idx = numpy.where(self.model.statedescr == 4)[0] - stop_idx = numpy.where(self.model.statedescr == 5)[0] - - # start positions - positions=[(0,0,start_idx)] - positions.append((seq.start,0,exon_start_idx)) - - # end positions - positions.append((seq.end, 0, exon_stop_idx[0])) - if len(exon_stop_idx)>1: - idx = numpy.where(numpy.array(seq.preds['acceptor'].positions,numpy.int32)==seq.end)[0] - if len(idx)==1: - positions.append((seq.end, seq.preds['acceptor'].scores[idx], exon_stop_idx[1])) - positions.append((len(seq.seq)-1,0,stop_idx)) - - # donor posititions - for i in don_idx: - positions.extend(zip(seq.preds['donor'].positions, - seq.preds['donor'].scores, - len(seq.preds['donor'].positions)*[i])) - - # acceptor positions - for i in acc_idx: - positions.extend(zip(seq.preds['acceptor'].positions, - list(seq.preds['acceptor'].scores), - len(seq.preds['acceptor'].positions)*[i])) - - positions.sort(cmp=lambda x,y : int(x[0]-y[0])) - unique_positions= numpy.unique(numpy.array([ x[0] for x in positions ], numpy.int32)) - - seqmatrix= -numpy.infty * numpy.ones((len(self.model.statedescr),len(unique_positions))) - for i in xrange(len(positions)): - p = numpy.where(positions[i][0]==unique_positions)[0] ; - assert(len(p)==1) - p = p[0] ; - seqmatrix[positions[i][2],p]=positions[i][1] - - if len(don_idx)>1: # orf case - for i in xrange(len(unique_positions)): - if seqmatrix[don_idx[0], i] > -1e20: - s1 = seq.seq[unique_positions[i]-1:unique_positions[i]+1] - s2 = seq.seq[unique_positions[i]-2:unique_positions[i]+1] - if s1 in ['TG']: seqmatrix[don_idx[1], i]=-numpy.infty - if s1 not in ['TG']: seqmatrix[don_idx[2], i]=-numpy.infty - if s2 in ['TAG', 'TGG']: seqmatrix[don_idx[3], i]=-numpy.infty - if s2 not in ['TAG']: seqmatrix[don_idx[4], i]=-numpy.infty - if s2 not in ['TGG']: seqmatrix[don_idx[5], i]=-numpy.infty - - if len(acc_idx)>1: # orf case - for i in xrange(len(unique_positions)): - if seqmatrix[acc_idx[0], i] > -1e20: - s1 = seq.seq[unique_positions[i]-1:unique_positions[i]+1] - s2 = seq.seq[unique_positions[i]-1:unique_positions[i]+2] - if s2 in ['GAA', 'GAG', 'GGA']: seqmatrix[acc_idx[2], i]=-numpy.infty - if s1 in ['GA', 'GG']: seqmatrix[acc_idx[4], i]=-numpy.infty - if s1 in ['GA']: seqmatrix[acc_idx[5], i]=-numpy.infty - - plifstatemat = -numpy.ones((len(self.model.statedescr),1), numpy.int32); - plifstatemat[acc_idx,0] = 0 ; # acceptors use first plif - plifstatemat[don_idx,0] = 1 ; # donors use second plif - - return (seqmatrix, unique_positions, plifstatemat) - - - def initialize_dynprog(self, seq): - dyn=shogun.DynProg() - - self.content.initialize_content(dyn) - - n=len(self.model.p) - dyn.set_num_states(n) - dyn.set_p_vector(self.model.p) - dyn.set_q_vector(self.model.q) - dyn.set_a_trans_matrix(self.model.a_trans) - - #design scoring seqmatrix - (seqmatrix, positions, plifstatemat) = self.compute_seqmatrix(seq) - - dyn.best_path_set_seq(seqmatrix) - dyn.best_path_set_pos(positions) - dyn.best_path_set_orf_info(self.model.orf_info) - - dyn.best_path_set_plif_list(self.plif.get_plif_array()) - - dyn.best_path_set_plif_id_matrix(self.model.plifidmat.T) - dyn.best_path_set_plif_state_signal_matrix(plifstatemat) - s=[]; s+=seq.seq; - dyn.best_path_set_single_genestr(numpy.array(s)) - dyn.best_path_set_dict_weights(self.content.get_dict_weights()) - - # self.precompute_content_svm_values(self, dyn, seq, positions) - - return (dyn,positions) - - #def precompute_content_svm_values(self, dyn, seq, positions): - # wordstr=dyn.create_word_string(seq, 1, len(seq)); - # dyn.init_content_svm_value_array(Npos) - # weights = self.content.get_dict_weights() - # #n = size(weights, 1) - # #m = size(weights, 2) - # dyn.precompute_content_values(wordstr, positions, len(positions), len(seq), self.content.get_dict_weights(), n*m); - # dyn.set_genestr_len(len(seq)); - # return (dyn) - - def write_gff(self, outfile, pred, name, score, skipheader): - descr=list() - for i in xrange(pred.shape[0]): - d=dict() - d['seqname']=name - d['source']='msplicer' - d['feature']='exon' - d['start']=pred[i,0]+1 - d['end']=pred[i,1] - d['score']=score - d['strand']='+' - d['frame']=0 - descr.append(d) - - genomic.write_gff(outfile, ('msplicer',msplicer_version + ' ' + self.model_name), - ('DNA', name), descr, skipheader) - - def predict_file(self, fname, (start,end)): - skipheader=False - fasta_dict = genomic.read_fasta(file(fname)) - sys.stderr.write('found fasta file with ' + `len(fasta_dict)` + ' sequence(s)\n') - seqs= seqdict.seqdict(fasta_dict, (start,end)) - - #get donor/acceptor signal predictions for all sequences - self.signal.predict_acceptor_sites_from_seqdict(seqs) - self.signal.predict_donor_sites_from_seqdict(seqs) - - for seq in seqs: - #initialize dynamic programming, with content sensors - #signal detectors, Plifs and HMM like model - (dyn,positions)=self.initialize_dynprog(seq) - - #compute max likely path - dyn.best_path_call(1, self.model.use_orf) - scores=dyn.best_path_get_scores() - states=dyn.best_path_get_states() - pos=dyn.best_path_get_positions() - pred_states=states[0][0:numpy.where(pos[0]==-1)[0]][1:-1] - pred=positions[pos[0][0:numpy.where(pos[0]==-1)[0]][1:-1]] - #print scores - #print pred_states - #print pred - #print len(pred_states) - if (len(pred_states)>0): - if (pred_states[-1]==15): # joint state for acceptor and stop codon - pred_ = numpy.zeros(len(pred)+1, numpy.int32) ; - pred_[0:len(pred)] = pred ; - pred_[-1] = pred[-1] - pred = pred_ - - pred=pred.reshape((len(pred)/2,2)) - self.write_gff(outfile, pred, seq.name, scores, skipheader) - skipheader=True - - if 0: - my_posi = numpy.array([ 1, 400, 408, 451, 1188, 1785, 1858, 2732, 2924, 3869, 3948, 4348 ], numpy.int32)-1 ; - my_pos = numpy.zeros(len(my_posi), numpy.int32) ; - print positions, my_posi - for i in xrange(len(my_posi)): - my_pos[i] = numpy.where(positions == my_posi[i])[0] - - my_states = numpy.array([0, 13, 6, 12, 2, 8, 4, 10, 4, 10, 14, 16], numpy.int32) - #my_pos = numpy.array([ 0, 51, 169, 204, 216, 241, 300, 355, 360, 397], numpy.int32) ; - #my_states = numpy.array([0, 3, 1, 2, 1, 2, 1, 2, 4, 5], numpy.int32) - - my_states = states[0][0:numpy.where(pos[0]==-1)[0]] - my_pos = pos[0][0:numpy.where(pos[0]==-1)[0]] - - print my_states - print my_pos - print positions[my_pos] - - dyn.best_path_set_my_state_seq(my_states) - dyn.best_path_set_my_pos_seq(my_pos) - - dyn.io.set_loglevel(shogun.M_DEBUG) - dyn.best_path_deriv_call() - -def print_version(): - sys.stderr.write('mSplicer '+msplicer_version+'\n') - -def parse_options(): - parser = optparse.OptionParser(usage="usage: %prog [options] seq.fa") - - parser.add_option("-o", "--outfile", type="str", default='stdout', - help="File to write the results to") - parser.add_option("-v", "--version", default=False, - help="Show some more information") - parser.add_option("--start", type="int", default=499, - help="coding start (zero based, relative to sequence start)") - parser.add_option("--stop", type="int", default=-499, - help="""coding stop (zero based, if positive relative to - sequence start, if negative relative to sequence end)""") - parser.add_option("--model", type="str", default='WS160', - help="mSplicer Model to use in predicting") - - (options, args) = parser.parse_args() - if options.version: - print_version() - sys.exit(0) - - if len(args) != 1: - parser.error("incorrect number of arguments") - - fafname=args[0] - if not os.path.isfile(fafname): - parser.error("fasta file does not exist") - - if options.model.endswith('gc'): - gc=1 - model=options.model[:-2] - else: - gc=0 - model=options.model - - if model.startswith('orf'): - orf=1 - model=model[3:] - else: - orf=0 - - modelfname = 'data/msplicer_elegans%s_gc=%d_orf=%d.dat.bz2' % (model, gc, orf) - print "loading model file " + modelfname, - - if not os.path.isfile(modelfname): - print "...not found!\n" - parser.error("""model should be one of: - -WS120, WS120gc, orfWS120, WS150, -WS160, WS160gc, orfWS160gc -""") - - if options.outfile == 'stdout': - outfile=sys.stdout - else: - try: - outfile=file(options.outfile,'w') - except IOError: - parser.error("could not open %s for writing" % options.outfile) - - if options.start<80: - parser.error("--start value must be >=80") - - if options.stop > 0 and options.start >= options.stop - 80: - parser.error("--stop value must be > start + 80") - - if options.stop < 0 and options.stop > -80: - parser.error("--stop value must be <= - 80") - - # shift the start and stop a bit - options.start -= 1 ; - options.stop -= 1 ; - - return ((options.start,options.stop), fafname, modelfname, outfile) - - -if __name__ == '__main__': - dyn=shogun.DynProg() - (startstop, fafname, modelfname, outfile ) = parse_options() - p=msplicer() - p.load_model(modelfname); - p.predict_file(fafname, startstop) diff --git a/applications/msplicer/plif.py b/applications/msplicer/plif.py deleted file mode 100644 index b1abe8c6417..00000000000 --- a/applications/msplicer/plif.py +++ /dev/null @@ -1,224 +0,0 @@ -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# Written (W) 2006-2007 Soeren Sonnenburg -# Written (W) 2007 Gunnar Raetsch -# Copyright (C) 2007-2008 Fraunhofer Institute FIRST and Max-Planck-Society -# - -from numpy import array -from shogun import Plif -from shogun import PlifArray -from shogun import DynamicPlifArray - -class plif: - def __init__(self, model): - min_exon_len=2 - min_intron_len=30 - max_len=22222; - #extract plifs from model - l=array(model.penalty_acceptor_boundaries).flatten() - p=array(model.penalty_acceptor_penalty).flatten() - self.acceptor=Plif(len(l)) - self.acceptor.set_plif_limits(l) - self.acceptor.set_plif_penalty(p) - self.acceptor.set_min_value(-1e+20) - self.acceptor.set_max_value(1e+20) - self.acceptor.set_plif_name("acceptor") - - l=array(model.penalty_donor_boundaries).flatten() - p=array(model.penalty_donor_penalty).flatten() - self.donor=Plif(len(l)) - self.donor.set_plif_limits(l) - self.donor.set_plif_penalty(p) - self.donor.set_min_value(-1e+20) - self.donor.set_max_value(1e+20) - self.donor.set_plif_name("donor") - - l=array(model.penalty_coding_len_boundaries).flatten() - p=array(model.penalty_coding_len_penalty).flatten() - self.coding_len=Plif(len(l)) - self.coding_len.set_plif_limits(l) - self.coding_len.set_plif_penalty(p) - self.coding_len.set_min_value(min_exon_len) - self.coding_len.set_max_value(max_len) - self.coding_len.set_plif_name('coding_len') - self.coding_len.set_transform_type("log(+1)") - - l=array(model.penalty_first_coding_len_boundaries).flatten() - p=array(model.penalty_first_coding_len_penalty).flatten() - self.first_coding_len=Plif(len(l)) - self.first_coding_len.set_plif_limits(l) - self.first_coding_len.set_plif_penalty(p) - self.first_coding_len.set_min_value(min_exon_len) - self.first_coding_len.set_max_value(max_len) - self.first_coding_len.set_plif_name("first_coding_len") - self.first_coding_len.set_transform_type("log(+1)") - - l=array(model.penalty_last_coding_len_boundaries).flatten() - p=array(model.penalty_last_coding_len_penalty).flatten() - self.last_coding_len=Plif(len(l)) - self.last_coding_len.set_plif_limits(l) - self.last_coding_len.set_plif_penalty(p) - self.last_coding_len.set_min_value(min_exon_len) - self.last_coding_len.set_max_value(max_len) - self.last_coding_len.set_plif_name('last_coding_len') - self.last_coding_len.set_transform_type("log(+1)") - - l=array(model.penalty_single_coding_len_boundaries).flatten() - p=array(model.penalty_single_coding_len_penalty).flatten() - self.single_coding_len=Plif(len(l)) - self.single_coding_len.set_plif_limits(l) - self.single_coding_len.set_plif_penalty(p) - self.single_coding_len.set_min_value(min_exon_len) - self.single_coding_len.set_max_value(max_len) - self.single_coding_len.set_plif_name('single_coding_len') - self.single_coding_len.set_transform_type("log(+1)") - - l=array(model.penalty_intron_len_boundaries).flatten() - p=array(model.penalty_intron_len_penalty).flatten() - self.intron_len=Plif(len(l)) - self.intron_len.set_plif_limits(l) - self.intron_len.set_plif_penalty(p) - self.intron_len.set_min_value(min_intron_len) - self.intron_len.set_max_value(max_len) - self.intron_len.set_plif_name('intron_len') - self.intron_len.set_transform_type("log(+1)") - - l=array(model.penalty_coding_boundaries).flatten() - p=array(model.penalty_coding_penalty).flatten() - self.coding=Plif(len(l)) - self.coding.set_use_svm(1) - self.coding.set_plif_limits(l) - self.coding.set_plif_penalty(p) - self.coding.set_min_value(-1e+20) - self.coding.set_max_value(1e+20) - self.coding.set_plif_name('coding') - - l=array(model.penalty_coding2_boundaries).flatten() - p=array(model.penalty_coding2_penalty).flatten() - self.coding2=Plif(len(l)) - self.coding2.set_use_svm(3) - self.coding2.set_plif_limits(l) - self.coding2.set_plif_penalty(p) - self.coding2.set_min_value(-1e+20) - self.coding2.set_max_value(1e+20) - self.coding2.set_plif_name('coding2') - - l=array(model.penalty_coding3_boundaries).flatten() - p=array(model.penalty_coding3_penalty).flatten() - self.coding3=Plif(len(l)) - self.coding3.set_use_svm(5) - self.coding3.set_plif_limits(l) - self.coding3.set_plif_penalty(p) - self.coding3.set_min_value(-1e+20) - self.coding3.set_max_value(1e+20) - self.coding3.set_plif_name('coding3') - - l=array(model.penalty_coding4_boundaries).flatten() - p=array(model.penalty_coding4_penalty).flatten() - self.coding4=Plif(len(l)) - self.coding4.set_use_svm(7) - self.coding4.set_plif_limits(l) - self.coding4.set_plif_penalty(p) - self.coding4.set_min_value(-1e+20) - self.coding4.set_max_value(1e+20) - self.coding4.set_plif_name('coding4') - - l=array(model.penalty_intron_boundaries).flatten() - p=array(model.penalty_intron_penalty).flatten() - self.intron=Plif(len(l)) - self.intron.set_use_svm(2) - self.intron.set_plif_limits(l) - self.intron.set_plif_penalty(p) - self.intron.set_min_value(-1e+20) - self.intron.set_max_value(1e+20) - self.intron.set_plif_name('intron') - - l=array(model.penalty_intron2_boundaries).flatten() - p=array(model.penalty_intron2_penalty).flatten() - self.intron2=Plif(len(l)) - self.intron2.set_use_svm(4) - self.intron2.set_plif_limits(l) - self.intron2.set_plif_penalty(p) - self.intron2.set_min_value(-1e+20) - self.intron2.set_max_value(1e+20) - self.intron2.set_plif_name('intron2') - - l=array(model.penalty_intron3_boundaries).flatten() - p=array(model.penalty_intron3_penalty).flatten() - self.intron3=Plif(len(l)) - self.intron3.set_use_svm(6) - self.intron3.set_plif_limits(l) - self.intron3.set_plif_penalty(p) - self.intron3.set_min_value(-1e+20) - self.intron3.set_max_value(1e+20) - self.intron3.set_plif_name('intron3') - - l=array(model.penalty_intron4_boundaries).flatten() - p=array(model.penalty_intron4_penalty).flatten() - self.intron4=Plif(len(l)) - self.intron4.set_use_svm(8) - self.intron4.set_plif_limits(l) - self.intron4.set_plif_penalty(p) - self.intron4.set_min_value(-1e+20) - self.intron4.set_max_value(1e+20) - self.intron4.set_plif_name('intron4') - - p=array(model.penalty_transitions_penalty).flatten() - self.transitions=Plif(len(p)) - self.transitions.set_plif_penalty(p) - self.transitions.set_min_value(-1e+20) - self.transitions.set_max_value(1e+20) - - #create magic plifarrays - self.first_coding_plif_array=PlifArray() - self.first_coding_plif_array.add_plif(self.first_coding_len) - self.first_coding_plif_array.add_plif(self.coding) - self.first_coding_plif_array.add_plif(self.coding2) - self.first_coding_plif_array.add_plif(self.coding3) - self.first_coding_plif_array.add_plif(self.coding4) - - self.last_coding_plif_array=PlifArray() - self.last_coding_plif_array.add_plif(self.last_coding_len) - self.last_coding_plif_array.add_plif(self.coding) - self.last_coding_plif_array.add_plif(self.coding2) - self.last_coding_plif_array.add_plif(self.coding3) - self.last_coding_plif_array.add_plif(self.coding4) - - self.coding_plif_array=PlifArray() - self.coding_plif_array.add_plif(self.coding_len) - self.coding_plif_array.add_plif(self.coding) - self.coding_plif_array.add_plif(self.coding2) - self.coding_plif_array.add_plif(self.coding3) - self.coding_plif_array.add_plif(self.coding4) - - self.single_coding_plif_array=PlifArray() - self.single_coding_plif_array.add_plif(self.single_coding_len) - self.single_coding_plif_array.add_plif(self.coding) - self.single_coding_plif_array.add_plif(self.coding2) - self.single_coding_plif_array.add_plif(self.coding3) - self.single_coding_plif_array.add_plif(self.coding4) - - self.intron_plif_array=PlifArray() - self.intron_plif_array.add_plif(self.intron_len) - self.intron_plif_array.add_plif(self.intron) - self.intron_plif_array.add_plif(self.intron2) - self.intron_plif_array.add_plif(self.intron3) - self.intron_plif_array.add_plif(self.intron4) - - #finally create a single array with all the plifs - self.plif_array=DynamicPlifArray() - self.plif_array.append_element(self.acceptor) - self.plif_array.append_element(self.donor) - self.plif_array.append_element(self.first_coding_plif_array) - self.plif_array.append_element(self.last_coding_plif_array) - self.plif_array.append_element(self.coding_plif_array) - self.plif_array.append_element(self.single_coding_plif_array) - self.plif_array.append_element(self.intron_plif_array) - - def get_plif_array(self): - return self.plif_array diff --git a/applications/msplicer/seqdict.py b/applications/msplicer/seqdict.py deleted file mode 100644 index 18fe75ca469..00000000000 --- a/applications/msplicer/seqdict.py +++ /dev/null @@ -1,68 +0,0 @@ -import string - -class predictions(object): - def __init__(self, positions=None, scores=None): - self.positions=positions - self.scores=scores - - def set_positions(self, positions): - self.positions=positions; - def get_positions(self): - return self.positions - - def set_scores(self, scores): - self.scores=scores - def get_scores(self): - return self.scores - - def __str__(self): - return 'positions: ' + `self.positions` + 'scores: ' + `self.scores` - def __repr__(self): - return self.__str__() - -class sequence(object): - def __init__(self, name, seq, (start,end)): - assert(start-1: - if lself.window_left: - positions.append(l+self.offset) - l=sequence.find(cons, l+1) - - positions.sort() - return positions - - def get_predictions_from_seqdict(self, seqdic, site): - """ we need to generate a huge test features object - containing all locations found in each seqdict-sequence - and each location (this is necessary to efficiently - (==fast,low memory) compute the splice outputs - """ - - seqlen=self.window_right+self.window_left+2 - - num=0 - for s in seqdic: - num+= len(s.preds[site].positions) - - testdat = [] - - for s in seqdic: - sequence=s.seq - positions=s.preds[site].positions - for j in xrange(len(positions)): - i=positions[j] - self.offset - s=sequence[i-self.window_left:i+self.window_right+2] - testdat.append(s) - - t=StringCharFeatures(testdat, DNA) - - self.wd_kernel.init(self.traindat, t) - self.svm.set_kernel(self.wd_kernel) - l=self.svm.apply().get_labels() - sys.stderr.write("\n...done...\n") - - k=0 - for s in seqdic: - num=len(s.preds[site].positions) - scores= num * [0] - for j in xrange(num): - scores[j]=l[k] - k+=1 - s.preds[site].set_scores(scores) - - def get_positions_from_seqdict(self, seqdic, site): - for d in seqdic: - positions=list() - sequence=d.seq - for cons in self.consensus: - l=sequence.find(cons) - while l>-1: - if lself.window_left: - positions.append(l+self.offset) - l=sequence.find(cons, l+1) - positions.sort() - d.preds[site].set_positions(positions) - - def get_predictions(self, sequence, positions): - - seqlen=self.window_right+self.window_left+2 - num=len(positions) - - testdat = [] - - for j in xrange(num): - i=positions[j] - self.offset ; - s=sequence[i-self.window_left:i+self.window_right+2] - testdat.append(s) - - t=StringCharFeatures(DNA) - t.set_string_features(testdat) - - self.wd_kernel.init(self.traindat, t) - l=self.svm.classify().get_labels() - sys.stderr.write("\n...done...\n") - return l - -class signal_detectors(object): - def __init__(self, model): - if model.don_splice_use_gc: - don_consensus=['GC','GT'] - else: - don_consensus=['GT'] - - self.acceptor=svm_splice_model(model.acc_splice_order, model.acc_splice_svs, - numpy.array(model.acc_splice_alphas).flatten(), model.acc_splice_b, - (model.acc_splice_window_left, 2, model.acc_splice_window_right), ['AG']) - self.donor=svm_splice_model(model.don_splice_order, model.don_splice_svs, - numpy.array(model.don_splice_alphas).flatten(), model.don_splice_b, - (model.don_splice_window_left, 0, model.don_splice_window_right), - don_consensus) - - def set_sequence(self, seq): - self.acceptor.set_sequence(seq) - self.donor.set_sequence(seq) - - def predict_acceptor_sites(self, seq): - pos=self.acceptor.get_positions(seq) - sys.stderr.write("computing svm output for acceptor positions\n") - pred=self.acceptor.get_predictions(seq, pos) - return (pos,pred) - - def predict_donor_sites(self,seq): - pos=self.donor.get_positions(seq) - sys.stderr.write("computing svm output for donor positions\n") - pred=self.donor.get_predictions(seq, pos) - return (pos,pred) - - def predict_acceptor_sites_from_seqdict(self, seqs): - self.acceptor.get_positions_from_seqdict(seqs, 'acceptor') - sys.stderr.write("computing svm output for acceptor positions\n") - self.acceptor.get_predictions_from_seqdict(seqs, 'acceptor') - - def predict_donor_sites_from_seqdict(self, seqs): - self.donor.get_positions_from_seqdict(seqs, 'donor') - sys.stderr.write("computing svm output for donor positions\n") - self.donor.get_predictions_from_seqdict(seqs, 'donor') diff --git a/applications/ocr/Ai.py b/applications/ocr/Ai.py deleted file mode 100644 index f8a3284ae52..00000000000 --- a/applications/ocr/Ai.py +++ /dev/null @@ -1,92 +0,0 @@ -# File : $HeadURL$ -# Version: $Id$ - -from shogun import RealFeatures, MulticlassLabels -from shogun import GaussianKernel -from shogun import GMNPSVM - -import numpy as np -import gzip as gz -import pickle as pkl - -import common as com - -class Ai: - def __init__(self): - self.x = None - self.y = None - - self.x_test = None - self.y_test = None - - self.svm = None - - def load_train_data(self, x_fname, y_fname): - Ai.__init__(self) - - self.x = np.loadtxt(x_fname) - self.y = np.loadtxt(y_fname) - 1.0 - - self.x_test = self.x - self.y_test = self.y - - def _svm_new(self, kernel_width, c, epsilon): - if self.x == None or self.y == None: - raise Exception("No training data loaded.") - - x = RealFeatures(self.x) - y = MulticlassLabels(self.y) - - self.svm = GMNPSVM(c, GaussianKernel(x, x, kernel_width), y) - self.svm.set_epsilon(epsilon) - - def write_svm(self): - gz_stream = gz.open(com.TRAIN_SVM_FNAME_GZ, 'wb', 9) - pkl.dump(self.svm, gz_stream) - gz_stream.close() - - def read_svm(self): - gz_stream = gz.open(com.TRAIN_SVM_FNAME_GZ, 'rb') - self.svm = pkl.load(gz_stream) - gz_stream.close() - - def enable_validation(self, train_frac): - x = self.x - y = self.y - - idx = np.arange(len(y)) - np.random.shuffle(idx) - train_idx=idx[:np.floor(train_frac*len(y))] - test_idx=idx[np.ceil(train_frac*len(y)):] - - self.x = x[:,train_idx] - self.y = y[train_idx] - self.x_test = x[:,test_idx] - self.y_test = y[test_idx] - - def train(self, kernel_width, c, epsilon): - self._svm_new(kernel_width, c, epsilon) - - x = RealFeatures(self.x) - self.svm.io.enable_progress() - self.svm.train(x) - self.svm.io.disable_progress() - - def load_classifier(self): self.read_svm() - - def classify(self, matrix): - cl = self.svm.apply( - RealFeatures( - np.reshape(matrix, newshape=(com.FEATURE_DIM, 1), - order='F') - ) - ).get_label(0) - - return int(cl + 1.0) % 10 - - def get_test_error(self): - self.svm.io.enable_progress() - l = self.svm.apply(RealFeatures(self.x_test)).get_labels() - self.svm.io.disable_progress() - - return 1.0 - np.mean(l == self.y_test) diff --git a/applications/ocr/FigureWidget.py b/applications/ocr/FigureWidget.py deleted file mode 100644 index c90f59bd74b..00000000000 --- a/applications/ocr/FigureWidget.py +++ /dev/null @@ -1,147 +0,0 @@ -# File : $HeadURL$ -# Version: $Id$ - -import gtk -import numpy as np - -import common as com -from QuadrWidget import QuadrWidget - -class FigureWidget(QuadrWidget): - THICKNESS_FRAC = 0.03 - - def __init__(self, go_func, go_args): - QuadrWidget.__init__(self) - self.add_events(gtk.gdk.ALL_EVENTS_MASK) - - self.drag = False - self.coords = [] - - self.go_func = go_func - self.go_args = go_args - - self.connect("expose_event", FigureWidget.on_redraw) - self.connect("button-press-event", FigureWidget.on_press) - self.connect("button-release-event", FigureWidget.on_release) - self.connect("motion-notify-event", FigureWidget.on_motion) - self.connect("realize", FigureWidget.on_realize) - - def on_realize(self): - self.window.set_cursor(gtk.gdk.Cursor(gtk.gdk.CROSSHAIR)) - - def on_press(self, event): - if event.button == com.BUTTON_RIGHT: - self.clear_coords() - self.drag = False - - if event.button == com.BUTTON_MIDDLE: - self.go_func(*self.go_args) - self.drag = False - - if event.button != com.BUTTON_LEFT: - return False - - self.drag = True - self.coords.insert(0, []) - - # Two times to add a zero-length line - self.on_motion(event) - self.on_motion(event) - - return False - - def on_release(self, event): - if event.button != com.BUTTON_LEFT: - return False - - self.drag = False - return False - - def on_motion(self, event): - if not self.drag: - return False - - width = self.window.get_size()[0] - height = self.window.get_size()[1] - x = event.x/width if event.x < width else com.NEAR_ONE_NEG - y = event.y/height if event.y < height else com.NEAR_ONE_NEG - x = 0 if x < 0 else x - y = 0 if y < 0 else y - - self.coords[0].append((x, y)) - - self.update() - - return False - - def on_redraw(self, event): - gc = self.style.fg_gc[self.state] - w = self.window - width = w.get_size()[0] - height = w.get_size()[1] - - # Backup graphic context - self.default_fg = gc.foreground - self.line_width = gc.line_width - self.line_style = gc.line_style - self.cap_style = gc.cap_style - self.join_style = gc.join_style - - # Background - gc.set_rgb_fg_color(com.COLOR_WHITE) - w.draw_rectangle(gc, True, 0, 0, width-1, height-1) - gc.set_rgb_fg_color(com.COLOR_BLACK) - w.draw_rectangle(gc, False, 0, 0, width-1, height-1) - - # Data - gc.set_line_attributes(int(height*self.THICKNESS_FRAC), - gtk.gdk.LINE_SOLID, gtk.gdk.CAP_ROUND, - gtk.gdk.JOIN_ROUND) - for poly in self.coords: - w.draw_lines(gc, map( - lambda coord: - (int(coord[0]*width), int(coord[1]*height)), - poly)) - - # Recovering graphic context - gc.line_width = self.line_width - gc.line_style = self.line_style - gc.cap_style = self.cap_style - gc.join_style = self.join_style - gc.foreground = self.default_fg - - return False - - def get_coords(self): - result = map(lambda line: np.array(line), self.coords) - - result = map(lambda line: np.transpose(line), result) - - minx = 2.0 - miny = 2.0 - for line in result: - minx = min(minx, min(line[0])) - miny = min(miny, min(line[1])) - for line in result: - line[0] -= minx - line[1] -= miny - - maxxy = 0.0 - for line in result: maxxy = max(maxxy, line.max()) - for line in result: line /= maxxy + com.NEAR_ZERO_POS - - maxx = 0.0 - maxy = 0.0 - for line in result: - maxx = max(maxx, max(line[0])) - maxy = max(maxy, max(line[1])) - for line in result: - line[0] += (1 - maxx)/2 - line[1] += (1 - maxy)/2 - - result = map(lambda line: np.transpose(line), result) - return result - - def clear_coords(self): - self.coords = [] - self.update() diff --git a/applications/ocr/MatrixWidget.py b/applications/ocr/MatrixWidget.py deleted file mode 100644 index b4198f646d8..00000000000 --- a/applications/ocr/MatrixWidget.py +++ /dev/null @@ -1,66 +0,0 @@ -# File : $HeadURL$ -# Version: $Id$ - -import gtk -import numpy as np - -import common as com -from QuadrWidget import QuadrWidget - -class MatrixWidget(QuadrWidget): - def __init__(self, matrix_size): - QuadrWidget.__init__(self) - - self.matrix = np.zeros((matrix_size, matrix_size), - dtype=np.bool) - - self.connect("expose_event", MatrixWidget.on_redraw) - - def on_redraw(self, event): - gc = self.style.fg_gc[self.state] - w = self.window - width = w.get_size()[0] - height = w.get_size()[1] - - # Backup graphic context - self.default_fg = gc.foreground - - # Background - gc.set_rgb_fg_color(com.COLOR_WHITE) - w.draw_rectangle(gc, True, 0, 0, width-1, height-1) - - size_y = self.matrix.shape[0] - size_x = self.matrix.shape[1] - pixels_per_y = float(height)/size_y - pixels_per_x = float(width)/size_x - gc.set_rgb_fg_color(com.COLOR_GRAY) - for y in range(size_y): - w.draw_line(gc, 0, int(y*pixels_per_y), - width-1, int(y*pixels_per_y)) - for x in range(size_x): - if y == 0: - w.draw_line(gc, int(x*pixels_per_x), 0, - int(x*pixels_per_x), height-1) - if self.matrix[y, x]: - gc.set_rgb_fg_color(com.COLOR_BLACK) - w.draw_rectangle(gc, self.matrix[y, x] - > com.NEAR_ZERO_POS, - int(x*pixels_per_x), - int(y*pixels_per_y), - int(pixels_per_x+1), - int(pixels_per_y+1)) - gc.set_rgb_fg_color(com.COLOR_GRAY) - - gc.set_rgb_fg_color(com.COLOR_BLACK) - w.draw_rectangle(gc, False, 0, 0, width-1, height-1) - - gc.foreground = self.default_fg - - return False - - def set_image(self, image): - self.matrix = image - self.update() - - def get_image(self): - return self.matrix diff --git a/applications/ocr/QuadrWidget.py b/applications/ocr/QuadrWidget.py deleted file mode 100644 index 41ccceeb6c2..00000000000 --- a/applications/ocr/QuadrWidget.py +++ /dev/null @@ -1,30 +0,0 @@ -# File : $HeadURL$ -# Version: $Id$ - -import gtk - -class QuadrWidget(gtk.DrawingArea): - - # No SELF.CONNECT because we like to prevent the call of - # gtk.DrawingArea.do_size_allocate() - __gsignals__ = {"size-allocate": 'override'} - - def __init__(self): - gtk.DrawingArea.__init__(self) - - def update(self): - width = self.window.get_size()[0] - height = self.window.get_size()[1] - self.window.invalidate_rect(gtk.gdk.Rectangle( - 0, 0, width, height), False) - #self.window.process_updates(False) - - def do_size_allocate(self, allocation): - if allocation.width < allocation.height: - allocation.y += (allocation.height-allocation.width)/2 - allocation.height = allocation.width - elif allocation.width > allocation.height: - allocation.x += (allocation.width-allocation.height)/2 - allocation.width = allocation.height - - gtk.DrawingArea.do_size_allocate(self, allocation) diff --git a/applications/ocr/README b/applications/ocr/README deleted file mode 100644 index 818e073f7d4..00000000000 --- a/applications/ocr/README +++ /dev/null @@ -1,9 +0,0 @@ -This example illustrates how to do ocr of handwritten digits. - -./predict - Starts a GUI where one can draw digits with the mouse. - Pressing the classify button will detect the drawn digit - (so does pressing the middle mouse button). Pressing the right - mouse button will clear the window. - -./train - Does model selection using a SVM with a Gaussian kernel and saves the - 'best' model. diff --git a/applications/ocr/common.py b/applications/ocr/common.py deleted file mode 100644 index c28e9bdb409..00000000000 --- a/applications/ocr/common.py +++ /dev/null @@ -1,31 +0,0 @@ -# File : $HeadURL$ -# Version: $Id$ - -import gtk - -UINT16_MAX = 0xffff -UINT_GRAY = UINT16_MAX - UINT16_MAX/4 -COLOR_BLACK = gtk.gdk.Color(0, 0, 0) -COLOR_WHITE = gtk.gdk.Color(UINT16_MAX, UINT16_MAX, UINT16_MAX) -COLOR_GRAY = gtk.gdk.Color(UINT_GRAY, UINT_GRAY, UINT_GRAY) -COLOR_BLUE = gtk.gdk.Color(UINT_GRAY, UINT_GRAY, UINT16_MAX) - -BUTTON_LEFT = 1 -BUTTON_MIDDLE = 2 -BUTTON_RIGHT = 3 - -NEAR_ZERO_POS = 1e-8 -NEAR_ONE_NEG = 1-NEAR_ZERO_POS - -TRAIN_X_FNAME = "data/train_data_x.asc.gz" -TRAIN_Y_FNAME = "data/train_data_y.asc.gz" - -TRAIN_SVM_FNAME_GZ = "data/ocr.svm.gz" - -MATIX_IMAGE_SIZE = 16 -FEATURE_DIM = MATIX_IMAGE_SIZE * MATIX_IMAGE_SIZE - -HISTORY_WIDTH = 5 -HISTORY_HEIGHT = 2 - -FEATURE_RANGE_MAX = 1.0 diff --git a/applications/ocr/data b/applications/ocr/data deleted file mode 120000 index ee220ad1a1b..00000000000 --- a/applications/ocr/data +++ /dev/null @@ -1 +0,0 @@ -../../data/ocr \ No newline at end of file diff --git a/applications/ocr/predict b/applications/ocr/predict deleted file mode 100755 index 43a5eae20b2..00000000000 --- a/applications/ocr/predict +++ /dev/null @@ -1,203 +0,0 @@ -#!/usr/bin/env python - -# File : $HeadURL$ -# Version: $Id$ - -import gtk, sys -import numpy as np - -from FigureWidget import FigureWidget -from MatrixWidget import MatrixWidget -from Ai import Ai - -import common as com - -def _draw_line(image, start, end): - start = np.array(start, dtype=np.int) - end = np.array(end, dtype=np.int) - - delta = abs(end - start) - - e = delta[0]/2.0 - x, y = start - image[y, x] = com.FEATURE_RANGE_MAX - while np.any((x, y) != end): - if e < 0.0 or x == end[0]: - y += -1 if start[1] > end[1] else 1 - e += delta[0] - if e >= 0.0 and x != end[0]: - x += -1 if start[0] > end[0] else 1 - e -= delta[1] - image[y, x] = com.FEATURE_RANGE_MAX - -def button_go_clicked(button, main_window): - coords = map(lambda line: com.MATIX_IMAGE_SIZE*line, - main_window.figure.get_coords()) - image = np.zeros((com.MATIX_IMAGE_SIZE, com.MATIX_IMAGE_SIZE), - dtype=np.float) - - for line in coords: - for i in range(line.shape[0]-1): - _draw_line(image, line[i], line[i+1]) - - main_window.push_image(image, - str(main_window.ai.classify(image)) - ) - - main_window.figure.clear_coords() - main_window.set_focus(main_window.button_go) - -def button_clear_clicked(button, main_window): - main_window.figure.clear_coords() - main_window.set_focus(main_window.button_go) - -class MainWindow(gtk.Window): - TITLE = "OCR Demo - Press middle mouse button to classify, right" \ - " mouse button to clear" - - MIN_WIDTH = 800 - MIN_HEIGHT = 260 - - MAIN_PADDING = 4 - BOX_PADDING = 4 - - MARKUP_PRE = '' - MARKUP_POST = '' - - def __init__(self): - # Main Window - gtk.Window.__init__(self, type=gtk.WINDOW_TOPLEVEL) - self.set_title(self.TITLE) - self.connect("delete-event", MainWindow.on_delete) - self.set_size_request(self.MIN_WIDTH, self.MIN_HEIGHT) - - # AI - self.ai = Ai() - self.ai.load_classifier() - - # Main Container - self.main_align = gtk.Alignment(xalign=0.0, yalign=0.0, - xscale=1.0, yscale=1.0) - self.main_align.set_padding(self.MAIN_PADDING, - self.MAIN_PADDING, - self.MAIN_PADDING, - self.MAIN_PADDING) - self.add(self.main_align) - - # Figure HBox - self.figure_hbox = gtk.HBox(homogeneous=False, - spacing=self.BOX_PADDING) - self.main_align.add(self.figure_hbox) - - # Main VBox - self.main_vbox = gtk.VBox(homogeneous=False, - spacing=self.BOX_PADDING) - self.figure_hbox.add(self.main_vbox) - - - # Figure - self.figure = FigureWidget(button_go_clicked, (None, self)) - self.main_vbox.pack_start(self.figure, expand=True, fill=True, - padding=0) - - # VSeperator - self.figure_vsep = gtk.VSeparator() - self.figure_hbox.pack_start(self.figure_vsep, expand=False, - fill=True, padding=0) - - # History - self.mat_table = gtk.Table(rows=com.HISTORY_HEIGHT, - columns=com.HISTORY_WIDTH, - homogeneous=True) - self.figure_hbox.pack_start(self.mat_table, expand=True, - fill=True, padding=0) - - self.mat_frame = [] - self.mat_vbox = [] - self.mat_images = [] - self.mat_result = [] - for y in range(com.HISTORY_HEIGHT): - for x in range(com.HISTORY_WIDTH): - i = y*com.HISTORY_WIDTH + x - - self.mat_frame.append(gtk.Frame( - "History " + str(y*com.HISTORY_WIDTH + x))) - if i == 0: - self.mat_frame[i].set_label("Current") - mf_style = self.mat_frame[i].get_style() - mf_style.bg[gtk.STATE_NORMAL] = com.COLOR_BLUE - self.mat_frame[i].set_style(mf_style) - self.mat_table.attach(self.mat_frame[i], - left_attach=x, right_attach=x+1, - top_attach=y, bottom_attach=y+1, - xpadding=self.BOX_PADDING) - - self.mat_vbox.append(gtk.VBox(homogeneous=False, - spacing=self.BOX_PADDING) - ) - self.mat_frame[i].add(self.mat_vbox[i]) - - self.mat_result.append(gtk.Label( - self.MARKUP_PRE + "?" + self.MARKUP_POST)) - self.mat_result[i].set_use_markup(True) - self.mat_vbox[i].pack_start( - self.mat_result[i], expand=False, fill=True, - padding=0) - - self.mat_images.append(MatrixWidget( - com.MATIX_IMAGE_SIZE)) - self.mat_vbox[i].pack_start( - self.mat_images[i], expand=True, fill=True, - padding=self.BOX_PADDING) - - # HSeperator - self.main_hsep = gtk.HSeparator() - self.main_vbox.pack_start(self.main_hsep, expand=False, - fill=True, padding=0) - - # HBox - self.hbox = gtk.HBox(homogeneous=False, - spacing=self.BOX_PADDING) - self.main_vbox.pack_end(self.hbox, expand=False, fill=True, - padding=0) - - # Button Go - self.button_go = gtk.Button(label="_Classify") - self.button_go.connect("clicked", button_go_clicked, self) - self.button_go.set_focus_on_click(False) - self.hbox.add(self.button_go) - - # Button clear - self.button_clear = gtk.Button(label="Clea_r") - self.button_clear.set_focus_on_click(False) - self.button_clear.connect("clicked", button_clear_clicked, - self) - self.hbox.add(self.button_clear) - - def on_delete(self, event): - gtk.Window.destroy(self) - gtk.main_quit() - return True - - def push_image(self, image, str): - prev_image = image - prev_str = str - - for i in range(com.HISTORY_WIDTH*com.HISTORY_HEIGHT): - tmp_image = self.mat_images[i].get_image() - tmp_str = self.mat_result[i].get_text() - self.mat_images[i].set_image(prev_image) - self.mat_result[i].set_markup( - self.MARKUP_PRE + prev_str + self.MARKUP_POST) - prev_image = tmp_image - prev_str = tmp_str - -def main(argv): - gtk.gdk.threads_init() - - window = MainWindow() - window.show_all() - gtk.main() - -if __name__ == '__main__': - sys.exit(main(sys.argv)) diff --git a/applications/ocr/train b/applications/ocr/train deleted file mode 100755 index c3b62df6915..00000000000 --- a/applications/ocr/train +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python - -# File : $HeadURL$ -# Version: $Id$ - -import sys - -from Ai import Ai -import common as com - -EPSILON = 1e-3 - -# optimal epsilon=1.00e-03, C=2.50, kernel_width=24.00 - -C_LIST = [2.5, 3.0, 3.5] -KERNEL_WIDTH_LIST = [22.0, 24.0, 26.0] - -VALIDATION_TRAIN_FRAC = 0.80 - -def to_str(e): - result = "" if e['error'] > 1.0 else "error=%.4f, " % e['error'] - - return "%sepsilon=%.2e, C=%.2f, kernel_width=%.2f" % ( - result, e['epsilon'], e['c'], e['kernel_width']) - -def main(argv): - ai = Ai() - ai.load_train_data(com.TRAIN_X_FNAME, com.TRAIN_Y_FNAME) - ai.enable_validation(VALIDATION_TRAIN_FRAC) - - best_error = {'error': 2.0, 'epsilon': -1.0, 'c': -1.0, - 'kernel_width': -1.0} - for kernel_width in KERNEL_WIDTH_LIST: - for c in C_LIST: - cur = {'error': 2.0, 'epsilon': EPSILON, 'c': c, - 'kernel_width': kernel_width} - - print "Trying: %s" % to_str(cur) - - ai.train(kernel_width=kernel_width, c=c, - epsilon=EPSILON) - print "" - cur['error'] = ai.get_test_error() - print "" - - if cur['error'] < best_error['error']: - best_error = cur - print "New best: %s" % to_str(best_error) - else: - print "Result: %s" % to_str(cur) - print "Best: %s" % to_str(best_error) - - print "" - - print "Finally using parameters: %s" % to_str(best_error) - ai.load_train_data(com.TRAIN_X_FNAME, com.TRAIN_Y_FNAME) - ai.train(kernel_width=best_error['kernel_width'], - c=best_error['c'], epsilon=best_error['epsilon']) - ai.write_svm() - print "" - print "Finished :DD" - -if __name__ == '__main__': - sys.exit(main(sys.argv)) diff --git a/applications/tapkee/faces_embedding.py b/applications/tapkee/faces_embedding.py deleted file mode 100644 index 24cf327031d..00000000000 --- a/applications/tapkee/faces_embedding.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# Written (W) 2011 Sergey Lisitsyn -# Copyright (C) 2011 Sergey Lisitsyn - -from shogun import * -from numpy import * -from matplotlib.offsetbox import TextArea, DrawingArea, OffsetImage, AnnotationBbox -import re,os,time -from pylab import * - -def build_features(path): - files = os.listdir(path) - files.remove('README') - N = len(files) - (nd,md) = imread(os.path.join(path,files[0])).shape - dim = nd*md - feature_matrix = zeros([dim,N]) - for i,filename in enumerate(files): - feature_matrix[:,i] = imread(os.path.join(path,filename)).ravel() - return nd,md,RealFeatures(feature_matrix) - -path = '../../data/faces/' -converter = DiffusionMaps -nd,md,features = build_features(path) -converter_instance = converter() -converter_instance.set_t(5) -converter_instance.set_target_dim(2) - -start = time.time() -new_features = converter_instance.embed(features).get_feature_matrix() -print new_features.shape -end = time.time() - -clusterer = KMeans -clusterer_instance = clusterer(2,EuclideanDistance()) -clusterer_instance.train(features) -labels = clusterer_instance.apply().get_labels() -print labels - -print 'applied %s, took %fs' % (converter_instance.get_name(), end-start) -print 'plotting' - -fig = figure() -ax = fig.add_subplot(111,axisbg='#ffffff') -ax.scatter(new_features[0],new_features[1],color='black') -import random -for i in range(len(new_features[0])): - feature_vector = features.get_feature_vector(i) - Z = zeros([nd,md,4]) - Z[:,:,0] = 255-feature_vector.reshape(nd,md)[::-1,:] - Z[:,:,1] = Z[:,:,0] - Z[:,:,2] = Z[:,:,0] - for k in range(nd): - for j in range(md): - Z[k,j,3] = pow(sin(k*pi/nd)*sin(j*pi/md),0.5) - imagebox = OffsetImage(Z,cmap=cm.gray,zoom=0.25) - ab = AnnotationBbox(imagebox, (new_features[0,i],new_features[1,i]), - pad=0.001,frameon=False) - ax.add_artist(ab) -axis('off') -savefig('faces.png') -show() diff --git a/applications/tapkee/octave_ltsa.m b/applications/tapkee/octave_ltsa.m deleted file mode 100644 index f79e000146f..00000000000 --- a/applications/tapkee/octave_ltsa.m +++ /dev/null @@ -1,11 +0,0 @@ -n = 1000; -noise = 0.0; -t = (3 * pi / 2) * (1 + 2 * rand(n, 1)); -height = 30 * rand(n, 1); -X = [t .* cos(t) height t .* sin(t)] + noise * randn(n, 3); - -sg('set_features','TRAIN',X'); -sg('set_converter','ltsa',10); -embedding = sg('embed',2); -plot(embedding(:,1),embedding(:,2),'@'); - diff --git a/applications/tapkee/samples/data.py b/applications/tapkee/samples/data.py deleted file mode 100644 index 2fc91d07c23..00000000000 --- a/applications/tapkee/samples/data.py +++ /dev/null @@ -1,9 +0,0 @@ -import numpy - -def swissroll(N=1000): - tt = numpy.array((5*numpy.pi/4)*(1+2*numpy.random.rand(N))) - height = numpy.array((numpy.random.rand(N)-0.5)) - noise = 0.0 - X = numpy.array([(tt+noise*numpy.random.randn(N))*numpy.cos(tt), 10*height, (tt+noise*numpy.random.randn(N))*numpy.sin(tt)]) - return X - diff --git a/applications/tapkee/samples/dm.py b/applications/tapkee/samples/dm.py deleted file mode 100644 index 036028dfd44..00000000000 --- a/applications/tapkee/samples/dm.py +++ /dev/null @@ -1,33 +0,0 @@ -import shogun as sg -import data -import numpy as np - -# load data -feature_matrix = data.swissroll() -# create features instance -features = sg.RealFeatures(feature_matrix) - -# create Diffusion Maps converter instance -converter = sg.DiffusionMaps() - -# set target dimensionality -converter.set_target_dim(2) -# set number of time-steps -converter.set_t(2) -# set width of gaussian kernel -converter.set_width(10.0) - -# create euclidean distance instance -distance = sg.EuclideanDistance() -# enable converter instance to use created distance instance -converter.set_distance(distance) - -# compute embedding with Diffusion Maps method -embedding = converter.embed(features) - -# compute custom distance matrix -distance_matrix = np.exp(-np.dot(feature_matrix.T,feature_matrix)) -# create Custom Kernel instance -custom_distance = sg.CustomDistance(distance_matrix) -# construct embedding based on created distance -distance_embedding = converter.embed_distance(custom_distance) diff --git a/applications/tapkee/samples/hlle.py b/applications/tapkee/samples/hlle.py deleted file mode 100644 index 0dc860931c0..00000000000 --- a/applications/tapkee/samples/hlle.py +++ /dev/null @@ -1,22 +0,0 @@ -import shogun as sg -import data - -# load data -feature_matrix = data.swissroll() -# create features instance -features = sg.RealFeatures(feature_matrix) - -# create Hessian Locally Linear Embedding converter instance -converter = sg.HessianLocallyLinearEmbedding() - -# set target dimensionality -converter.set_target_dim(2) -# set number of neighbors -converter.set_k(10) -# set number of threads -converter.parallel.set_num_threads(2) -# set nullspace shift (optional) -converter.set_nullspace_shift(-1e-6) - -# compute embedding with Hessian Locally Linear Embedding method -embedding = converter.embed(features) diff --git a/applications/tapkee/samples/isomap.py b/applications/tapkee/samples/isomap.py deleted file mode 100644 index 1d0569e1662..00000000000 --- a/applications/tapkee/samples/isomap.py +++ /dev/null @@ -1,39 +0,0 @@ -import shogun as sg -import data -import numpy as np - -# load data -feature_matrix = data.swissroll() -# create features instance -features = sg.RealFeatures(feature_matrix) - -# create Isomap converter instance -converter = sg.Isomap() - -# set number of neighbors to be used -converter.set_k(15) - -# set target dimensionality -converter.set_target_dim(2) - -# compute embedding with Isomap method -embedding = converter.embed(features) - -# set number of threads -converter.parallel.set_num_threads(2) -# compute approximate embedding -approx_embedding = converter.embed(features) -# disable landmark approximation -converter.set_landmark(False) - -# compute cosine distance matrix 'manually' -N = features.get_num_vectors() -distance_matrix = np.zeros((N,N)) -for i in range(N): - for j in range(N): - distance_matrix[i,j] = \ - np.linalg.norm(feature_matrix[:,i]-feature_matrix[:,j],2) -# create custom distance instance -distance = sg.CustomDistance(distance_matrix) -# construct embedding based on created distance -converter.embed_distance(distance) diff --git a/applications/tapkee/samples/klle.py b/applications/tapkee/samples/klle.py deleted file mode 100644 index 77b9908cd68..00000000000 --- a/applications/tapkee/samples/klle.py +++ /dev/null @@ -1,35 +0,0 @@ -import shogun as sg -import data -import numpy as np - -# load data -feature_matrix = data.swissroll() -# create features instance -features = sg.RealFeatures(feature_matrix) - -# create Kernel Locally Linear Embedding converter instance -converter = sg.KernelLocallyLinearEmbedding() - -# set target dimensionality -converter.set_target_dim(2) -# set number of neighbors -converter.set_k(10) -# set number of threads -converter.parallel.set_num_threads(2) -# set nullspace shift (optional) -converter.set_nullspace_shift(-1e-6) - -# create Gaussian kernel instance -kernel = sg.GaussianKernel(100,10.0) -# enable converter instance to use created kernel instance -converter.set_kernel(kernel) - -# compute embedding with Kernel Locally Linear Embedding method -embedding = converter.embed(features) - -# compute linear kernel matrix -kernel_matrix = np.dot(feature_matrix.T,feature_matrix) -# create Custom Kernel instance -custom_kernel = sg.CustomKernel(kernel_matrix) -# construct embedding based on created kernel -kernel_embedding = converter.embed_kernel(custom_kernel) diff --git a/applications/tapkee/samples/la.py b/applications/tapkee/samples/la.py deleted file mode 100644 index b125fcc67c8..00000000000 --- a/applications/tapkee/samples/la.py +++ /dev/null @@ -1,33 +0,0 @@ -import shogun as sg -import data -import numpy as np - -# load data -feature_matrix = data.swissroll() -# create features instance -features = sg.RealFeatures(feature_matrix) - -# create Laplacian Eigenmaps converter instance -converter = sg.LaplacianEigenmaps() - -# set target dimensionality -converter.set_target_dim(2) -# set number of neighbors -converter.set_k(20) -# set tau multiplier -converter.set_tau(1.0) - -# compute embedding with Laplacian Eigenmaps method -embedding = converter.embed(features) - -# compute cosine distance matrix 'manually' -N = features.get_num_vectors() -distance_matrix = np.zeros((N,N)) -for i in range(N): - for j in range(N): - distance_matrix[i,j] = \ - np.linalg.norm(feature_matrix[:,i]-feature_matrix[:,j],2) -# create custom distance instance -distance = sg.CustomDistance(distance_matrix) -# construct embedding based on created distance -converter.embed_distance(distance) diff --git a/applications/tapkee/samples/lle.py b/applications/tapkee/samples/lle.py deleted file mode 100644 index f0a05dd4599..00000000000 --- a/applications/tapkee/samples/lle.py +++ /dev/null @@ -1,28 +0,0 @@ -import shogun as sg -import data - -# load data -feature_matrix = data.swissroll() -# create features instance -features = sg.RealFeatures(feature_matrix) - -# create Locally Linear Embedding converter instance -converter = sg.LocallyLinearEmbedding() - -# set target dimensionality -converter.set_target_dim(2) -# set number of neighbors -converter.set_k(10) -# set reconstruction shift (optional) -converter.set_reconstruction_shift(1e-3) -# set nullspace shift (optional) -converter.set_nullspace_shift(-1e-6) - -# compute embedding with Locally Linear Embedding method -embedding_first = converter.embed(features) - -# set number of neighbors to be used -converter.set_k(50) - -# compute embedding with Locally Linear Embedding method -embedding_second = converter.embed(features) diff --git a/applications/tapkee/samples/lltsa.py b/applications/tapkee/samples/lltsa.py deleted file mode 100644 index f59a3fb2ea3..00000000000 --- a/applications/tapkee/samples/lltsa.py +++ /dev/null @@ -1,22 +0,0 @@ -import shogun as sg -import data - -# load data -feature_matrix = data.swissroll() -# create features instance -features = sg.RealFeatures(feature_matrix) - -# create Linear Local Tangent Space Alignment converter instance -converter = sg.LinearLocalTangentSpaceAlignment() - -# set target dimensionality -converter.set_target_dim(2) -# set number of neighbors -converter.set_k(10) -# set number of threads -converter.parallel.set_num_threads(2) -# set nullspace shift (optional) -converter.set_nullspace_shift(-1e-6) - -# compute embedding with Linear Local Tangent Space Alignment method -embedding = converter.embed(features) diff --git a/applications/tapkee/samples/lpp.py b/applications/tapkee/samples/lpp.py deleted file mode 100644 index 9c9c4e87f37..00000000000 --- a/applications/tapkee/samples/lpp.py +++ /dev/null @@ -1,20 +0,0 @@ -import shogun as sg -import data - -# load data -feature_matrix = data.swissroll() -# create features instance -features = sg.RealFeatures(feature_matrix) - -# create Locality Preserving Projections converter instance -converter = sg.LocalityPreservingProjections() - -# set target dimensionality -converter.set_target_dim(2) -# set number of neighbors -converter.set_k(10) -# set number of threads -converter.parallel.set_num_threads(2) - -# compute embedding with Locality Preserving Projections method -embedding = converter.embed(features) diff --git a/applications/tapkee/samples/ltsa.py b/applications/tapkee/samples/ltsa.py deleted file mode 100644 index 2c2d760cd4a..00000000000 --- a/applications/tapkee/samples/ltsa.py +++ /dev/null @@ -1,22 +0,0 @@ -import shogun as sg -import data - -# load data -feature_matrix = data.swissroll() -# create features instance -features = sg.RealFeatures(feature_matrix) - -# create Local Tangent Space Alignment converter instance -converter = sg.LocalTangentSpaceAlignment() - -# set target dimensionality -converter.set_target_dim(2) -# set number of neighbors -converter.set_k(10) -# set number of threads -converter.parallel.set_num_threads(2) -# set nullspace shift (optional) -converter.set_nullspace_shift(-1e-6) - -# compute embedding with Local Tangent Space Alignment method -embedding = converter.embed(features) diff --git a/applications/tapkee/samples/mds.py b/applications/tapkee/samples/mds.py deleted file mode 100644 index 4f0f8ffe633..00000000000 --- a/applications/tapkee/samples/mds.py +++ /dev/null @@ -1,40 +0,0 @@ -import shogun as sg -import data -import numpy as np - -# load data -feature_matrix = data.swissroll() -# create features instance -features = sg.RealFeatures(feature_matrix) - -# create Multidimensional Scaling converter instance -converter = sg.MultidimensionalScaling() - -# set target dimensionality -converter.set_target_dim(2) - -# compute embedding with Multidimensional Scaling method -embedding = converter.embed(features) - -# enable landmark approximation -converter.set_landmark(True) -# set number of landmarks -converter.set_landmark_number(100) -# set number of threads -converter.parallel.set_num_threads(2) -# compute approximate embedding -approx_embedding = converter.embed(features) -# disable landmark approximation -converter.set_landmark(False) - -# compute cosine distance matrix 'manually' -N = features.get_num_vectors() -distance_matrix = np.zeros((N,N)) -for i in range(N): - for j in range(N): - distance_matrix[i,j] = \ - np.linalg.norm(feature_matrix[:,i]-feature_matrix[:,j],2) -# create custom distance instance -distance = sg.CustomDistance(distance_matrix) -# construct embedding based on created distance -converter.embed_distance(distance) diff --git a/applications/tapkee/samples/npe.py b/applications/tapkee/samples/npe.py deleted file mode 100644 index cefa4880bdf..00000000000 --- a/applications/tapkee/samples/npe.py +++ /dev/null @@ -1,22 +0,0 @@ -import shogun as sg -import data - -# load data -feature_matrix = data.swissroll() -# create features instance -features = sg.RealFeatures(feature_matrix) - -# create Neighborhood Preserving Embedding converter instance -converter = sg.NeighborhoodPreservingEmbedding() - -# set target dimensionality -converter.set_target_dim(2) -# set number of neighbors -converter.set_k(10) -# set number of threads -converter.parallel.set_num_threads(2) -# set nullspace shift (optional) -converter.set_nullspace_shift(-1e-6) - -# compute embedding with Neighborhood Preserving Projections method -embedding = converter.embed(features) diff --git a/applications/tapkee/swissroll_embedding.py b/applications/tapkee/swissroll_embedding.py deleted file mode 100644 index e83d69d71a5..00000000000 --- a/applications/tapkee/swissroll_embedding.py +++ /dev/null @@ -1,88 +0,0 @@ -import numpy -numpy.random.seed(40) -tt = numpy.genfromtxt('../../data/toy/swissroll_color.dat',unpack=True).T -X = numpy.genfromtxt('../../data/toy/swissroll.dat',unpack=True).T -N = X.shape[1] -converters = [] - -from shogun import LocallyLinearEmbedding -lle = LocallyLinearEmbedding() -lle.set_k(9) -converters.append((lle, "LLE with k=%d" % lle.get_k())) - -from shogun import MultidimensionalScaling -mds = MultidimensionalScaling() -converters.append((mds, "Classic MDS")) - -lmds = MultidimensionalScaling() -lmds.set_landmark(True) -lmds.set_landmark_number(20) -converters.append((lmds,"Landmark MDS with %d landmarks" % lmds.get_landmark_number())) - -from shogun import Isomap -cisomap = Isomap() -cisomap.set_k(9) -converters.append((cisomap,"Isomap with k=%d" % cisomap.get_k())) - -from shogun import DiffusionMaps -from shogun import GaussianKernel -dm = DiffusionMaps() -dm.set_t(2) -dm.set_width(1000.0) -converters.append((dm,"Diffusion Maps with t=%d, sigma=%.1f" % (dm.get_t(),dm.get_width()))) - -from shogun import HessianLocallyLinearEmbedding -hlle = HessianLocallyLinearEmbedding() -hlle.set_k(6) -converters.append((hlle,"Hessian LLE with k=%d" % (hlle.get_k()))) - -from shogun import LocalTangentSpaceAlignment -ltsa = LocalTangentSpaceAlignment() -ltsa.set_k(6) -converters.append((ltsa,"LTSA with k=%d" % (ltsa.get_k()))) - -from shogun import LaplacianEigenmaps -le = LaplacianEigenmaps() -le.set_k(20) -le.set_tau(100.0) -converters.append((le,"Laplacian Eigenmaps with k=%d, tau=%d" % (le.get_k(),le.get_tau()))) - -import matplotlib -import matplotlib.pyplot as plt -from mpl_toolkits.mplot3d import Axes3D - -fig = plt.figure() - -new_mpl = False - -try: - swiss_roll_fig = fig.add_subplot(3,3,1, projection='3d') - new_mpl = True -except: - figure = plt.figure() - swiss_roll_fig = Axes3D(figure) - -swiss_roll_fig.scatter(X[0], X[1], X[2], s=10, c=tt, cmap=plt.cm.Spectral) -swiss_roll_fig._axis3don = False -plt.suptitle('Swissroll embedding',fontsize=9) -plt.subplots_adjust(hspace=0.4) - -from shogun import RealFeatures - -for (i, (converter, label)) in enumerate(converters): - X = numpy.genfromtxt('../../data/toy/swissroll.dat',unpack=True).T - features = RealFeatures(X) - converter.set_target_dim(2) - converter.parallel.set_num_threads(1) - new_feats = converter.embed(features).get_feature_matrix() - if not new_mpl: - embedding_subplot = fig.add_subplot(4,2,i+1) - else: - embedding_subplot = fig.add_subplot(3,3,i+2) - embedding_subplot.scatter(new_feats[0],new_feats[1], c=tt, cmap=plt.cm.Spectral) - plt.axis('tight') - plt.xticks([]), plt.yticks([]) - plt.title(label,fontsize=9) - print converter.get_name(), 'done' - -plt.show() diff --git a/applications/tapkee/words_embedding.py b/applications/tapkee/words_embedding.py deleted file mode 100644 index d74deb215e6..00000000000 --- a/applications/tapkee/words_embedding.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# Written (W) 2011 Sergey Lisitsyn -# Copyright (C) 2011 Sergey Lisitsyn - -from numpy import * -from pylab import * -from shogun import * -import random -import difflib - -def word_kernel(words): - N = len(words) - dist_matrix = zeros([N,N]) - for i in range(N): - for j in range(i,N): - s = difflib.SequenceMatcher(None,words[i],words[j]) - dist_matrix[i,j] = s.ratio() - dist_matrix = 0.5*(dist_matrix+dist_matrix.T) - return CustomKernel(dist_matrix) - -print 'loading' -words = [] -f = open("../../data/toy/words.dat") -for line in f: - words.append(line[:-1]) -f.close() -print 'loaded' - -converter = KernelLocallyLinearEmbedding() -converter.set_k(10) -converter.set_target_dim(2) -converter.parallel.set_num_threads(1) -embedding = converter.embed_kernel(word_kernel(words[:200])) -embedding_matrix = embedding.get_feature_matrix() -fig = figure() -ax = fig.add_subplot(1,1,1) -ax.scatter(embedding_matrix[0,:],embedding_matrix[1,:],alpha=0.4,cmap=cm.Spectral,c=embedding_matrix[0,:]*embedding_matrix[1,:]) - -# hardcode ;) -words_to_show = ['finishing','publishing','standing',\ - 'shifted','insisted','tilted','blasted',\ - 'jumble','battle','gobble'] - -for i in xrange(0,200): - if words[i] in words_to_show: - ax.text(embedding_matrix[0,i]*1.1,1.25*embedding_matrix[1,i],words[i],fontsize=16,alpha=1.0) - -axis('off') -show() - diff --git a/benchmarks/hasheddoc_benchmarks.cpp b/benchmarks/hasheddoc_benchmarks.cpp deleted file mode 100644 index adc268c3b35..00000000000 --- a/benchmarks/hasheddoc_benchmarks.cpp +++ /dev/null @@ -1,51 +0,0 @@ -/* - * This software is distributed under BSD 3-clause license (see LICENSE file). - * - * Authors: Evangelos Anagnostopoulos - */ - -#include -#include -#include -#include -#include -#include - -using namespace shogun; - -int main(int argv, char** argc) -{ - init_shogun_with_defaults(); - - int32_t bits[] = {8, 10, 12, 16, 20}; - int32_t bits_length = 5; - - int32_t num_strings = 5000; - int32_t max_str_length = 10000; - SGStringList string_list(num_strings, max_str_length); - - SG_SPRINT("Creating features...\n"); - for (index_t i=0; i(max_str_length); - for (index_t j=0; j* string_feats = new CStringFeatures(string_list, RAWBYTE); - CNGramTokenizer* tzer = new CNGramTokenizer(3); - - for (index_t i=0; ibenchmark_dense_dot_range(); - feats->benchmark_add_to_dense_vector(); - } - exit_shogun(); -} diff --git a/benchmarks/kernel_matrix_sum_benchmark.cpp b/benchmarks/kernel_matrix_sum_benchmark.cpp deleted file mode 100644 index 62d7228480d..00000000000 --- a/benchmarks/kernel_matrix_sum_benchmark.cpp +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2014 Soumyajit De - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace shogun; -using namespace Eigen; - -std::pair test() -{ - CTime *time=new CTime(); - - const index_t n=1000; - const index_t d=3; - SGMatrix data_p(d, n); - Map data_pm(data_p.matrix, data_p.num_rows, data_p.num_cols); - data_pm=MatrixXd::Random(d, n); - SGMatrix data_q(d, n); - Map data_qm(data_q.matrix, data_q.num_rows, data_q.num_cols); - data_qm=MatrixXd::Random(d, n); - - CDenseFeatures* feats_p=new CDenseFeatures(data_p); - CDenseFeatures* feats_q=new CDenseFeatures(data_q); - CGaussianKernel* kernel=new CGaussianKernel(feats_p, feats_q, 2); - CCustomKernel* precomputed_kernel=new CCustomKernel(kernel); - - // BENCHMARK_1 - time->start(); - float64_t sum1=precomputed_kernel->sum_block(0, 0, n, n); - float64_t time1=time->cur_time_diff(); - - float64_t sum2=0.0; - SGMatrix km=precomputed_kernel->get_kernel_matrix(); - Map k_m(km.matrix, km.num_rows, km.num_cols); - - // BENCHMARK_2 - time->start(); - sum2=k_m.sum(); - float64_t time2=time->cur_time_diff(); - - ASSERT(CMath::abs(sum1-sum2) <= 1E-5); - - SG_UNREF(kernel); - SG_UNREF(precomputed_kernel); - SG_UNREF(time); - - return std::make_pair(time1, time2); -} - -int main(int argc, char **argv) -{ - init_shogun_with_defaults(); - //sg_io->set_loglevel(MSG_DEBUG); - //sg_io->set_location_info(MSG_FUNCTION); - float64_t time1=0.0, time2=0.0; - float64_t var1=0.0, var2=0.0; - index_t num_runs=100; - for (index_t i=1; i<=num_runs; ++i) - { - std::pair time=test(); - float64_t delta=time.first - time1; - time1+=delta/i; - var1+=delta*(time.first - time1); - delta=time.second - time2; - time2+=delta/i; - var2+=delta*(time.second - time2); - } - var1/=num_runs; - var2/=num_runs; - SG_SPRINT("mean %f\t var %f\n", time1, var1); - SG_SPRINT("mean %f\t var %f\n", time2, var2); - exit_shogun(); - return 0; -} - diff --git a/benchmarks/rf_feats_benchmark.cpp b/benchmarks/rf_feats_benchmark.cpp deleted file mode 100644 index e93d72a8459..00000000000 --- a/benchmarks/rf_feats_benchmark.cpp +++ /dev/null @@ -1,127 +0,0 @@ -#include -#include -#include -#include -#include - -using namespace shogun; - -/** Benchmark code for the RandomFourierDotFeatures class - * Current results are after the code - */ - -int main(int argv, char** argc) -{ - init_shogun_with_defaults(); - - int32_t dims[] = {100, 300, 600}; - CTime* timer = new CTime(); - for (index_t d=0; d<3; d++) - { - int32_t num_dim = dims[d]; - int32_t num_vecs = 100000; - SG_SPRINT("-------------------------------------------------------------------------\n"); - SG_SPRINT("Starting experiment for number of dimensions = %d, number of vectors = %d,", num_dim, num_vecs); - SGMatrix mat(num_dim, num_vecs); - for (index_t i=0; i params(1); - params[0] = num_dim - 20; - SG_SPRINT(" using kernel_width = %f\n", params[0]); - - CDenseFeatures* dense_feats = new CDenseFeatures(mat); - SG_REF(dense_feats); - - int D[] = {50, 100, 200, 300, 400, 500}; - for (index_t i=0; i<6; i++) - { - SG_SPRINT("Results for D = %d\n", D[i]); - CRandomFourierDotFeatures* rand_feats = - new CRandomFourierDotFeatures(dense_feats, D[i], KernelName::GAUSSIAN, params); - rand_feats->benchmark_dense_dot_range(); - rand_feats->benchmark_add_to_dense_vector(); - SG_UNREF(rand_feats); - } - - SG_SPRINT("-------------------------------------------------------------------------\n"); - SG_UNREF(dense_feats); - } - SG_SPRINT("Total time : %fs\n", timer->cur_runtime_diff_sec()); - timer->stop(); - SG_UNREF(timer); - - exit_shogun(); -} - -/** Current results, using Release settings, for future comparisons : - * ------------------------------------------------------------------------- - * Starting experiment for number of dimensions = 100, number of vectors = 100000, using kernel_width = 80.000000 - * Results for D = 50 - * Time to process 5 x num=100000 dense_dot_range ops: cputime 1.846000s walltime 0.310587s - * Time to process 5 x num=100000 add_to_dense_vector ops: cputime 1.244000s walltime 1.244486s - * Results for D = 100 - * Time to process 5 x num=100000 dense_dot_range ops: cputime 3.438000s walltime 0.521576s - * Time to process 5 x num=100000 add_to_dense_vector ops: cputime 2.644000s walltime 2.645543s - * Results for D = 200 - * Time to process 5 x num=100000 dense_dot_range ops: cputime 5.860000s walltime 0.867629s - * Time to process 5 x num=100000 add_to_dense_vector ops: cputime 5.092000s walltime 5.090811s - * Results for D = 300 - * Time to process 5 x num=100000 dense_dot_range ops: cputime 8.564000s walltime 1.233921s - * Time to process 5 x num=100000 add_to_dense_vector ops: cputime 7.770000s walltime 7.770405s - * Results for D = 400 - * Time to process 5 x num=100000 dense_dot_range ops: cputime 10.974000s walltime 1.531718s - * Time to process 5 x num=100000 add_to_dense_vector ops: cputime 10.126000s walltime 10.125524s - * Results for D = 500 - * Time to process 5 x num=100000 dense_dot_range ops: cputime 13.558000s walltime 1.965116s - * Time to process 5 x num=100000 add_to_dense_vector ops: cputime 12.894000s walltime 12.894182s - * ------------------------------------------------------------------------- - * ------------------------------------------------------------------------- - * Starting experiment for number of dimensions = 300, number of vectors = 100000, using kernel_width = 280.000000 - * Results for D = 50 - * Time to process 5 x num=100000 dense_dot_range ops: cputime 3.346000s walltime 0.580631s - * Time to process 5 x num=100000 add_to_dense_vector ops: cputime 2.234000s walltime 2.234459s - * Results for D = 100 - * Time to process 5 x num=100000 dense_dot_range ops: cputime 5.670000s walltime 0.878700s - * Time to process 5 x num=100000 add_to_dense_vector ops: cputime 4.402000s walltime 4.401725s - * Results for D = 200 - * Time to process 5 x num=100000 dense_dot_range ops: cputime 10.044000s walltime 1.441796s - * Time to process 5 x num=100000 add_to_dense_vector ops: cputime 9.332000s walltime 9.332423s - * Results for D = 300 - * Time to process 5 x num=100000 dense_dot_range ops: cputime 15.382000s walltime 2.138093s - * Time to process 5 x num=100000 add_to_dense_vector ops: cputime 14.858000s walltime 14.858871s - * Results for D = 400 - * Time to process 5 x num=100000 dense_dot_range ops: cputime 20.674000s walltime 2.905396s - * Time to process 5 x num=100000 add_to_dense_vector ops: cputime 20.028000s walltime 20.030157s - * Results for D = 500 - * Time to process 5 x num=100000 dense_dot_range ops: cputime 25.662000s walltime 3.550897s - * Time to process 5 x num=100000 add_to_dense_vector ops: cputime 24.374000s walltime 24.374596s - * ------------------------------------------------------------------------- - * ------------------------------------------------------------------------- - * Starting experiment for number of dimensions = 600, number of vectors = 100000, using kernel_width = 580.000000 - * Results for D = 50 - * Time to process 5 x num=100000 dense_dot_range ops: cputime 4.414000s walltime 0.657778s - * Time to process 5 x num=100000 add_to_dense_vector ops: cputime 3.490000s walltime 3.489634s - * Results for D = 100 - * Time to process 5 x num=100000 dense_dot_range ops: cputime 8.456000s walltime 1.267112s - * Time to process 5 x num=100000 add_to_dense_vector ops: cputime 7.458000s walltime 7.457174s - * Results for D = 200 - * Time to process 5 x num=100000 dense_dot_range ops: cputime 16.922000s walltime 2.268248s - * Time to process 5 x num=100000 add_to_dense_vector ops: cputime 16.142000s walltime 16.141996s - * Results for D = 300 - * Time to process 5 x num=100000 dense_dot_range ops: cputime 25.584000s walltime 3.424675s - * Time to process 5 x num=100000 add_to_dense_vector ops: cputime 25.752000s walltime 25.753305s - * Results for D = 400 - * Time to process 5 x num=100000 dense_dot_range ops: cputime 34.392000s walltime 4.644195s - * Time to process 5 x num=100000 add_to_dense_vector ops: cputime 34.340000s walltime 34.340004s - * Results for D = 500 - * Time to process 5 x num=100000 dense_dot_range ops: cputime 44.028000s walltime 5.816031s - * Time to process 5 x num=100000 add_to_dense_vector ops: cputime 43.978000s walltime 43.979196s - * ------------------------------------------------------------------------- - * Total time : 2531.890000s - */ diff --git a/benchmarks/rf_feats_kernel_comp.cpp b/benchmarks/rf_feats_kernel_comp.cpp deleted file mode 100644 index a1acc106d6f..00000000000 --- a/benchmarks/rf_feats_kernel_comp.cpp +++ /dev/null @@ -1,136 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -using namespace shogun; - -/** Code that compares the times needed to train - * a linear svm using the RandomFourierDotFeatures class - * vs a non-linear svm using the Gaussian Kernel. - */ -int main(int argv, char** argc) -{ - init_shogun_with_defaults(); - - int32_t dims[] = {10, 100, 1000}; - int32_t vecs[] = {10000, 100000, 1000000}; - CTime* timer = new CTime(false); - float64_t epsilon = 0.001; - float64_t lin_C = 0.1; - float64_t non_lin_C = 0.1; - CPRCEvaluation* evaluator = new CPRCEvaluation(); - CSqrtDiagKernelNormalizer* normalizer = new CSqrtDiagKernelNormalizer(true); - SG_REF(normalizer); - for (index_t d=0; d<4; d++) - { - int32_t num_dim = dims[d]; - SG_SPRINT("Starting experiment for number of dimensions = %d\n", num_dim); - for (index_t v=0; v<3; v++) - { - int32_t num_vecs = vecs[v]; - SG_SPRINT(" Using %d examples\n", num_vecs); - SGMatrix mat(num_dim, num_vecs); - SGVector labs(num_vecs); - for (index_t i=0; i params(1); - params[0] = 8; - SG_SPRINT(" Using kernel_width = %f\n", params[0]); - - CDenseFeatures* dense_feats = new CDenseFeatures(mat); - SG_REF(dense_feats); - - CBinaryLabels* labels = new CBinaryLabels(labs); - SG_REF(labels); - - /** LibLinear SVM using RandomFourierDotFeatures */ - int32_t D[] = {50, 100, 300, 1000}; - for (index_t d=0; d<4; d++) - { - CRandomFourierDotFeatures* r_feats = new CRandomFourierDotFeatures( - dense_feats, D[d], KernelName::GAUSSIAN, params); - - //CLibLinear* lin_svm = new CLibLinear(C, r_feats, labels); - CSVMOcas* lin_svm = new CSVMOcas(lin_C, r_feats, labels); - lin_svm->set_epsilon(epsilon); - clock_t t = clock(); - timer->start(); - lin_svm->train(); - t = clock() - t; - timer->stop(); - SG_SPRINT("\tSVMOcas using RFDotFeatures(D=%d) finished training. Took %fs (or %fs), ", - D[d], timer->time_diff_sec(), (float64_t) t /CLOCKS_PER_SEC); - - t = clock(); - timer->start(); - CBinaryLabels* predicted = CLabelsFactory::to_binary(lin_svm->apply()); - timer->stop(); - t = clock() - t; - float64_t auPRC = evaluator->evaluate(predicted, labels); - SG_SPRINT("SVMOcas auPRC=%f (Applying took %fs (%fs)\n", auPRC, - timer->time_diff_sec(), (float64_t) t / CLOCKS_PER_SEC); - SG_UNREF(lin_svm); - SG_UNREF(predicted); - } - /** End of LibLinear code */ - - - /** LibSVM using Gaussian Kernel */ - - CGaussianKernel* kernel = new CGaussianKernel(dense_feats, dense_feats, params[0]); - //kernel->set_normalizer(normalizer); - CLibSVM* svm = new CLibSVM(non_lin_C, kernel, labels); - svm->set_epsilon(epsilon); - clock_t t = clock(); - timer->start(); - svm->train(); - t = clock() - t; - timer->stop(); - SG_SPRINT("\tLibSVM using GaussianKernel finished training. Took %fs (or %fs), ", - timer->time_diff_sec(), (float64_t) t /CLOCKS_PER_SEC); - - t = clock(); - timer->start(); - CBinaryLabels* predicted = CLabelsFactory::to_binary(svm->apply()); - timer->stop(); - t = clock() - t; - float64_t auPRC = evaluator->evaluate(predicted, labels); - SG_SPRINT("LibSVM auPRC=%f (Applying took %fs (%fs)\n", auPRC, - timer->time_diff_sec(), (float64_t) t / CLOCKS_PER_SEC); - SG_UNREF(svm); - SG_UNREF(predicted); - /** End of LibSVM code */ - SG_UNREF(labels); - SG_UNREF(dense_feats); - } - } - SG_UNREF(timer); - SG_UNREF(evaluator); - SG_UNREF(normalizer); - exit_shogun(); -} diff --git a/benchmarks/sparse_test.cpp b/benchmarks/sparse_test.cpp deleted file mode 100644 index 8849ea68e69..00000000000 --- a/benchmarks/sparse_test.cpp +++ /dev/null @@ -1,206 +0,0 @@ -/* - * This software is distributed under BSD 3-clause license (see LICENSE file). - * - * Authors: Soeren Sonnenburg, Pan Deng, Soumyajit De, Björn Esser - */ - -#include - -#include -#include -#include -#include -#include -#include -#include - -using namespace shogun; -using namespace Eigen; - -struct APPLY_THREAD_PARAM -{ - int32_t start; - int32_t stop; - float64_t* result; - float64_t* vec; - int32_t len; - SGSparseVector* sm; -}; - - -int32_t get_nnz(SGSparseMatrix m) -{ - - int32_t nnz=0; - int32_t n=m.num_vectors; - - for (int i=0; iresult; - SGSparseVector* m=par->sm; - float64_t* vec = par->vec; - int32_t len = par->len; - int32_t start = par->start; - int32_t stop = par->stop; - - for (index_t i=start; i sg_m_apply(SGSparseMatrix m, SGVector v) -{ - SGVector r(v.vlen); - ASSERT(v.vlen==m.num_vectors); - - int num_threads=8; - pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1); - APPLY_THREAD_PARAM* params = SG_MALLOC(APPLY_THREAD_PARAM, num_threads); - int32_t step= m.num_vectors/num_threads; - - int32_t start=0; - int32_t stop=m.num_vectors; - int32_t t; - - for (t=0; tset_loglevel(MSG_GCDEBUG); - - const index_t n=100; - const index_t times=5; - const index_t size=1000000; - SGVector v(size); - v.set_const(1.0); - Map map_v(v.vector, v.vlen); - CTime time; - CMath::init_random(17); - - SG_SPRINT("time\tshogun (s)\teigen3 (s)\n\n"); - for (index_t t=0; t sg_m(size, size); - typedef SGSparseVectorEntry Entry; - SGSparseVector *vec=SG_MALLOC(SGSparseVector, size); - - // for first row - Entry *first=SG_MALLOC(Entry, size); - // the digonal index for row #1 - first[0].feat_index=0; - first[0].entry=1.836593; - for (index_t i=1; inum) - { - //// the diagonal element - rest[i][num-1].feat_index=i+1; - rest[i][num-1].entry=1.836593; - } - - vec[i+1].features=rest[i]; - vec[i+1].num_feat_entries=num; - - sg_m[i+1]=vec[i+1].get(); - } - SGVector r(size); - - SG_SPRINT("nnz=%d\n", get_nnz(sg_m)); - - // sg starts - time.start(); - for (index_t i=0; i map_r(r.vector, r.vlen); - float64_t sg_norm=map_r.norm(); - -//#endif // RUN_SHOGUN - -//#ifdef RUN_EIGEN - const SparseMatrix &eig_m=EigenSparseUtil::toEigenSparse(sg_m); - VectorXd eig_r(size); - - // eigen3 starts - time.start(); - for (index_t i=0; ieig_time); - ASSERT(CMath::abs(sg_norm-eig_norm)<=CMath::MACHINE_EPSILON) - - SG_FREE(vec); - SG_FREE(rest); - } - - - exit_shogun(); - - return 0; -}