Permalink
Browse files

Initial commit

  • Loading branch information...
0 parents commit d0a9e9a22fc3e9f3a3fa83c7786886a7e4e1ae51 @qpB- committed Dec 22, 2010
@@ -0,0 +1,12 @@
+.bundle
+db/*.sqlite3*
+log/*.log
+*.log
+tmp/**/*
+tmp/*
+doc/api
+doc/app
+*.swp
+*~
+.DS_Store
+
@@ -0,0 +1,3 @@
+.*.swp
+code
+*.db
@@ -0,0 +1,35 @@
+york, from, gaza, hudson, articles, that
+6 Crashed jet raised from Hudson River - Reuters
+5 US Airways Jet Lifted From Hudson - Wall Street Journal
+5 Questions and answers on Gaza's next steps - International Herald Tribune
+
+
+
+city, circuit, times, articles, angeles, bloomberg
+11 Bedlam breaks out at Circuit City - CNET News
+2 US-ENTERTAINMENT Summary - Washington Post
+2 Spagnuolo Agrees to Four-Year Contract as Rams’ Coach - Bloomberg
+
+
+
+quot, treasury, absolutely, geithner, despite, aide
+10 Obama backs Treasury pick "absolutely": aide - Reuters
+3 Obama backs Treasury pick "absolutely": aide
+3 'This Week' Transcript: David Axelrod - ABC News
+
+
+
+peanut, butter, little, debbie, crackers, salmonella
+11 Little Debbie peanut butter crackers recalled - The Associated Press
+4 Little Debbie peanut butter crackers recalled
+4 FDA: Put off eating peanut butter products
+
+
+
+deal, year, hamels, phillies, philadelphia, press
+11 Hamels, Phillies agree on 3-year deal, $20.5M deal - The Associated Press
+4 Price details unclear in murky Russia-Ukraine deal - guardian.co.uk
+2 Spagnuolo Agrees to Four-Year Contract as Rams’ Coach - Bloomberg
+
+
+
@@ -0,0 +1,142 @@
+require 'rubygems'
+require 'feed-normalizer'
+require 'open-uri'
+require 'ruby-debug'
+require 'gsl'
+include GSL
+
# RSS feeds whose articles are downloaded by getarticlewords.
# Entries that are commented out are deliberately disabled, not deleted.
# The list is frozen so that no caller can alter it by accident at runtime.
FEEDLIST = [
  'http://feeds.reuters.com/reuters/topNews',
  'http://feeds.reuters.com/reuters/domesticNews',
  'http://feeds.reuters.com/reuters/worldNews',
  # 'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml?SITE=NCWIN&SECTION=HOME',
  'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml?SITE=OKPON&SECTION=HOME',
  # 'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml?SITE=KLIF&SECTION=HOME',
  'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml?SITE=SCGRE&SECTION=HOME',
  # 'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml',
  # 'http://www.nytimes.com/services/xml/rss/nyt/International.xml',
  'http://news.google.com/?output=rss',
  # 'http://feeds.salon.com/salon/news',
  # 'http://www.foxnews.com/xmlfeed/rss/0,4313,0,00.rss',
  # 'http://www.foxnews.com/xmlfeed/rss/0,4313,80,00.rss',
  # 'http://www.foxnews.com/xmlfeed/rss/0,4313,81,00.rss',
  # 'http://rss.cnn.com/rss/edition.rss',
  # 'http://rss.cnn.com/rss/edition_world.rss',
  'http://rss.cnn.com/rss/edition_us.rss',
].freeze
+
# Removes markup tags from +h+ and returns the remaining text.
#
# Everything from a "<" up to and including the next ">" is dropped. A single
# space is written where a tag, or a run of adjacent tags, ended, so that the
# words on either side of the markup are not glued together. For example
# "<a>foo</a>bar<strong><a>baz</a></strong>" becomes " foo bar baz ".
#
# h - the markup to strip; nil is treated as an empty string.
#
# Returns a new String.
def stripHTML(h)
  text = String.new
  inside_tag = false

  h.to_s.each_char do |char|
    if char == '<'
      inside_tag = true
    elsif char == '>'
      inside_tag = false
      # One separator per run of tags rather than one per tag.
      text << ' ' unless text.end_with?(' ')
    elsif !inside_tag
      # Appending in place avoids allocating a new string per character.
      text << char
    end
  end

  text
end
+
# Splits +text+ into lower-cased words, keeping only words longer than three
# characters. A word is a run of \w characters (letters, digits, underscore).
#
# text - the text to split; nil is treated as an empty string.
#
# Returns an Array of Strings in order of appearance (duplicates retained).
def separatewords(text)
  # \w+ (instead of \w*) avoids producing an empty match at every separator.
  text.to_s.scan(/\w+/).select { |word| word.length > 3 }.map { |word| word.downcase }
end
+
# Downloads +url+, parses it with FeedNormalizer and closes the stream again.
# Returns whatever FeedNormalizer::FeedNormalizer.parse returns.
def fetch_feed(url)
  parse = lambda { |io| FeedNormalizer::FeedNormalizer.parse(io) }
  # Kernel#open no longer accepts URLs on Ruby 3; URI.open is used where it is
  # available and the old form remains the fallback for older interpreters.
  URI.respond_to?(:open) ? URI.open(url, &parse) : open(url, &parse)
end

# Fetches every feed in FEEDLIST and counts the words of each article.
#
# An article's text is its title followed by its content with the markup
# stripped. Articles whose title has already been seen (in this or an earlier
# feed) are skipped.
#
# Returns a three-element Array:
#   allwords      - Hash of word => occurrences across all kept articles
#   articlewords  - Array with one Hash (word => occurrences) per kept article
#   articletitles - Array of titles, index-aligned with articlewords
def getarticlewords
  allwords = {}
  articlewords = []
  articletitles = []
  seen_titles = {} # hash lookup instead of scanning articletitles each time

  FEEDLIST.each do |url|
    feed = fetch_feed(url)
    # NOTE(review): parse presumably yields nil for input it cannot handle;
    # such a feed is skipped instead of crashing on nil.entries.
    if feed.nil?
      warn "Skipping feed that could not be parsed: #{url}"
      next
    end

    feed.entries.each do |entry|
      title = entry.title.to_s

      # Ignore identical articles
      next if seen_titles[title]
      seen_titles[title] = true

      # Extract the words; to_s guards against entries without content
      counts = {}
      separatewords(title + stripHTML(entry.content.to_s)).each do |word|
        allwords[word] = allwords.fetch(word, 0) + 1
        counts[word] = counts.fetch(word, 0) + 1
      end

      articlewords << counts
      articletitles << title
    end
  end

  [allwords, articlewords, articletitles]
end
+
# Builds the article-by-word count matrix used for factorization.
#
# allw     - Hash of word => total occurrences over all articles
# articlew - Array of per-article Hashes (word => occurrences)
#
# Only words that are common but not too common are kept as columns: a word
# needs more than 3 occurrences overall and fewer than 60% of the number of
# articles.
#
# Returns [matrix, wordvec]: matrix has one row per article and one column per
# entry of wordvec, holding that article's count of the word (0 when absent).
def makematrix(allw, articlew)
  upper_bound = articlew.length * 0.6
  wordvec = allw.keys.select { |word| allw[word] > 3 && allw[word] < upper_bound }

  # Create the word matrix
  matrix = articlew.map do |counts|
    wordvec.map { |word| counts[word] || 0 }
  end

  [matrix, wordvec]
end
+
# Writes a report of the extracted features to the file +out+.
#
# w       - weights matrix; w[j, i] is read as the weight of feature i in article j
# h       - features matrix; h.shape gives [feature count, word count] and
#           h[i, j] is read as the weight of word j in feature i
# titles  - Array of article titles, index-aligned with the rows of w
# wordvec - Array of words, index-aligned with the columns of h
# out     - path of the report file (overwritten)
#
# For every feature the report lists its six heaviest words on one line,
# followed by the three articles with the largest rounded weight
# ("weight<TAB><TAB>title") and three blank lines.
#
# Returns [toppatterns, patternnames]: toppatterns holds, for each article, a
# list of [weight, feature index, title] entries (one per feature);
# patternnames holds the six-word list of each feature.
def showfeatures(w, h, titles, wordvec, out = 'features.txt')
  File.open(out, 'w') do |outfile|
    pc, wc = h.shape
    # Exactly one list per article (an inclusive range here used to add one too many).
    toppatterns = Array.new(titles.length) { [] }
    patternnames = []

    # Loop over all the features
    pc.times do |i|
      # Words of this feature, heaviest first
      ranked = Array.new(wc) { |j| [h[i, j], wordvec[j]] }.sort.reverse

      # Print the first six elements
      names = ranked.first(6).map { |pair| pair[1] }
      outfile << names.join(", ") << "\n"
      patternnames << names

      # Create a list of articles for this feature
      flist = []
      titles.each_with_index do |title, j|
        weight = w[j, i]
        flist << [weight.round.to_i, title]
        toppatterns[j] << [weight, i, title]
      end

      # Reverse sort the list and print the top three articles
      flist.sort.reverse.first(3).each do |entry|
        outfile << entry.join("\t\t") << "\n"
      end
      outfile << "\n\n\n"
    end

    [toppatterns, patternnames]
  end
end
+
@@ -0,0 +1,35 @@
+require 'rubygems'
+require 'spec'
+require 'newsfeatures'
+require 'nnmf'
+
# Tags are removed and replaced by single separating spaces.
describe "stripping html" do
  it "should strip html" do
    markup = "<a>foo</a>bar<strong><a>baz</a></strong>"
    stripped = stripHTML(markup)
    stripped.should == ' foo bar baz '
  end
end
+
# Only words longer than three characters survive, in their original order.
describe "separating words" do
  it "should separate words" do
    sentence = "hello I am a bunch. of, words"
    separatewords(sentence).should == ['hello', 'bunch', 'words']
  end
end
+
# Factorizing a known product and multiplying the factors back together should
# reproduce the product once every cell is rounded.
describe "dealing with matrices" do
  it "should provide two matrices that when multiplied are nearly equal to the original matrix" do
    left = DMatrix[[1,2,3],[4,5,6]]
    right = DMatrix[[1,2],[3,4],[5,6]]
    w, h = factorize(left * right, 3, 100)

    rebuilt = (w * h).to_a.map { |row| row.map { |cell| cell.round } }
    rebuilt.should == (left * right).to_a
  end
end
+
# Runs the whole pipeline when this file is loaded: download the feeds, build
# the article/word matrix, factorize it and write the feature report.
allw, artw, artt = getarticlewords
wordmatrix, wordvec = makematrix(allw, artw)
v = DMatrix[*wordmatrix]

# 20 features, 50 iterations. The values are passed positionally: writing
# pc=20 inside a call would assign a local variable, not name the parameter.
weights, feat = factorize(v, 20, 50)
topp, pn = showfeatures(weights, feat, artt, wordvec)
+
@@ -0,0 +1,56 @@
+require 'linalg'
+include Linalg
+load '../array_math.rb'
+
class Linalg::DMatrix
  # Returns the matrix dimensions as the two-element Array [vsize, hsize]
  # (presumably [rows, columns] -- confirm against the linalg documentation).
  def shape
    dimensions = [vsize, hsize]
    dimensions
  end
end
+
# Measures how far apart two matrices are.
#
# a, b - objects answering [i, j]; a must also answer shape, and only the
#        cells inside a's shape are compared.
#
# Returns the sum of the squared cell-by-cell differences, i.e. the squared
# Euclidean distance (no square root is taken); 0 means the cells are equal.
def difcost(a, b)
  # Read the dimensions once instead of rebuilding them for every row.
  rows, cols = a.shape
  total = 0

  rows.times do |i|
    cols.times do |j|
      total += (a[i, j] - b[i, j])**2
    end
  end

  total
end
+
+
# Factorizes matrix v into a weights matrix w and a features matrix h such
# that w * h approximates v, refining both with multiplicative updates.
#
# v    - the matrix to factorize (must answer shape, *, and work with difcost)
# pc   - number of features, i.e. columns of w and rows of h
# iter - maximum number of update rounds
#
# The current cost (see difcost) is printed on every tenth round, and the loop
# stops early once the cost reaches exactly 0.
# NOTE(review): the element-wise * and / on the to_a arrays presumably come
# from the Array operators loaded from array_math.rb -- confirm.
#
# Returns [w, h].
def factorize(v, pc=10, iter=50)
  ic, fc = v.shape

  # Start from random weight and feature matrices
  w = DMatrix[*Array.new(ic) { Array.new(pc) { rand } }]
  h = DMatrix[*Array.new(pc) { Array.new(fc) { rand } }]

  iter.times do |round|
    # How far is the current product from the target?
    cost = difcost(v, w * h)
    puts cost if (round % 10).zero?

    # Nothing left to improve
    break if cost == 0

    # Features: h <- h * (w' v) / (w' w h), element-wise
    wt = w.transpose
    h_numerator = wt * v
    h_denominator = wt * w * h
    h = DMatrix[*(h.to_a * h_numerator.to_a / h_denominator.to_a)]

    # Weights: w <- w * (v h') / (w h h'), element-wise, using the updated h
    ht = h.transpose
    w_numerator = v * ht
    w_denominator = w * h * ht
    w = DMatrix[*(w.to_a * w_numerator.to_a / w_denominator.to_a)]
  end

  [w, h]
end
@@ -0,0 +1,27 @@
# Element-wise arithmetic between arrays of equal length. Because the elements
# are combined with their own * and /, nested arrays (matrices stored as rows)
# are handled row by row.
class Array
  # Keep the built-in Array#* (repetition and join) reachable. The guard stops
  # a second load of this file from aliasing the replacement to itself.
  unless method_defined?(:builtin_multiply) || private_method_defined?(:builtin_multiply)
    alias_method :builtin_multiply, :*
    private :builtin_multiply
  end

  # Multiplies each element by the element of +other+ at the same index.
  # When +other+ is not an Array the built-in behaviour is used, so
  # [1, 2] * 2 and %w[a b] * ',' keep working.
  #
  # Returns a new Array with one entry per element of self.
  def *(other)
    return builtin_multiply(other) unless other.is_a?(Array)

    product = []
    each_with_index { |element, index| product << (element * other[index]) }
    product
  end

  # Divides each element by the element of +other+ at the same index.
  #
  # Returns a new Array with one entry per element of self.
  def /(other)
    quotient = []
    each_with_index { |element, index| quotient << (element / other[index]) }
    quotient
  end
end
+
+
# Arrays multiply element by element.
describe "multiplying arrays" do
  it "should do simple stuff" do
    product = [1,2] * [3,4]
    product.should == [3,8]
  end
end
+
Oops, something went wrong. Retry.

0 comments on commit d0a9e9a

Please sign in to comment.