Updated plugin for Firefox 3.6. Replaced spaces with underscores in claim file names. Lucene indexing now removes stopwords.
Rob Ennals committed Jan 23, 2010
1 parent f7f1f7a commit acb83f6
Showing 8 changed files with 218 additions and 38 deletions.
2 changes: 1 addition & 1 deletion firefox_plugin/chrome/content/main.xul
@@ -34,7 +34,7 @@

<popup id="contentAreaContextMenu" insertafter="context-searchselect">
<menuseparator/>
<menuitem class="menuitem-iconic" image="chrome://thinklink/skin/lightbulb_red.png" label="Selected text makes a disputed claim" id="thinklink_menu_newsnip" oncommand="thinklink_is_disputed()"/>
<menuitem class="menuitem-iconic" image="chrome://thinklink/skin/lightbulb_red.png" label="Text like this should be highlighted as disputed" id="thinklink_menu_newsnip" oncommand="thinklink_is_disputed()"/>
<menuitem class="menuitem-iconic" image="chrome://thinklink/skin/lightbulb.png" label="Selected text supports a claim" id="thinklink_menu_newsnip" oncommand="thinklink_new_snippet(false,true)"/>
<menuitem class="menuitem-iconic" image="chrome://thinklink/skin/lightbulb.png" label="Selected text opposes a claim" id="thinklink_menu_newsnip" oncommand="thinklink_new_snippet(false,false)"/>
</popup>
7 changes: 6 additions & 1 deletion firefox_plugin/chrome/content/thinklink.js
@@ -28,7 +28,12 @@ function thinklink_msg(msg){
}

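// If the user has selected text, search the existing claims for it;
// with no selection, fall back to the general claims page.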
function thinklink_is_disputed(){
window.open("http://disputefinder.cs.berkeley.edu/pages/claims.html");
var text = content.document.getSelection();
if(!text){
window.open("http://disputefinder.cs.berkeley.edu/pages/claims.html");
}else{
window.open("http://disputefinder.cs.berkeley.edu/thinklink/search?query="+encodeURIComponent(text));
}
}

function thinklink_new_snippet(isdisputed,supports){
6 changes: 3 additions & 3 deletions firefox_plugin/install.rdf
@@ -4,14 +4,14 @@ xmlns:em="http://www.mozilla.org/2004/em-rdf#">

<Description about="urn:mozilla:install-manifest">
<em:id>thinklink@intel.com</em:id>
<em:version>0.36</em:version>
<em:version>0.37</em:version>
<em:type>2</em:type>

<em:targetApplication>
<Description>
<em:id>{ec8030f7-c20a-464f-9b0e-13a3a9e97384}</em:id>
<em:minVersion>1.5</em:minVersion>
<em:maxVersion>3.5.*</em:maxVersion>
<em:minVersion>3.5</em:minVersion>
<em:maxVersion>3.6.*</em:maxVersion>
</Description>
</em:targetApplication>

6 changes: 3 additions & 3 deletions package/install.rdf
@@ -4,14 +4,14 @@ xmlns:em="http://www.mozilla.org/2004/em-rdf#">

<Description about="urn:mozilla:install-manifest">
<em:id>thinklink@intel.com</em:id>
<em:version>0.36</em:version>
<em:version>0.37</em:version>
<em:type>2</em:type>

<em:targetApplication>
<Description>
<em:id>{ec8030f7-c20a-464f-9b0e-13a3a9e97384}</em:id>
<em:minVersion>1.5</em:minVersion>
<em:maxVersion>3.5.*</em:maxVersion>
<em:minVersion>3.5</em:minVersion>
<em:maxVersion>3.6.*</em:maxVersion>
</Description>
</em:targetApplication>

22 changes: 16 additions & 6 deletions scala/src/com/intel/thinkscala/claimfinder/ClaimFinder.scala
@@ -1,12 +1,11 @@
package com.intel.thinkscala.claimfinder
import scala.xml.NodeSeq
import scala.xml.Node
import scala.xml.{Node,NodeSeq}
import scala.xml.parsing._
import scala.io._
import java.io._
import com.intel.thinkscala.Util._
import scala.collection.mutable.ListBuffer
import scala.runtime.NonLocalReturnException
import collection.mutable.ListBuffer
import runtime.NonLocalReturnException

object ClaimFinder {
val bossKey = "NpeiOwLV34E5KHWPTxBix1HTRHe4zIj2LfTtyyDKvBdeQHOzlC_RIv4SmAPuBh3E";
@@ -83,7 +82,7 @@ object ClaimFinder {
}

def urlFileForPhraseDate(phrase : String, date : String){
val filename = new File(basepath+"/urlphrases_date/"+date+"/"+phrase.replace(" ","_")+".urls")
val filename = new File(basepath+"/urlphrases_date/"+date.replace(" ","_")+"/"+phrase.replace(" ","_")+".urls")
filename.getParentFile.mkdirs()
if(filename.exists) return
val writer = new PrintWriter(new FileWriter(filename))
@@ -92,6 +91,12 @@
writer.close
}

def getUrlsForDateRange(year : Int, month : String, daystart : Int, dayend : Int){
for(day <- daystart until dayend){
getUrlsForAllPhrasesDate(month + " " + day + " " + year)
}
}
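// (`until` is exclusive, so the loop above covers daystart .. dayend-1;
// an inclusive sweep would be written `daystart to dayend`.)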

def getUrlsForAllPhrasesDate(date : String){
phrases_that.foreach{phrase =>
System.out.print("getting urls for phrase: "+phrase+"...")
@@ -131,6 +136,7 @@
"the deception that",
"the misunderstanding that",
"false claim that",
"false claim is that",
"mistakenly believe that",
"mistaken belief that",
"the absurd idea that",
@@ -147,13 +153,17 @@
"urban legend that",
"the fantasy that",
"incorrectly claim that",
"incorrectly claimed that",
"incorrectly believe that",
"stupidly believe that",
"falsely believe that",
"wrongly believe that",
"falsely suggests that",
"falsely claims that",
"falsely stated that"
"falsely stated that",
"absurdity of the claim that",
"false ad claiming that",
"crazies who believe that"
)

val phrases_nothat = phrases_that map (phrase => phrase.replace(" that",""))
18 changes: 15 additions & 3 deletions scala/src/com/intel/thinkscala/claimfinder/ExtractClaims.scala
@@ -3,6 +3,8 @@ package com.intel.thinkscala.claimfinder
import java.io._
import java.net._
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet
import com.intel.thinkscala.Util._
import com.intel.thinkscala.util.Dataflow._
import com.intel.thinkscala.util.TabData
@@ -29,13 +31,23 @@ object ExtractClaims {
val html = downloadUrlStart(url).toLowerCase
val content = htmlToSentences(html)
val title = getTitle(html)
ClaimFinder.phrases_nothat.foreach{prefix =>
ClaimFinder.phrases_that.foreach{prefix =>
val phrase_claims = findPrefix(content,prefix,url,title)
claims.appendAll(phrase_claims)
}
claims
removeDuplicates(claims)
}

def removeDuplicates(claims : Seq[UrlClaim]) : Seq[UrlClaim] = {
val map = new HashMap[String,UrlClaim]
claims foreach {x =>
if(!map.isDefinedAt(x.claim)){
map(x.claim) = x
}
}
map.valuesIterator.toList
}

def extractAllClaims(infile : String,outfile : String) =
mapFile(infile,outfile,extractClaimsFromUrl)

@@ -49,7 +61,7 @@ object ExtractClaims {
val end = content.indexOf('.',start+prefix.length+1)
val statement = content.substring(start+prefix.length, end)
val context = trimPartWords(fuzzySubstring(content,start-500,start+500))
claims.append(new UrlClaim(url,title,statement,context))
claims.append(new UrlClaim(url,title,normalizeString(statement),context))
start = content.indexOf(prefix,start+1)
}
claims
189 changes: 172 additions & 17 deletions scala/src/com/intel/thinkscala/claimfinder/LuceneIndex.scala
@@ -1,11 +1,18 @@
package com.intel.thinkscala.claimfinder
import com.intel.thinkscala.Util._
import org.apache.lucene.document._
import org.apache.lucene.index._
import org.apache.lucene.search._
import org.apache.lucene.queryParser.QueryParser
import org.apache.lucene.store.FSDirectory
import org.apache.lucene.util.Version
import org.apache.lucene.analysis.standard.StandardAnalyzer
import org.apache.lucene.analysis.snowball.SnowballAnalyzer
import org.apache.lucene.analysis.TokenStream
import org.apache.lucene.analysis.StopAnalyzer
import org.apache.lucene.analysis.tokenattributes._
import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.ListBuffer
import java.io._
import scala.io.Source

@@ -16,46 +23,194 @@ object LuceneIndex {
doc
}

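// Each input line is a tab-separated record: url, title, claim, contents.
// Only the claim field is analyzed for search; the rest are stored verbatim.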
def addDocForFullLine(writer : IndexWriter, line : String){
val cols = line.split("\t")
if(cols.length != 4) return
val doc = new Document
doc.add(new Field("url", cols(0), Field.Store.YES, Field.Index.NO))
doc.add(new Field("title", cols(1), Field.Store.YES, Field.Index.NO))
doc.add(new Field("claim", cols(2), Field.Store.YES, Field.Index.ANALYZED))
doc.add(new Field("contents", cols(3), Field.Store.YES, Field.Index.NO))
writer.addDocument(doc)
}

def main(args : Array[String]){
val infile = args(0)
val outfile = args(1)
val writer = new IndexWriter(FSDirectory.open(new File(outfile)),
new StandardAnalyzer(Version.LUCENE_CURRENT), true, IndexWriter.MaxFieldLength.LIMITED )
new SnowballAnalyzer(Version.LUCENE_CURRENT,"English"), true, IndexWriter.MaxFieldLength.LIMITED )

Source.fromFile(new File(infile)).getLines("\n").foreach{line =>
writer.addDocument(docForLine(line))
addDocForFullLine(writer,line)
// writer.addDocument(docForLine(line))
}

writer.optimize()
writer.close
}
}

// how similar are these two phrases?
// used for clustering, and for determining if something is disputed
// ideally want to pick a good known paraphrase algorithm
object PhraseCompare {
val anal = new SnowballAnalyzer(Version.LUCENE_CURRENT,"English",stopWords)

def stopWords : Array[String] = {
val set = StopAnalyzer.ENGLISH_STOP_WORDS_SET
val arr = new Array[String](set.size)
set.toArray(arr)
arr
}

// based on "A metric for paraphrase detection"
def similarityLCP(phrase : String, other : String) : Double = {
val phrasetokens = tokens(phrase)
val othertokens = tokens(other)
var bestscore = 0.0
for(i <- 1 to 4){
val phrasengrams = ngrams(phrasetokens,i)
val otherngrams = ngrams(othertokens,i)
val overlap = phrasengrams.filter(ngram => otherngrams contains ngram)
val count_match = overlap.length.asInstanceOf[Double]
val count_total = phrasengrams.length + otherngrams.length - count_match
if(count_total > 0){
val score = count_match / count_total
bestscore = Math.max(bestscore,score)
}
}
return bestscore
}
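// Illustrative walk-through: for "global warming is a hoax" vs "global
// warming hoax", stopword removal and stemming leave identical token lists
// (global, warm, hoax), so at n=1 the score is 3/(3+3-3) = 1.0, and that
// maximum is what the function returns.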

// based on "A metric for paraphrase detection"
// seems to give pretty poor results
def similarityNgram(phrase : String, other : String) : Double = {
val phrasetokens = tokens(phrase)
val othertokens = tokens(other)
var sumscore = 0.0
val maxn = Math.min(4,phrasetokens.length)
for(i <- 1 to maxn){
val phrasengrams = ngrams(phrasetokens,i)
val otherngrams = ngrams(othertokens,i)
val overlap = phrasengrams.filter(ngram => otherngrams contains ngram)
val count_match = overlap.length.asInstanceOf[Double]
val count_total = phrasengrams.length + otherngrams.length - count_match
val score = count_match / count_total
sumscore += score
}
return sumscore / maxn
}

def similarityHasAll(phrase : String, other : String) : Boolean = {
val phrasetokens = tokens(phrase)
val othertokens = tokens(other)
!phrasetokens.exists(token => !othertokens.contains(token))
}

def similarityHasAllSameNeg(phrase : String, other : String) : Boolean
= similarityHasAll(phrase,other) && sameNeg(phrase,other)

def isNeg(phrase : String) = words(phrase).contains("not") || phrase.contains("n't")
def sameNeg(phrase : String, other : String) : Boolean = isNeg(phrase) == isNeg(other)

// based on "A word overlap baseline for the recognizing textual entailment task"
// TODO: remove stopwords
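// Here p is precision (overlap / |phrase tokens|) and r is recall
// (overlap / |other tokens|); the returned score is their harmonic mean,
// i.e. an F1 measure.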
def similarityWordOverlap(phrase : String, other : String) : Double = {
val phrasetokens = tokens(phrase)
val othertokens = tokens(other)
val overlap = phrasetokens.filter(word => othertokens contains word)
val wordoverlap = overlap.length
val p = wordoverlap.asInstanceOf[Double] / phrasetokens.length
val r = wordoverlap.asInstanceOf[Double] / othertokens.length
if(p + r > 0){
(2.0 * p * r)/(p+r)
}else{
0.0
}
}

def isSimilar(phrase : String, other : String) = similarityHasAllSameNeg(phrase,other)

def words(phrase : String) : Seq[String] = phrase.split("\\s+")

def ngrams(words : Seq[String], n : Int) : Seq[String] = {
val ngrams = new ListBuffer[String]
for(i <- 0 to words.length - n){
ngrams += words.slice(i,i+n).mkString(" ")
}
ngrams
}

def tokens(phrase : String) : Seq[String] = {
val r = new StringReader(phrase.replace("n't",""))
val s = anal.tokenStream("contents",r)
val b = new ArrayBuffer[String]
while(s.incrementToken){
b += s.getAttribute(classOf[TermAttribute]).term
}
b
}
}
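
// A hedged usage sketch (editor's illustration, not part of this commit):
// comparing a claim against its negation with the metrics above. The two
// phrases are made up for demonstration.
object PhraseCompareExample {
  def main(args : Array[String]){
    val a = "vaccines cause autism"
    val b = "vaccines do not cause autism"
    println(PhraseCompare.similarityLCP(a,b))         // high n-gram overlap score
    println(PhraseCompare.similarityWordOverlap(a,b)) // F1-style word overlap
    println(PhraseCompare.isSimilar(a,b))             // false: only b contains "not"
  }
}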

object LuceneSearch {
def main(args : Array[String]){
val indexfile = args(0)
def getResults(indexfile : String, query : String) : Array[Document] = {
val reader = IndexReader.open(FSDirectory.open(new File(indexfile)))
val searcher = new IndexSearcher(reader);
val analyser = new StandardAnalyzer(Version.LUCENE_CURRENT)
val parser = new QueryParser(Version.LUCENE_CURRENT, "contents", analyser)
val analyser = new SnowballAnalyzer(Version.LUCENE_CURRENT,"English")
val parser = new QueryParser(Version.LUCENE_CURRENT, "claim", analyser)

val input = query.trim
val collector = TopScoreDocCollector.create(20, false)
searcher.search(parser.parse(input),collector)
val hits = collector.topDocs().scoreDocs
val results = hits.map {hit =>
searcher.doc(hit.doc)
}
reader.close
results
}

def isDisputed(indexfile : String, text : String) : Boolean = {
val results = getResults(indexfile,text)
results exists (doc => PhraseCompare.isSimilar(doc.get("claim"),text))
}

def main(args : Array[String]){
val in = new BufferedReader(new InputStreamReader(System.in,"UTF-8"))

while(true){
System.out.println("Enter query:")
val input = in.readLine().trim
val input = in.readLine()
System.out.println("Searching for : "+input)
val query = parser.parse(input)

val collector = TopScoreDocCollector.create(10, false)
searcher.search(query,collector)
System.out.println(collector.getTotalHits + " results")
val hits = collector.topDocs().scoreDocs
hits foreach {hit =>
val doc = searcher.doc(hit.doc)
System.out.println(hit.score + " - " + doc.get("contents"))
val results = getResults(args(0),input)
results foreach {doc =>
System.out.println(doc.get("claim") + "\n\t" + domainForUrl(doc.get("url")) + " - "
+ doc.get("title"))
}
}
//
// val indexfile = args(0)
// val reader = IndexReader.open(FSDirectory.open(new File(indexfile)))
// val searcher = new IndexSearcher(reader);
// val analyser = new StandardAnalyzer(Version.LUCENE_CURRENT)
// val parser = new QueryParser(Version.LUCENE_CURRENT, "contents", analyser)
//
// val in = new BufferedReader(new InputStreamReader(System.in,"UTF-8"))
//
// while(true){
// System.out.println("Enter query:")
// val input = in.readLine().trim
// System.out.println("Searching for : "+input)
// val query = parser.parse(input)
//
// val collector = TopScoreDocCollector.create(20, false)
// searcher.search(query,collector)
// System.out.println(collector.getTotalHits + " results")
// val hits = collector.topDocs().scoreDocs
// hits foreach {hit =>
// val doc = searcher.doc(hit.doc)
// System.out.println(hit.score + " - " + doc.get("contents"))
// }
// }
}

}
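
// A hedged usage sketch (editor's illustration, not part of this commit):
// wiring getResults/isDisputed together. The index path is a placeholder.
object DisputeCheckExample {
  def main(args : Array[String]){
    val indexfile = "/tmp/claim-index"        // hypothetical index location
    val text = "the moon landing was faked"
    if(LuceneSearch.isDisputed(indexfile, text))
      println("disputed: the index holds a similar known claim")
    else
      println("no matching disputed claim found")
  }
}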
