Updated plugin for Firefox 3.6. Replaced spaces with underscores in claim file names. Lucene indexing now removes stopwords.
Rob Ennals committed Jan 23, 2010
1 parent f7f1f7a commit acb83f6
Showing 8 changed files with 218 additions and 38 deletions.
2 changes: 1 addition & 1 deletion firefox_plugin/chrome/content/main.xul
@@ -34,7 +34,7 @@

<popup id="contentAreaContextMenu" insertafter="context-searchselect">
<menuseparator/>
<menuitem class="menuitem-iconic" image="chrome://thinklink/skin/lightbulb_red.png" label="Selected text makes a disputed claim" id="thinklink_menu_newsnip" oncommand="thinklink_is_disputed()"/>
<menuitem class="menuitem-iconic" image="chrome://thinklink/skin/lightbulb_red.png" label="Text like this should be highlighted as disputed" id="thinklink_menu_newsnip" oncommand="thinklink_is_disputed()"/>
<menuitem class="menuitem-iconic" image="chrome://thinklink/skin/lightbulb.png" label="Selected text supports a claim" id="thinklink_menu_newsnip" oncommand="thinklink_new_snippet(false,true)"/>
<menuitem class="menuitem-iconic" image="chrome://thinklink/skin/lightbulb.png" label="Selected text opposes a claim" id="thinklink_menu_newsnip" oncommand="thinklink_new_snippet(false,false)"/>
</popup>
7 changes: 6 additions & 1 deletion firefox_plugin/chrome/content/thinklink.js
@@ -28,7 +28,12 @@ function thinklink_msg(msg){
}

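// If the user has selected text, search the existing claims for it;
// with no selection, fall back to the general claims page.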
function thinklink_is_disputed(){
window.open("http://disputefinder.cs.berkeley.edu/pages/claims.html");
var text = content.document.getSelection();
if(!text){
window.open("http://disputefinder.cs.berkeley.edu/pages/claims.html");
}else{
window.open("http://disputefinder.cs.berkeley.edu/thinklink/search?query="+encodeURIComponent(text));
}
}

function thinklink_new_snippet(isdisputed,supports){
6 changes: 3 additions & 3 deletions firefox_plugin/install.rdf
@@ -4,14 +4,14 @@ xmlns:em="http://www.mozilla.org/2004/em-rdf#">

<Description about="urn:mozilla:install-manifest">
<em:id>thinklink@intel.com</em:id>
<em:version>0.36</em:version>
<em:version>0.37</em:version>
<em:type>2</em:type>

<em:targetApplication>
<Description>
<em:id>{ec8030f7-c20a-464f-9b0e-13a3a9e97384}</em:id>
<em:minVersion>1.5</em:minVersion>
<em:maxVersion>3.5.*</em:maxVersion>
<em:minVersion>3.5</em:minVersion>
<em:maxVersion>3.6.*</em:maxVersion>
</Description>
</em:targetApplication>

6 changes: 3 additions & 3 deletions package/install.rdf
@@ -4,14 +4,14 @@ xmlns:em="http://www.mozilla.org/2004/em-rdf#">

<Description about="urn:mozilla:install-manifest">
<em:id>thinklink@intel.com</em:id>
<em:version>0.36</em:version>
<em:version>0.37</em:version>
<em:type>2</em:type>

<em:targetApplication>
<Description>
<em:id>{ec8030f7-c20a-464f-9b0e-13a3a9e97384}</em:id>
<em:minVersion>1.5</em:minVersion>
<em:maxVersion>3.5.*</em:maxVersion>
<em:minVersion>3.5</em:minVersion>
<em:maxVersion>3.6.*</em:maxVersion>
</Description>
</em:targetApplication>

22 changes: 16 additions & 6 deletions scala/src/com/intel/thinkscala/claimfinder/ClaimFinder.scala
@@ -1,12 +1,11 @@
package com.intel.thinkscala.claimfinder
import scala.xml.NodeSeq
import scala.xml.Node
import scala.xml.{Node,NodeSeq}
import scala.xml.parsing._
import scala.io._
import java.io._
import com.intel.thinkscala.Util._
import scala.collection.mutable.ListBuffer
import scala.runtime.NonLocalReturnException
import collection.mutable.ListBuffer
import runtime.NonLocalReturnException

object ClaimFinder {
val bossKey = "NpeiOwLV34E5KHWPTxBix1HTRHe4zIj2LfTtyyDKvBdeQHOzlC_RIv4SmAPuBh3E";
@@ -83,7 +82,7 @@ object ClaimFinder {
}

def urlFileForPhraseDate(phrase : String, date : String){
val filename = new File(basepath+"/urlphrases_date/"+date+"/"+phrase.replace(" ","_")+".urls")
val filename = new File(basepath+"/urlphrases_date/"+date.replace(" ","_")+"/"+phrase.replace(" ","_")+".urls")
filename.getParentFile.mkdirs()
if(filename.exists) return
val writer = new PrintWriter(new FileWriter(filename))
@@ -92,6 +91,12 @@
writer.close
}

def getUrlsForDateRange(year : Int, month : String, daystart : Int, dayend : Int){
for(day <- daystart until dayend){
getUrlsForAllPhrasesDate(month + " " + day + " " + year)
}
}
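// (`until` is exclusive, so the loop above covers daystart .. dayend-1;
// an inclusive sweep would be written `daystart to dayend`.)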

def getUrlsForAllPhrasesDate(date : String){
phrases_that.foreach{phrase =>
System.out.print("getting urls for phrase: "+phrase+"...")
@@ -131,6 +136,7 @@
"the deception that",
"the misunderstanding that",
"false claim that",
"false claim is that",
"mistakenly believe that",
"mistaken belief that",
"the absurd idea that",
@@ -147,13 +153,17 @@
"urban legend that",
"the fantasy that",
"incorrectly claim that",
"incorrectly claimed that",
"incorrectly believe that",
"stupidly believe that",
"falsely believe that",
"wrongly believe that",
"falsely suggests that",
"falsely claims that",
"falsely stated that"
"falsely stated that",
"absurdity of the claim that",
"false ad claiming that",
"crazies who believe that"
)

val phrases_nothat = phrases_that map (phrase => phrase.replace(" that",""))
18 changes: 15 additions & 3 deletions scala/src/com/intel/thinkscala/claimfinder/ExtractClaims.scala
@@ -3,6 +3,8 @@ package com.intel.thinkscala.claimfinder
import java.io._
import java.net._
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet
import com.intel.thinkscala.Util._
import com.intel.thinkscala.util.Dataflow._
import com.intel.thinkscala.util.TabData
@@ -29,13 +31,23 @@ object ExtractClaims {
val html = downloadUrlStart(url).toLowerCase
val content = htmlToSentences(html)
val title = getTitle(html)
ClaimFinder.phrases_nothat.foreach{prefix =>
ClaimFinder.phrases_that.foreach{prefix =>
val phrase_claims = findPrefix(content,prefix,url,title)
claims.appendAll(phrase_claims)
}
claims
removeDuplicates(claims)
}

def removeDuplicates(claims : Seq[UrlClaim]) : Seq[UrlClaim] = {
val map = new HashMap[String,UrlClaim]
claims foreach {x =>
if(!map.isDefinedAt(x.claim)){
map(x.claim) = x
}
}
map.valuesIterator.toList
}

def extractAllClaims(infile : String,outfile : String) =
mapFile(infile,outfile,extractClaimsFromUrl)

@@ -49,7 +61,7 @@ object ExtractClaims {
val end = content.indexOf('.',start+prefix.length+1)
val statement = content.substring(start+prefix.length, end)
val context = trimPartWords(fuzzySubstring(content,start-500,start+500))
claims.append(new UrlClaim(url,title,statement,context))
claims.append(new UrlClaim(url,title,normalizeString(statement),context))
start = content.indexOf(prefix,start+1)
}
claims
189 changes: 172 additions & 17 deletions scala/src/com/intel/thinkscala/claimfinder/LuceneIndex.scala
@@ -1,11 +1,18 @@
package com.intel.thinkscala.claimfinder
import com.intel.thinkscala.Util._
import org.apache.lucene.document._
import org.apache.lucene.index._
import org.apache.lucene.search._
import org.apache.lucene.queryParser.QueryParser
import org.apache.lucene.store.FSDirectory
import org.apache.lucene.util.Version
import org.apache.lucene.analysis.standard.StandardAnalyzer
import org.apache.lucene.analysis.snowball.SnowballAnalyzer
import org.apache.lucene.analysis.TokenStream
import org.apache.lucene.analysis.StopAnalyzer
import org.apache.lucene.analysis.tokenattributes._
import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.ListBuffer
import java.io._
import scala.io.Source

@@ -16,46 +23,194 @@ object LuceneIndex {
doc
}

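// Each input line is a tab-separated record: url, title, claim, contents.
// Only the claim field is analyzed for search; the rest are stored verbatim.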
def addDocForFullLine(writer : IndexWriter, line : String){
val cols = line.split("\t")
if(cols.length != 4) return
val doc = new Document
doc.add(new Field("url", cols(0), Field.Store.YES, Field.Index.NO))
doc.add(new Field("title", cols(1), Field.Store.YES, Field.Index.NO))
doc.add(new Field("claim", cols(2), Field.Store.YES, Field.Index.ANALYZED))
doc.add(new Field("contents", cols(3), Field.Store.YES, Field.Index.NO))
writer.addDocument(doc)
}

def main(args : Array[String]){
val infile = args(0)
val outfile = args(1)
val writer = new IndexWriter(FSDirectory.open(new File(outfile)),
new StandardAnalyzer(Version.LUCENE_CURRENT), true, IndexWriter.MaxFieldLength.LIMITED )
new SnowballAnalyzer(Version.LUCENE_CURRENT,"English"), true, IndexWriter.MaxFieldLength.LIMITED )

Source.fromFile(new File(infile)).getLines("\n").foreach{line =>
writer.addDocument(docForLine(line))
addDocForFullLine(writer,line)
// writer.addDocument(docForLine(line))
}

writer.optimize()
writer.close
}
}

// how similar are these two phrases?
// used for clustering, and for determining if something is disputed
// ideally want to pick a good known paraphrase algorithm
object PhraseCompare {
val anal = new SnowballAnalyzer(Version.LUCENE_CURRENT,"English",stopWords)

def stopWords : Array[String] = {
val set = StopAnalyzer.ENGLISH_STOP_WORDS_SET
val arr = new Array[String](set.size)
set.toArray(arr)
arr
}

// based on "A metric for paraphrase detection"
def similarityLCP(phrase : String, other : String) : Double = {
val phrasetokens = tokens(phrase)
val othertokens = tokens(other)
var bestscore = 0.0
for(i <- 1 to 4){
val phrasengrams = ngrams(phrasetokens,i)
val otherngrams = ngrams(othertokens,i)
val overlap = phrasengrams.filter(ngram => otherngrams contains ngram)
val count_match = overlap.length.asInstanceOf[Double]
val count_total = phrasengrams.length + otherngrams.length - count_match
if(count_total > 0){
val score = count_match / count_total
bestscore = Math.max(bestscore,score)
}
}
return bestscore
}
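// Illustrative walk-through: for "global warming is a hoax" vs "global
// warming hoax", stopword removal and stemming leave identical token lists
// (global, warm, hoax), so at n=1 the score is 3/(3+3-3) = 1.0, and that
// maximum is what the function returns.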

// based on "A metric for paraphrase detection"
// seems to give pretty poor results
def similarityNgram(phrase : String, other : String) : Double = {
val phrasetokens = tokens(phrase)
val othertokens = tokens(other)
var sumscore = 0.0
val maxn = Math.min(4,phrasetokens.length)
for(i <- 1 to maxn){
val phrasengrams = ngrams(phrasetokens,i)
val otherngrams = ngrams(othertokens,i)
val overlap = phrasengrams.filter(ngram => otherngrams contains ngram)
val count_match = overlap.length.asInstanceOf[Double]
val count_total = phrasengrams.length + otherngrams.length - count_match
val score = count_match / count_total
sumscore += score
}
return sumscore / maxn
}

def similarityHasAll(phrase : String, other : String) : Boolean = {
val phrasetokens = tokens(phrase)
val othertokens = tokens(other)
!phrasetokens.exists(token => !othertokens.contains(token))
}

def similarityHasAllSameNeg(phrase : String, other : String) : Boolean
= similarityHasAll(phrase,other) && sameNeg(phrase,other)

def isNeg(phrase : String) = words(phrase).contains("not") || phrase.contains("n't")
def sameNeg(phrase : String, other : String) : Boolean = isNeg(phrase) == isNeg(other)

// based on "A word overlap baseline for the recognizing textual entailment task"
// TODO: remove stopwords
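// Here p is precision (overlap / |phrase tokens|) and r is recall
// (overlap / |other tokens|); the returned score is their harmonic mean,
// i.e. an F1 measure.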
def similarityWordOverlap(phrase : String, other : String) : Double = {
val phrasetokens = tokens(phrase)
val othertokens = tokens(other)
val overlap = phrasetokens.filter(word => othertokens contains word)
val wordoverlap = overlap.length
val p = wordoverlap.asInstanceOf[Double] / phrasetokens.length
val r = wordoverlap.asInstanceOf[Double] / othertokens.length
if(p + r > 0){
(2.0 * p * r)/(p+r)
}else{
0.0
}
}

def isSimilar(phrase : String, other : String) = similarityHasAllSameNeg(phrase,other)

def words(phrase : String) : Seq[String] = phrase.split("\\s+")

def ngrams(words : Seq[String], n : Int) : Seq[String] = {
val ngrams = new ListBuffer[String]
for(i <- 0 to words.length - n){
ngrams += words.slice(i,i+n).mkString(" ")
}
ngrams
}

def tokens(phrase : String) : Seq[String] = {
val r = new StringReader(phrase.replace("n't",""))
val s = anal.tokenStream("contents",r)
val b = new ArrayBuffer[String]
while(s.incrementToken){
b += s.getAttribute(classOf[TermAttribute]).term
}
b
}
}
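
// A hedged usage sketch (editor's illustration, not part of this commit):
// comparing a claim against its negation with the metrics above. The two
// phrases are made up for demonstration.
object PhraseCompareExample {
  def main(args : Array[String]){
    val a = "vaccines cause autism"
    val b = "vaccines do not cause autism"
    println(PhraseCompare.similarityLCP(a,b))         // high n-gram overlap score
    println(PhraseCompare.similarityWordOverlap(a,b)) // F1-style word overlap
    println(PhraseCompare.isSimilar(a,b))             // false: only b contains "not"
  }
}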

object LuceneSearch {
def main(args : Array[String]){
val indexfile = args(0)
def getResults(indexfile : String, query : String) : Array[Document] = {
val reader = IndexReader.open(FSDirectory.open(new File(indexfile)))
val searcher = new IndexSearcher(reader);
val analyser = new StandardAnalyzer(Version.LUCENE_CURRENT)
val parser = new QueryParser(Version.LUCENE_CURRENT, "contents", analyser)
val analyser = new SnowballAnalyzer(Version.LUCENE_CURRENT,"English")
val parser = new QueryParser(Version.LUCENE_CURRENT, "claim", analyser)

val input = query.trim
val collector = TopScoreDocCollector.create(20, false)
searcher.search(parser.parse(input),collector)
val hits = collector.topDocs().scoreDocs
val results = hits.map {hit =>
searcher.doc(hit.doc)
}
reader.close
results
}

def isDisputed(indexfile : String, text : String) : Boolean = {
val results = getResults(indexfile,text)
results exists (doc => PhraseCompare.isSimilar(doc.get("claim"),text))
}

def main(args : Array[String]){
val in = new BufferedReader(new InputStreamReader(System.in,"UTF-8"))

while(true){
System.out.println("Enter query:")
val input = in.readLine().trim
val input = in.readLine()
System.out.println("Searching for : "+input)
val query = parser.parse(input)

val collector = TopScoreDocCollector.create(10, false)
searcher.search(query,collector)
System.out.println(collector.getTotalHits + " results")
val hits = collector.topDocs().scoreDocs
hits foreach {hit =>
val doc = searcher.doc(hit.doc)
System.out.println(hit.score + " - " + doc.get("contents"))
val results = getResults(args(0),input)
results foreach {doc =>
System.out.println(doc.get("claim") + "\n\t" + domainForUrl(doc.get("url")) + " - "
+ doc.get("title"))
}
}
//
// val indexfile = args(0)
// val reader = IndexReader.open(FSDirectory.open(new File(indexfile)))
// val searcher = new IndexSearcher(reader);
// val analyser = new StandardAnalyzer(Version.LUCENE_CURRENT)
// val parser = new QueryParser(Version.LUCENE_CURRENT, "contents", analyser)
//
// val in = new BufferedReader(new InputStreamReader(System.in,"UTF-8"))
//
// while(true){
// System.out.println("Enter query:")
// val input = in.readLine().trim
// System.out.println("Searching for : "+input)
// val query = parser.parse(input)
//
// val collector = TopScoreDocCollector.create(20, false)
// searcher.search(query,collector)
// System.out.println(collector.getTotalHits + " results")
// val hits = collector.topDocs().scoreDocs
// hits foreach {hit =>
// val doc = searcher.doc(hit.doc)
// System.out.println(hit.score + " - " + doc.get("contents"))
// }
// }
}

}
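
// A hedged usage sketch (editor's illustration, not part of this commit):
// wiring getResults/isDisputed together. The index path is a placeholder.
object DisputeCheckExample {
  def main(args : Array[String]){
    val indexfile = "/tmp/claim-index"        // hypothetical index location
    val text = "the moon landing was faked"
    if(LuceneSearch.isDisputed(indexfile, text))
      println("disputed: the index holds a similar known claim")
    else
      println("no matching disputed claim found")
  }
}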
