porting in some old material
sbenthall committed May 8, 2012
1 parent 30e0353 commit fcdc46e
Showing 4 changed files with 89 additions and 47 deletions.
12 changes: 6 additions & 6 deletions classify.py
@@ -19,7 +19,10 @@ def __init__(self,tm,sm):
self.sm = sm

def features(self,i):
return dict([t for t in enumerate(self.tm[i,:])])
if len(self.tm.shape) == 1:
return {"1": self.tm[i]}
else:
return dict([t for t in enumerate(self.tm[i,:])])

def training_example(self,i):
label = "spam" if self.sm[i] else "ham"
@@ -47,18 +50,15 @@ def test(self,classifier, testing_indices):
accurate_test_results = [label == ('spam' if self.sm[testing_indices[i]] else 'ham') for i,label in enumerate(test_results)]
return test_results, accurate_test_results

def ratio(self,data):
return float(sum(data)) / len(data)

def compute_classifier_success(self,n_for_validation=N):
ratios = []

for n in range(n_for_validation):
train_i, test_i = self.sample_indices()
classifier = self.train(train_i)
results, accuracy = self.test(classifier, test_i)

ratios.append(self.ratio(accuracy))
import pdb; pdb.set_trace()
ratios.append(numpy.mean(accuracy))

mean_success = numpy.mean(ratios)

52 changes: 52 additions & 0 deletions cs261-writeup/finalreport.tex
@@ -27,13 +27,65 @@

\section{Introduction}

Links can hold surprises.
In social media especially, users encounter links in fleeting messages that may contain little context.
This environment enables deception in the presentation of links.
Socially, deception can be a game, as in the rick-rolling prank, whereby a user claims to be sharing a serious article but instead links to a dated music video.
Is link deception used in spam and cybercrime?
Our study investigates this connection in the context of Twitter.

We operationalize deception in terms of the similarity between a message (tweet) containing a link and the content of the HTML to which the link resolves: the lower the similarity, the more likely the link is deceptive.
As deception involves an element of subjective interpretation, we first use an algorithmic method of approximating latent semantic structure: topic modeling.
Topic models provide a means of dimensionality reduction over lexical features (words) in a way that can capture interpretable categories of content.
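
As a minimal sketch of this measure (illustrative only; the variable names here are assumptions rather than the project's code), the similarity between a tweet's topic distribution and that of the linked page can be computed as a cosine:

\begin{verbatim}
import numpy

def topic_cosine(tweet_topics, html_topics):
    # Both arguments are dense topic-weight vectors of equal length.
    denom = numpy.linalg.norm(tweet_topics) * numpy.linalg.norm(html_topics)
    if denom == 0:
        return 0.0
    return numpy.dot(tweet_topics, html_topics) / denom

# A low cosine value marks dissonance between the tweet and the linked page.
\end{verbatim}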

Our guiding hypothesis is that spammy or criminal tweets will have more dissonance between their contextual information and linked content.
We test this hypothesis using data provided by the Monarch project.

Vern says: The big question I had concerns
the future work, for which you frame two possible approaches, grouping
tweets or using text extracted from the crawl, and state you're prioritizing
the former. My sense of what was particularly interesting about this
project was the possibility of the latter, i.e., identifying dissonance
between tweet topic and web page content. What has you thinking of not
pursuing that but instead grouping tweets?

\section{Prior Work}

\section{The dataset}

Monarch data



\section{Language detection for tweets}

\section{Topic modeling on twitter}

Data cleaning

stopword removal



Vern says: Your writeup should make some things clear that here weren't so much, such as just what constitutes a "document" (a single tweet? that appeared to be the meaning), how LDA works (don't assume I know - because I don't!), what you mean by "parsity" (sparsity?), and what the reader is supposed to make of the LDA output in the appendix.

Our goal for the first segment of our research was to train a classifier based on learned topics from the data and troubleshoot the process along the way.

We received a sample of the Monarch data from Chris Grier and limited our study to Twitter data (as opposed to emails). In order to keep our preliminary results interpretable, we further filtered this data to include only English-language tweets. We detected tweet language by computing lexical compressibility against corpora of English, French, Chinese, and other languages.
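
The following is a rough sketch of that compressibility test (the corpus handling and names are illustrative assumptions, not our exact implementation): a tweet is assigned the language of the reference corpus against which it compresses most cheaply.

\begin{verbatim}
import zlib

def compression_cost(text, corpus):
    # Extra compressed bytes needed when the text is appended to the corpus;
    # a smaller cost means the text looks more like that language.
    base = len(zlib.compress(corpus.encode('utf-8')))
    combined = len(zlib.compress((corpus + ' ' + text).encode('utf-8')))
    return combined - base

def detect_language(text, corpora):
    # corpora maps a language name to a reference text sample.
    return min(corpora, key=lambda lang: compression_cost(text, corpora[lang]))
\end{verbatim}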

Focusing on the tweet messages themselves, we cleaned the data by removing stopwords. Our stopword list combined commonly used English stopwords with terms chosen informally from observed term frequencies, to account for the peculiarities of lexical feature frequency on Twitter.
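
A sketch of how such a frequency-informed stopword list can be assembled (the threshold and names are illustrative assumptions):

\begin{verbatim}
from collections import Counter

def build_stopwords(tweets, base_stopwords, top_k=50):
    # Treat the most frequent terms across all tweets as additional
    # stopwords, alongside a standard English list.
    counts = Counter(word for tweet in tweets for word in tweet.lower().split())
    frequent = set(word for word, _ in counts.most_common(top_k))
    return set(base_stopwords) | frequent
\end{verbatim}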

Running the standard LDA algorithm on these short messages produced two results (see the sketch following this list):
\begin{itemize}
\item a list of topics, each labeled with `topic keys': the most prominent words appearing in that topic;
\item an inferred distribution of topics for each document.
\end{itemize}
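
A minimal sketch of obtaining both outputs with gensim, the library used elsewhere in this repository (corpus construction is elided and the names are illustrative):

\begin{verbatim}
import gensim

# `corpus` is a list of bag-of-words documents and `id2word`
# maps term ids back to words.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, id2word=id2word,
                                           num_topics=100)

# Topic keys: the most prominent words in each learned topic.
for topic in ldamodel.show_topics():
    print topic

# Inferred topic distribution for a single document, as a sparse
# list of (topic_id, weight) pairs.
doc_bow = corpus[0]
print ldamodel[doc_bow]
\end{verbatim}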

The results from the topic keys were promising, with several of the topics corresponding to domain knowledge of what constitutes spam. For example, one topic was characterized by these keywords:

news world iphone win tv apple google watch ipad ap hey app price search sports click buy giveaway case ipod late shows gb touch tech test year deal series

Using the topic distribution for each document as a feature vector, we trained a naive Bayes classifier and tested its spam detection accuracy against the labeled data, using n-fold cross validation.
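
The sketch below shows one way this step can be wired up, assuming nltk's naive Bayes classifier; the committed classify.py may differ in its details, and the names here are illustrative.

\begin{verbatim}
import nltk
import numpy

def featurize(topic_vector):
    # One feature per topic: the inferred weight of that topic.
    return dict(enumerate(topic_vector))

def cross_validate(topic_data, spam_labels, n_folds=8):
    # n-fold cross validation of a naive Bayes spam classifier
    # trained on per-document topic distributions.
    indices = numpy.random.permutation(len(spam_labels))
    accuracies = []
    for fold in numpy.array_split(indices, n_folds):
        held_out = set(fold.tolist())
        train = [(featurize(topic_data[i]),
                  'spam' if spam_labels[i] else 'ham')
                 for i in indices if i not in held_out]
        classifier = nltk.NaiveBayesClassifier.train(train)
        hits = [classifier.classify(featurize(topic_data[i])) ==
                ('spam' if spam_labels[i] else 'ham')
                for i in fold]
        accuracies.append(numpy.mean(hits))
    return numpy.mean(accuracies)
\end{verbatim}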

The performance of this classifier was underwhelming (0.57 accuracy against a spam base rate of 0.73). However, this is unsurprising given the sparsity of the data we were using for this particular iteration.
\subsection{Clean-up process}

\section{Detecting deceptive tweets}
6 changes: 4 additions & 2 deletions ldasimlib.py
@@ -97,14 +97,16 @@ def tweet_bags2ldanpy(tweet_bags,ldamodel,save_npy=True,num_topics=100):

tweet_topic_data = zeros((num_bow,num_topics))
html_topic_data = zeros((num_bow,num_topics))
sim_data = zeros(num_bow)
spam_data = zeros(num_bow)

for i,key in enumerate(tweet_bags.keys()):
entry = tweet_bags[key]
spam_data[i] = 1 if entry["is_spam"] else 0
twv = ldamodel[entry["tw"]]
htv = ldamodel[entry["ht"]]

sim_data[i] = cosine(twv,htv)

for topic,value in twv:
tweet_topic_data[i][topic] = value
for topic,value in htv:
@@ -115,7 +117,7 @@ def tweet_bags2ldanpy(tweet_bags,ldamodel,save_npy=True,num_topics=100):
save('html_topic_data',html_topic_data)
save('spam_data',spam_data)

return tweet_topic_data, html_topic_data, spam_data
return tweet_topic_data, html_topic_data, sim_data, spam_data

# compute jaccard similarity between two bags of words
def jaccard(tw,ht,model='dummy'):
66 changes: 27 additions & 39 deletions ldasimscript.py
@@ -3,47 +3,37 @@
from ldasimlib import *
import cPickle
import gensim
#import classify
import classify
from zlib import decompress
import sys

n_topics = int(sys.argv[1])
print "Called with n_topics=%d"%n_topics

HTML_DATA = 'data/html.db'
from numpy import mean

print 'Loading data'

i2w, w2i = cPickle.load(open('data/i2w-w2i.db','r'))
html=cPickle.load(open(HTML_DATA, 'r'))
i2w, w2i = cPickle.load(open('i2w-w2i.db','r'))
html=cPickle.load(open('html.db', 'r'))

#print 'Loading corpus'
#corpus = cPickle.load(open('corpus.pkl','r'))
#print 'Saving model'
#model.save('model.pkl')
#print 'Loading LDA model'
#ldamodel = gensim.models.ldamodel.LdaModel.load('model.pkl')

#print 'Loading tweets'
#tweets=cPickle.load(open('../data/tweets+.db', 'r'))
print 'Loading tweets'
tweets=cPickle.load(open('tweets+.db', 'r'))

#print 'Filtering tweets'
#itweet_bags = filter_tweets(tweets,html,w2i,cutoff=5000)
#cPickle.dump(tweet_bags,open('tweet-bags.pkl','w'))
print 'Filtering tweets'
tweet_bags = filter_tweets(tweets,html,w2i,cutoff=5000)
cPickle.dump(tweet_bags,open('tweet-bags.pkl','w'))

#print 'Computing similarities with Jaccard'
#ham,spam = ham_spam_similarities(tweet_bags,jaccard,tweets)
#make_cdf(ham,spam, 'Jaccard')

def ldasimtest(num_topics):
print 'Generating model for %d topics'%num_topics
ldamodel=gensim.models.ldamodel.LdaModel(build_corpus_iterator(html, w2i, cutoff=None), id2word=i2w, num_topics=num_topics)

print 'Saving model'
ldamodel.save("LDAmodel-%dt.pkl" % (num_topics))
#print 'Generating model'
ldamodel=gensim.models.ldamodel.LdaModel(build_corpus_iterator(html,w2i), id2word=i2w, num_topics=num_topics)

"""print 'Generating Numpy arrays for spam, tweet topics, and html topics'
tweet_topic_data, html_topic_data, spam_data = tweet_bags2ldanpy(tweet_bags,ldamodel,num_topics=num_topics)
#cPickle.dump(ldamodel,open("LDAmodel-%dt.pkl" % (num_topics),'w'))

print 'Generating Numpy arrays for spam, tweet topics, and html topics'
tweet_topic_data, html_topic_data, sim_data, spam_data = tweet_bags2ldanpy(tweet_bags,ldamodel,num_topics=num_topics)

print 'Creating LdaSpamClassifier object for testing classifiers'
classifier = classify.LdaSpamClassifierTester(tweet_topic_data,spam_data)
Expand All @@ -52,22 +42,20 @@ def ldasimtest(num_topics):
classifier = classify.LdaSpamClassifierTester(html_topic_data,spam_data)
html_success = classifier.compute_classifier_success(8)

print "Tweet success: %f, HTML success: %f"%(tweet_success,html_success)
print 'Computing similarities with LDA Cosine'
ham,spam = ham_spam_similarities(tweet_bags,lda_cosine,tweets,ldamodel)
make_cdf(ham,spam, 'lda cosine',file_name="ldacosine_cdf_%d"%(num_topics))"""

ldasimtest(n_topics)
classifier = classify.LdaSpamClassifierTester(sim_data,spam_data)
sim_success = classifier.compute_classifier_success(8)

#ldasimtest(5)
#ldasimtest(10)
#ldasimtest(20)
#ldasimtest(40)
#ldasimtest(80)

exit(0)
print "Tweet success: %f, HTML success: %f, Sim success: %f"%(tweet_success,html_success, sim_success)

print 'Computing similarities with LDA Cosine'
#ham,spam = ham_spam_similarities(tweet_bags,lda_cosine,tweets,ldamodel)
#make_cdf(ham,spam, 'lda cosine',file_name="ldacosine_cdf_%d"%(num_topics))

ldasimtest(5)
ldasimtest(10)
ldasimtest(20)
ldasimtest(40)
ldasimtest(80)

print 'Generating TF-IDF model'
tfidfmodel = gensim.models.tfidfmodel.TfidfModel(build_corpus_iterator(html,w2i))
