Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit c1c6be9
Showing
45 changed files
with
4,426 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
# NBSVM | ||
Since I still receive a good number of emails 4 years later,
I decided to put this code on github and write the | ||
instructions better. The code is still just as bad. | ||
|
||
For technical details see [our paper](wang12simple.pdf) and | ||
[our talk](wang12simple_slides.pdf). | ||
|
||
``` | ||
@inproceedings{wang12simple,
  author = {Wang, Sida I. and Manning, Christopher D.},
  booktitle = {Proceedings of the ACL},
  title = {Baselines and Bigrams: Simple, Good Sentiment and Topic Classification},
  year = {2012},
  pages = {90-94}
}
``` | ||
|
||
## Running NBSVM | ||
- Download the data and override the empty data directory in root: "data/rt10662/unigram_rts.mat" | ||
- Go to src and run the script master.m to produce the results from the paper | ||
- Results and details are logged in resultslog.txt and details.txt, respectively | ||
- A table with all the results is printed to the screen | ||
|
||
## The data | ||
- | ||
[data](http://www.stanford.edu/~sidaw/projects/data_NB_ACL12.zip) - 404.4MB includes all the data | ||
- | ||
[data_small](http://www.stanford.edu/~sidaw/projects/datasmall_NB_ACL12.zip) - 108.5MB | ||
data_small = data_all - large_IMDB | ||
|
||
- For each data set, there is a corresponding folder data/$DatasetName. | ||
- You can find $FeatureType_$DatasetName.mat in data/$DatasetName, where | ||
$FeatureType is unigram or bigram. | ||
- data/$DatasetName/cv_obj.mat determines the standard evaluation for each dataset (how many | ||
folds, what's the split, etc.). They are generated by corresponding
data processing script in src/misc | ||
|
||
## Notes and comments | ||
- The datasets are collected by others, please cite the original sources if you work with them | ||
- The data structure used kept the order information of the document, instead of | ||
converting to bag-of-words vector right away. This resulted in some | ||
unnecessary mess for this work, but might make it easier if you want | ||
to try a more complex model. | ||
- While many experiments have been ran for this task, performance is | ||
really all about regularization, and even the simplest model (Naive | ||
Bayes) would fit the training set perfectly. As far as I know, there is no good | ||
theory for why things even work in this case of non-sparse weights | ||
and p>>n. | ||
- Despite a number of highly cited papers that experimented on these same | ||
datasets, I'm unsure if any of the complicated, deep learning models | ||
today are doing significantly more than bag of words. | ||
Available compute power and engineering competence — in addition to the fact that no one
tries very hard anymore at pushing
linear classifiers — are some causes for concern.
- These models run in seconds or less, and | ||
behave predictably for a different test distribution.
- Another [example](http://arxiv.org/abs/1512.02167) of bag of words going strong. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
% CV: run k-fold cross-validation with the currently selected model.
%
% This script is invoked by masterCV and expects the caller's workspace
% to provide: allSNumBi (documents), labels, wordsbi (dictionary),
% cv_obj (cvpartition-style split object), params (with params.CVNUM),
% and the function handles trainfuncp / testfuncp.
% It produces: allcounts (per-fold accuracy), allfps / allfns
% (per-fold false positive / false negative counts).
params.dictsize = length(wordsbi);
params.numcases = length(labels);

fprintf('CV using dataset l=%d, dictSize=%d, CVNUM=%d\n', ...
    length(allSNumBi), length(wordsbi), params.CVNUM)

% initial = 1.1;
% Reset the legacy RNG streams so every run uses identical folds/initialization.
randn('state', 0);
rand('state', 0);

allcounts = [];
allfps = []; allfns = [];
for i=[1:params.CVNUM]
    train_ind = cv_obj.training(i);
    test_ind = cv_obj.test(i);
    % Sanity check: the train and test masks never agree elementwise,
    % i.e. they are exact complements (disjoint and covering).
    assert(0==sum(train_ind == test_ind))

    % Train on this fold's training split, then evaluate on its test split.
    model = trainfuncp(allSNumBi(train_ind), labels(train_ind), params);
    %
    [acc pred softpred] = testfuncp(model, ...
        allSNumBi(test_ind), labels(test_ind), params);

    % Count errors against the gold labels of this fold
    % (fp: gold 0 predicted 1; fn: gold 1 predicted 0).
    nblbltst = labels(test_ind);
    fp = sum(nblbltst == 0 & pred == 1);
    fn = sum(nblbltst == 1 & pred == 0);
    allfps = [allfps fp];
    allfns = [allfns fn];
    allcounts = [allcounts acc];
end
% Display per-fold accuracies and their mean (no semicolon: echoed on purpose).
allcounts
mean(allcounts)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
function f_gt = GTSmooth(f)
% GTSmooth  Good-Turing smoothing of a vector of raw counts.
%
%   f_gt = GTSmooth(f) returns adjusted counts where low raw counts
%   (r = 1..5, the usual Good-Turing threshold) are replaced by the
%   estimate r* = (r+1) * N_{r+1} / N_r, with N_r the number of
%   entries of f equal to r.  Counts above the threshold keep their
%   raw value, and zero counts share the N_1 mass equally.
%
%   NOTE(review): when N_r == 0 for some r <= 5 the estimate is
%   NaN/Inf (division by zero), exactly as in the original code;
%   callers should ensure the low counts are dense.
f_gt = zeros(size(f));
% ff(r) = N_r, the count-of-counts for r = 1..max(f).
ff = histc(f, 1:max(f));
% Identity adjustment by default.  adjfac is indexed below by raw
% counts f(i), so it must cover 1..max(f).  (The original used
% 1:max(ff), which can be shorter than max(f) and caused an
% out-of-bounds index for large raw counts.)
adjfac = 1:max(f);
% Good-Turing adjust only the low counts; cap the loop so ff(c+1)
% is never indexed past the end when max(f) < 6.
for c = 1:min(5, length(ff)-1)
    N_kp1 = ff(c+1);
    N_k = ff(c);

    adj = N_kp1 * (c + 1) / N_k;
    adjfac(c) = adj;
end
% Unseen-event mass: N_1 spread evenly over the zero-count slots.
adj0 = ff(1);
nz = sum(f == 0);
for i=1:length(f)
    if f(i) ~= 0
        f_gt(i) = adjfac(f(i));
    else
        f_gt(i) = adj0 ./ nz;
    end
end
|
||
|
||
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,197 @@ | ||
% master: top-level driver reproducing the paper's results.
% For each dataset it loads the feature .mat files and the fixed CV
% split (cv_obj), sets dataset/gram tags, and runs the masterCV script,
% which accumulates accuracies into the allresults map.
addpath('~/matlib/liblinear-1.8/matlab/');
params.C = 1;
params.samplenum = 1;
params.samplerate = 1;
params.Cbisvm = 0.1;
% this is the exponent used to discount raw counts
% set to 1 to use raw counts f,
% set to 0 to use indicators \hat{f}
params.testp = 0;
params.trainp = 0;

% params.a is the Laplacian smoothing parameter
params.a = 1;
% beta is the interpolation parameter
params.beta = 0.25;
allresults = containers.Map;

%% RT10662
params.CVNUM = 10;
params.doCV = 1;
% load the 10 fold cross validation split
load('../data/rt10662/cv_obj');

% load bigram data
load('../data/rt10662/bigram_rts.mat');
dataset = 'RTs';
gram = 'Bi';
masterCV;

% load unigram data
load('../data/rt10662/unigram_rts.mat');
dataset = 'RTs';
gram = 'Uni';
masterCV;

%% MPQA
params.CVNUM = 10;
params.doCV = 1;

load('../data/mpqa/cv_obj');

load('../data/mpqa/bigram_mpqa.mat');
dataset = 'MPQA';
gram = 'Bi';
masterCV;

load('../data/mpqa/unigram_mpqa.mat');
dataset = 'MPQA';
gram = 'Uni';
masterCV;

%% CR
params.CVNUM = 10;
params.doCV = 1;

load('../data/customerr/cv_obj');

load('../data/customerr/bigram_cr.mat');
dataset = 'CR';
gram = 'Bi';
masterCV;

load('../data/customerr/unigram_cr.mat');
dataset = 'CR';
gram = 'Uni';
masterCV;

%% subjectivity
params.CVNUM = 10;
params.doCV = 1;

load('../data/subj/bigram_subj.mat');
load('../data/subj/cv_obj', 'cv_obj');

% cv_obj = cvpartition(labels,'kfold',10);
% save('../data/subj/cv_obj', 'cv_obj');
% permute labels randomly as a sanity check
% labels = labels(randperm(length(labels)));

dataset = 'subj';
gram = 'Bi';
masterCV;

load('../data/subj/unigram_subj.mat');
dataset = 'subj';
gram = 'Uni';
masterCV;

%% RT2k
params.CVNUM = 10;
params.doCV = 1;

load('../data/rt2k/cv_obj');

load('../data/rt2k/bigram_rt2k.mat');
dataset = 'RT-2k';
gram = 'Bi';
masterCV;

load('../data/rt2k/unigram_rt2k.mat');
dataset = 'RT-2k';
gram = 'Uni';
masterCV;

%% long movie reviews (large IMDB): fixed 25k/25k train/test split, no CV

params.CVNUM = 1;
params.doCV = 0;
firsthalf = false(1,50000);
firsthalf(1:25000) = true;
train_ind = firsthalf;
test_ind = ~firsthalf;

% bigram
load('../data/mrl/bigram_mrl_striponlybi.mat');
dataset = 'IMDB';
gram = 'Bi';
masterCV;

% unigram
load('../data/mrl/unigram_mrl2.mat');
dataset = 'IMDB';
gram = 'Uni';
masterCV;

%% 20NG atheism vs. religion
params.CVNUM = 2;
params.doCV = 0;
params.Cbisvm = 0.1;

load('../data/20ng/bigram_ng20_atheisms_strip_noheader.mat');
% Splitting training/testing data randomly
% cv_obj = cvpartition(labels,'kfold',2);
% save('../data/20ng/cv_obj_alt.atheism.mat', 'cv_obj')
load('../data/20ng/cv_obj_alt.atheism.mat', 'cv_obj');
train_ind = cv_obj.training(1);
test_ind = cv_obj.test(1);

dataset = 'AthR';
gram = 'Bi';
masterCV;

load('../data/20ng/unigram_ng20_atheisms_strip_noheader.mat');
dataset = 'AthR';
gram = 'Uni';
masterCV;

%% 20NG windows vs. graphics
params.CVNUM = 2;
params.doCV = 0;

load('../data/20ng/bigram_ng20_windows_strip_noheader.mat');
% Splitting training/testing data randomly
% cv_obj = cvpartition(labels,'kfold',2);
% save('../data/20ng/cv_obj_windows.mat', 'cv_obj')
load('../data/20ng/cv_obj_windows.mat', 'cv_obj');

train_ind = cv_obj.training(1);
test_ind = cv_obj.test(1);
dataset = 'XGraph';
gram = 'Bi';
masterCV;

load('../data/20ng/unigram_ng20_windows_strip_noheader.mat');
dataset = 'XGraph';
gram = 'Uni';
masterCV;

%% 20NG baseball vs. crypt
params.CVNUM = 2;
params.doCV = 0;

load('../data/20ng/bigram_ng20_baseball_strip_noheader.mat');
% Splitting training/testing data randomly
% cv_obj = cvpartition(labels,'kfold',2);
% save('../data/20ng/cv_obj_baseball.mat', 'cv_obj')
load('../data/20ng/cv_obj_baseball.mat', 'cv_obj');

train_ind = cv_obj.training(1);
test_ind = cv_obj.test(1);

dataset = 'BbCrypt';
gram = 'Bi';
masterCV;

load('../data/20ng/unigram_ng20_baseball_strip_noheader.mat');
dataset = 'BbCrypt';
gram = 'Uni';
masterCV;

% Persist all accumulated results and print the summary table.
save(['allresults_' datestr(now)], 'allresults');
printTable(allresults)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
% masterCV: evaluate MNB, SVM, and NBSVM on the currently loaded dataset.
%
% Expects the caller's workspace to provide: allSNumBi, labels, wordsbi,
% dataset, gram, params, allresults, and either cv_obj (when
% params.doCV) or train_ind/test_ind (when not).  For each model it
% points trainfuncp/testfuncp at the right train/test functions, runs
% the CV or trainTest script, logs via writeResults, and records the
% mean accuracy in the allresults map.
params.dictsize = length(wordsbi);
params.numcases = length(labels);

cdataset = dataset;
c = 1;
% Reset the per-dataset result buffer when the dataset changes.
if ~exist('pdataset', 'var') || ~strcmp(cdataset, pdataset)
    currentres = zeros(2,3);
    pdataset = cdataset;
end

% Default to cross-validation unless the caller opted out.
if ~isfield(params, 'doCV')
    params.doCV = 1;
end

%% Multinomial Naive Bayes
trainfuncp = @(allSNumBi, labels, params) trainMNB(allSNumBi, labels, params);
testfuncp = @(model, allSNumBi, labels, params) testMNB(model, allSNumBi, labels, params);

if params.doCV
    CV;
else
    trainTest;
end

% BUG FIX: this line previously logged the MNB run under the '-SVM-'
% label, mislabeling it in the results log; the allresults key below
% and the SVM section's own writeResults show '-MNB-' is intended.
writeResults(allcounts, params, [dataset '-MNB-' gram]);
allresults([dataset, '-MNB-', gram]) = mean(allcounts);
% trainfuncp = @(allSNumBi, labels, params) trainBNB(allSNumBi, labels, params);
% testfuncp = @(model, allSNumBi, labels, params) testBNB(model, allSNumBi, labels, params);
%
% if params.doCV
%     CV;
% else
%     trainTest;
% end
%
% writeResults(allcounts, params, [dataset '-BNB-' gram]);
% allresults([dataset, '-BNB-', gram]) = mean(allcounts);

%% SVM with bigram features
trainfuncp = @(allSNumBi, labels, params) trainbisvm(allSNumBi, labels, params);
testfuncp = @(model, allSNumBi, labels, params) testbisvm(model, allSNumBi, labels, params);

if params.doCV
    CV;
else
    trainTest;
end

writeResults(allcounts, params, [dataset '-SVM-' gram]);
allresults([dataset, '-SVM-', gram]) = mean(allcounts);

%% NBSVM (SVM with NB log-count ratio features)
trainfuncp = @(allSNumBi, labels, params) trainMNBSVM(allSNumBi, labels, params);
testfuncp = @(model, allSNumBi, labels, params) testMNBSVM(model, allSNumBi, labels, params);
if params.doCV
    CV;
else
    trainTest;
end

writeResults(allcounts, params, [dataset '-NBSVM-' gram]);
allresults([dataset, '-NBSVM-', gram]) = mean(allcounts);
Oops, something went wrong.