Commit c1c6be9: new nbsvm stuff
sidaw committed Dec 13, 2015
Showing 45 changed files with 4,426 additions and 0 deletions.
59 changes: 59 additions & 0 deletions README.md
@@ -0,0 +1,59 @@
# NBSVM
Since I still receive a good number of emails about this work 4 years later,
I decided to put the code on GitHub and write
better instructions. The code is still just as bad.

For technical details see [our paper](wang12simple.pdf) and
[our talk](wang12simple_slides.pdf).

```
@inproceedings{wang12simple,
  author = {Wang, Sida I. and Manning, Christopher D.},
  booktitle = {Proceedings of the ACL},
  title = {Baselines and Bigrams: Simple, Good Sentiment and Topic Classification},
  year = {2012},
  pages = {90--94}
}
```

## Running NBSVM
- Download the data and fill in the empty data directory at the repository root, so that e.g. "data/rt10662/unigram_rts.mat" exists
- Go to src and run the script master.m to produce the results from the paper
- Results and details are logged in resultslog.txt and details.txt, respectively
- A table with all the results is printed to the screen
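
Concretely, assuming MATLAB with a liblinear build on the path (see the addpath line at the top of src/master.m), a full run looks like:

```
cd src
master   % runs every dataset in turn, then prints the summary table
```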

## The data
- [data](http://www.stanford.edu/~sidaw/projects/data_NB_ACL12.zip) - 404.4MB, includes all the data
- [data_small](http://www.stanford.edu/~sidaw/projects/datasmall_NB_ACL12.zip) - 108.5MB; data_small = data_all - large_IMDB

- For each data set, there is a corresponding folder data/$DatasetName.
- You can find $FeatureType_$DatasetName.mat in data/$DatasetName, where
$FeatureType is unigram or bigram.
- data/$DatasetName/cv_obj.mat determines the standard evaluation for each dataset (how many
folds, what the split is, etc.). These files are generated by the corresponding
data processing scripts in src/misc
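
For example, a single standard fold can be pulled out like this (variable names as used by the scripts in src; cv_obj behaves like a MATLAB cvpartition object):

```
load('../data/rt10662/unigram_rts.mat');   % provides allSNumBi, labels, wordsbi
load('../data/rt10662/cv_obj.mat');        % provides cv_obj, the standard split
train_ind = cv_obj.training(1);            % logical mask for fold 1 training data
test_ind  = cv_obj.test(1);                % logical mask for fold 1 test data
```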

## Notes and comments
- The datasets were collected by others; please cite the original sources if you work with them
- The data structures keep the word order of each document instead of
converting to a bag-of-words vector right away. This caused some
unnecessary mess for this work, but might make it easier if you want
to try a more complex model (see the sketch after this list).
- While many experiments have been run for this task, performance is
really all about regularization: even the simplest model (Naive
Bayes) fits the training set perfectly. As far as I know, there is no good
theory for why anything works at all in this regime of non-sparse weights
and p >> n.
- Despite a number of highly cited papers that experimented on these same
datasets, I'm unsure whether any of today's complicated deep learning
models do significantly more than bag of words.
Available compute power, engineering competence, and the fact that no one
tries very hard to push linear classifiers anymore are some causes for
concern.
- These models run in seconds or less, and
behave predictably when the test distribution differs from training.
- Another [example](http://arxiv.org/abs/1512.02167) of bag of words going strong.
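
As an illustration of the word-order point above, here is a minimal sketch (assuming, as the scripts in src suggest, that each document is stored as a vector of dictionary indices) of collapsing one ordered document into a bag-of-words count vector:

```
doc = allSNumBi{1};                                 % one document as a sequence of word indices
bow = accumarray(doc(:), 1, [length(wordsbi), 1]);  % counts of each dictionary word
```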
31 changes: 31 additions & 0 deletions src/CV.m
@@ -0,0 +1,31 @@
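% CV.m is a script, not a function: it expects allSNumBi, labels, wordsbi,
% cv_obj, params, and the trainfuncp/testfuncp handles to already be in the
% workspace (masterCV.m sets these up), and evaluates params.CVNUM folds.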
params.dictsize = length(wordsbi);
params.numcases = length(labels);

fprintf('CV using dataset l=%d, dictSize=%d, CVNUM=%d\n', ...
length(allSNumBi), length(wordsbi), params.CVNUM)

% initial = 1.1;
randn('state', 0);   % fix random seeds for reproducibility (pre-R2011 syntax)
rand('state', 0);

allcounts = [];              % per-fold accuracy
allfps = []; allfns = [];    % per-fold false positives / false negatives
for i = 1:params.CVNUM
train_ind = cv_obj.training(i);
test_ind = cv_obj.test(i);
assert(0==sum(train_ind == test_ind))   % the two masks must be complementary

model = trainfuncp(allSNumBi(train_ind), labels(train_ind), params);
[acc, pred, softpred] = testfuncp(model, ...
    allSNumBi(test_ind), labels(test_ind), params);

nblbltst = labels(test_ind);
fp = sum(nblbltst == 0 & pred == 1);
fn = sum(nblbltst == 1 & pred == 0);
allfps = [allfps fp];
allfns = [allfns fn];
allcounts = [allcounts acc];
end
allcounts        % display the per-fold accuracies
mean(allcounts)  % and their mean
23 changes: 23 additions & 0 deletions src/GTSmooth.m
@@ -0,0 +1,23 @@
function f_gt = GTSmooth(f)
% GTSmooth applies simple Good-Turing smoothing to a vector of counts f:
% a raw count c is adjusted to c* = (c+1) * N_{c+1} / N_c, where N_c is the
% number of entries with count exactly c. Only counts 1..5 are adjusted, and
% the total mass of singletons (N_1) is spread evenly over the zero counts.
f_gt = zeros(size(f));
ff = histc(f, 1:max(f));   % ff(c) = N_c, the count-of-counts
adjfac = 1:max(f);         % identity adjustment for counts above 5
for c = 1:5
    N_kp1 = ff(c+1);
    N_k = ff(c);
    adjfac(c) = N_kp1 * (c + 1) / N_k;   % c* = (c+1) * N_{c+1} / N_c
end
adj0 = ff(1);              % Good-Turing mass reserved for unseen events
nz = sum(f == 0);
for i = 1:length(f)
    if f(i) ~= 0
        f_gt(i) = adjfac(f(i));
    else
        f_gt(i) = adj0 ./ nz;   % share of the N_1 mass per zero entry
    end
end



Binary file added src/compareACL.pdf
Binary file not shown.
197 changes: 197 additions & 0 deletions src/master.m
@@ -0,0 +1,197 @@
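% master.m reproduces the results tables from the paper: for each dataset it
% loads the unigram and bigram features together with the standard split
% (cv_obj), then runs masterCV, which trains and tests MNB, SVM, and NBSVM.
% (Adjust the liblinear addpath below to point at your own build.)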
addpath('~/matlib/liblinear-1.8/matlab/');
params.C = 1;
params.samplenum = 1;
params.samplerate = 1;
params.Cbisvm = 0.1;
% this is the exponent used to discount raw counts
% set to 1 to use raw counts f,
% set to 0 to use indicators \hat{f}
params.testp = 0;
params.trainp = 0;

% params.a is the Laplacian smoothing parameter
params.a = 1;
% beta is the interpolation parameter
params.beta = 0.25;
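
% Per the paper: the NB log-count ratio is r = log((p/||p||_1) ./ (q/||q||_1)),
% with p, q the a-smoothed positive/negative count vectors, and the final
% NBSVM weight interpolates the trained SVM weight w with its mean magnitude:
% w' = (1-beta)*wbar + beta*w, where wbar = mean(abs(w)).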
allresults = containers.Map;

%% RT10662
params.CVNUM = 10;
params.doCV = 1;
% load the 10 fold cross validation split
load('../data/rt10662/cv_obj');

% load bigram data
load('../data/rt10662/bigram_rts.mat');
dataset = 'RTs';
gram = 'Bi';
masterCV;

% load unigram data
load('../data/rt10662/unigram_rts.mat');
dataset = 'RTs';
gram = 'Uni';
masterCV;

%% MPQA
params.CVNUM = 10;
params.doCV = 1;

load('../data/mpqa/cv_obj');

load('../data/mpqa/bigram_mpqa.mat');
dataset = 'MPQA';
gram = 'Bi';
masterCV;

load('../data/mpqa/unigram_mpqa.mat');
dataset = 'MPQA';
gram = 'Uni';
masterCV;


%% CR
params.CVNUM = 10;
params.doCV = 1;

load('../data/customerr/cv_obj');

load('../data/customerr/bigram_cr.mat');
dataset = 'CR';
gram = 'Bi';
masterCV;

load('../data/customerr/unigram_cr.mat');
dataset = 'CR';
gram = 'Uni';
masterCV;


%% subjectivity
params.CVNUM = 10;
params.doCV = 1;

load('../data/subj/bigram_subj.mat');
load('../data/subj/cv_obj', 'cv_obj');

% cv_obj = cvpartition(labels,'kfold',10);
% save('../data/subj/cv_obj', 'cv_obj');
% permutate labels randomly as a sanity check
% labels = labels(randperm(length(labels)));

dataset = 'subj';
gram = 'Bi';
masterCV;

load('../data/subj/unigram_subj.mat');
dataset = 'subj';
gram = 'Uni';
masterCV;

%% RT2k
params.CVNUM = 10;
params.doCV = 1;

load('../data/rt2k/cv_obj');

load('../data/rt2k/bigram_rt2k.mat');
dataset = 'RT-2k';
gram = 'Bi';
masterCV;

load('../data/rt2k/unigram_rt2k.mat');
dataset = 'RT-2k';
gram = 'Uni';
masterCV;

%% long movie reviews

params.CVNUM = 1;
params.doCV = 0;
firsthalf = false(1,50000);
firsthalf(1:25000) = true;
train_ind = firsthalf;
test_ind = ~firsthalf;

% bigram
load('../data/mrl/bigram_mrl_striponlybi.mat');
dataset = 'IMDB';
gram = 'Bi';
masterCV;

% unigram
load('../data/mrl/unigram_mrl2.mat');
dataset = 'IMDB';
gram = 'Uni';
masterCV;



%% 20NG atheism vs. religion
params.CVNUM = 2;
params.doCV = 0;
params.Cbisvm = 0.1;

load('../data/20ng/bigram_ng20_atheisms_strip_noheader.mat');
% Splitting training/testing data randomly
% cv_obj = cvpartition(labels,'kfold',2);
% save('../data/20ng/cv_obj_alt.atheism.mat', 'cv_obj')
load('../data/20ng/cv_obj_alt.atheism.mat', 'cv_obj');
train_ind = cv_obj.training(1);
test_ind = cv_obj.test(1);

dataset = 'AthR';
gram = 'Bi';
masterCV;

load('../data/20ng/unigram_ng20_atheisms_strip_noheader.mat');
dataset = 'AthR';
gram = 'Uni';
masterCV;

%% 20NG windows vs. graphics
params.CVNUM = 2;
params.doCV = 0;

load('../data/20ng/bigram_ng20_windows_strip_noheader.mat');
% Splitting training/testing data randomly
% cv_obj = cvpartition(labels,'kfold',2);
% save('../data/20ng/cv_obj_windows.mat', 'cv_obj')
load('../data/20ng/cv_obj_windows.mat', 'cv_obj');

train_ind = cv_obj.training(1);
test_ind = cv_obj.test(1);
dataset = 'XGraph';
gram = 'Bi';
masterCV;

load('../data/20ng/unigram_ng20_windows_strip_noheader.mat');
dataset = 'XGraph';
gram = 'Uni';
masterCV;

%% 20NG baseball vs. crypt
params.CVNUM = 2;
params.doCV = 0;

load('../data/20ng/bigram_ng20_baseball_strip_noheader.mat');
% Splitting training/testing data randomly
% cv_obj = cvpartition(labels,'kfold',2);
% save('../data/20ng/cv_obj_baseball.mat', 'cv_obj')
load('../data/20ng/cv_obj_baseball.mat', 'cv_obj');

train_ind = cv_obj.training(1);
test_ind = cv_obj.test(1);

dataset = 'BbCrypt';
gram = 'Bi';
masterCV;

load('../data/20ng/unigram_ng20_baseball_strip_noheader.mat');
dataset = 'BbCrypt';
gram = 'Uni';
masterCV;


save(['allresults_' datestr(now)], 'allresults');
printTable(allresults)
59 changes: 59 additions & 0 deletions src/masterCV.m
@@ -0,0 +1,59 @@
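% masterCV.m is a script: given loaded features (allSNumBi, labels, wordsbi),
% the dataset and gram names, and params, it runs each classifier through
% CV.m (cross-validation) or trainTest.m (a fixed split) and stores the mean
% accuracy of each in the allresults map.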
params.dictsize = length(wordsbi);
params.numcases = length(labels);

cdataset = dataset;
c = 1;
if ~exist('pdataset', 'var') || ~strcmp(cdataset, pdataset)
currentres = zeros(2,3);
pdataset = cdataset;
end

if ~isfield(params, 'doCV')
params.doCV = 1;
end

trainfuncp = @(allSNumBi, labels, params) trainMNB(allSNumBi, labels, params);
testfuncp = @(model, allSNumBi, labels, params) testMNB(model, allSNumBi, labels, params);

if params.doCV
CV;
else
trainTest;
end

writeResults(allcounts, params, [dataset '-MNB-' gram]);
allresults([dataset, '-MNB-', gram]) = mean(allcounts);
% trainfuncp = @(allSNumBi, labels, params) trainBNB(allSNumBi, labels, params);
% testfuncp = @(model, allSNumBi, labels, params) testBNB(model, allSNumBi, labels, params);
%
% if params.doCV
% CV;
% else
% trainTest;
% end
%
% writeResults(allcounts, params, [dataset '-BNB-' gram]);
% allresults([dataset, '-BNB-', gram]) = mean(allcounts);
trainfuncp = @(allSNumBi, labels, params) trainbisvm(allSNumBi, labels, params);
testfuncp = @(model, allSNumBi, labels, params) testbisvm(model, allSNumBi, labels, params);

if params.doCV
CV;
else
trainTest;
end

writeResults(allcounts, params, [dataset '-SVM-' gram]);
allresults([dataset, '-SVM-', gram]) = mean(allcounts);


trainfuncp = @(allSNumBi, labels, params) trainMNBSVM(allSNumBi, labels, params);
testfuncp = @(model, allSNumBi, labels, params) testMNBSVM(model, allSNumBi, labels, params);
if params.doCV
CV;
else
trainTest;
end

writeResults(allcounts, params, [dataset '-NBSVM-' gram]);
allresults([dataset, '-NBSVM-', gram]) = mean(allcounts);
