Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit c1c6be9
Showing
45 changed files
with
4,426 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
# NBSVM | ||
Since I still receive a good number of emails 4 years later,
I decided to put this code on github and write the | ||
instructions better. The code is still just as bad. | ||
|
||
For technical details see [our paper](wang12simple.pdf) and | ||
[our talk](wang12simple_slides.pdf). | ||
|
||
``` | ||
@inproceedings{wang12simple,
  author = {Wang, Sida I. and Manning, Christopher D.},
  booktitle = {Proceedings of the ACL},
  title = {Baselines and Bigrams: Simple, Good Sentiment and Topic Classification},
  year = {2012},
  pages = {90-94}
}
``` | ||
|
||
## Running NBSVM | ||
- Download the data and override the empty data directory in root: "data/rt10662/unigram_rts.mat" | ||
- Go to src and run the script master.m to produce the results from the paper | ||
- Results and details are logged in resultslog.txt and details.txt, respectively | ||
- A table with all the results is printed to the screen | ||
|
||
## The data | ||
- | ||
[data](http://www.stanford.edu/~sidaw/projects/data_NB_ACL12.zip) - 404.4MB includes all the data | ||
- | ||
[data_small](http://www.stanford.edu/~sidaw/projects/datasmall_NB_ACL12.zip) - 108.5MB | ||
data_small = data_all - large_IMDB | ||
|
||
- For each data set, there is a corresponding folder data/$DatasetName. | ||
- You can find $FeatureType_$DatasetName.mat in data/$DatasetName, where | ||
$FeatureType is unigram or bigram. | ||
- data/$DatasetName/cv_obj.mat determines the standard evaluation for each dataset (how many | ||
folds, what's the split, etc.). They are generated by corresponding
data processing script in src/misc | ||
|
||
## Notes and comments | ||
- The datasets are collected by others, please cite the original sources if you work with them | ||
- The data structure used kept the order information of the document, instead of | ||
converting to bag-of-words vector right away. This resulted in some | ||
unnecessary mess for this work, but might make it easier if you want | ||
to try a more complex model. | ||
- While many experiments have been ran for this task, performance is | ||
really all about regularization, and even the simplest model (Naive | ||
Bayes) would fit the training set perfectly. As far as I know, there is no good | ||
theory for why things even work in this case of non-sparse weights | ||
and p>>n. | ||
- Despite a number of highly cited papers that experimented on these same | ||
datasets, I'm unsure if any of the complicated, deep learning models | ||
today are doing significantly more than bag of words. | ||
Available compute power and engineering competence — in addition to the fact that no one
tries very hard anymore at pushing
linear classifiers — are some causes for concern.
- These models run in seconds or less, and | ||
behave predictably for a different test distribution.
- Another [example](http://arxiv.org/abs/1512.02167) of bag of words going strong. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
% CV: run k-fold cross-validation with the currently selected model.
%
% This script is invoked by masterCV and expects the caller's workspace
% to provide: allSNumBi (documents), labels, wordsbi (dictionary),
% cv_obj (cvpartition-style split object), params (with params.CVNUM),
% and the function handles trainfuncp / testfuncp.
% It produces: allcounts (per-fold accuracy), allfps / allfns
% (per-fold false positive / false negative counts).
params.dictsize = length(wordsbi);
params.numcases = length(labels);

fprintf('CV using dataset l=%d, dictSize=%d, CVNUM=%d\n', ...
    length(allSNumBi), length(wordsbi), params.CVNUM)

% initial = 1.1;
% Reset the legacy RNG streams so every run uses identical folds/initialization.
randn('state', 0);
rand('state', 0);

allcounts = [];
allfps = []; allfns = [];
for i=[1:params.CVNUM]
    train_ind = cv_obj.training(i);
    test_ind = cv_obj.test(i);
    % Sanity check: the train and test masks never agree elementwise,
    % i.e. they are exact complements (disjoint and covering).
    assert(0==sum(train_ind == test_ind))

    % Train on this fold's training split, then evaluate on its test split.
    model = trainfuncp(allSNumBi(train_ind), labels(train_ind), params);
    %
    [acc pred softpred] = testfuncp(model, ...
        allSNumBi(test_ind), labels(test_ind), params);

    % Count errors against the gold labels of this fold
    % (fp: gold 0 predicted 1; fn: gold 1 predicted 0).
    nblbltst = labels(test_ind);
    fp = sum(nblbltst == 0 & pred == 1);
    fn = sum(nblbltst == 1 & pred == 0);
    allfps = [allfps fp];
    allfns = [allfns fn];
    allcounts = [allcounts acc];
end
% Display per-fold accuracies and their mean (no semicolon: echoed on purpose).
allcounts
mean(allcounts)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
function f_gt = GTSmooth(f)
% GTSmooth  Good-Turing smoothing of a vector of raw counts.
%
%   f_gt = GTSmooth(f) returns adjusted counts where low raw counts
%   (r = 1..5, the usual Good-Turing threshold) are replaced by the
%   estimate r* = (r+1) * N_{r+1} / N_r, with N_r the number of
%   entries of f equal to r.  Counts above the threshold keep their
%   raw value, and zero counts share the N_1 mass equally.
%
%   NOTE(review): when N_r == 0 for some r <= 5 the estimate is
%   NaN/Inf (division by zero), exactly as in the original code;
%   callers should ensure the low counts are dense.
f_gt = zeros(size(f));
% ff(r) = N_r, the count-of-counts for r = 1..max(f).
ff = histc(f, 1:max(f));
% Identity adjustment by default.  adjfac is indexed below by raw
% counts f(i), so it must cover 1..max(f).  (The original used
% 1:max(ff), which can be shorter than max(f) and caused an
% out-of-bounds index for large raw counts.)
adjfac = 1:max(f);
% Good-Turing adjust only the low counts; cap the loop so ff(c+1)
% is never indexed past the end when max(f) < 6.
for c = 1:min(5, length(ff)-1)
    N_kp1 = ff(c+1);
    N_k = ff(c);

    adj = N_kp1 * (c + 1) / N_k;
    adjfac(c) = adj;
end
% Unseen-event mass: N_1 spread evenly over the zero-count slots.
adj0 = ff(1);
nz = sum(f == 0);
for i=1:length(f)
    if f(i) ~= 0
        f_gt(i) = adjfac(f(i));
    else
        f_gt(i) = adj0 ./ nz;
    end
end
|
||
|
||
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,197 @@ | ||
% master: top-level driver reproducing the paper's results.
% For each dataset it loads the feature .mat files and the fixed CV
% split (cv_obj), sets dataset/gram tags, and runs the masterCV script,
% which accumulates accuracies into the allresults map.
addpath('~/matlib/liblinear-1.8/matlab/');
params.C = 1;
params.samplenum = 1;
params.samplerate = 1;
params.Cbisvm = 0.1;
% this is the exponent used to discount raw counts
% set to 1 to use raw counts f,
% set to 0 to use indicators \hat{f}
params.testp = 0;
params.trainp = 0;

% params.a is the Laplacian smoothing parameter
params.a = 1;
% beta is the interpolation parameter
params.beta = 0.25;
allresults = containers.Map;

%% RT10662
params.CVNUM = 10;
params.doCV = 1;
% load the 10 fold cross validation split
load('../data/rt10662/cv_obj');

% load bigram data
load('../data/rt10662/bigram_rts.mat');
dataset = 'RTs';
gram = 'Bi';
masterCV;

% load unigram data
load('../data/rt10662/unigram_rts.mat');
dataset = 'RTs';
gram = 'Uni';
masterCV;

%% MPQA
params.CVNUM = 10;
params.doCV = 1;

load('../data/mpqa/cv_obj');

load('../data/mpqa/bigram_mpqa.mat');
dataset = 'MPQA';
gram = 'Bi';
masterCV;

load('../data/mpqa/unigram_mpqa.mat');
dataset = 'MPQA';
gram = 'Uni';
masterCV;

%% CR
params.CVNUM = 10;
params.doCV = 1;

load('../data/customerr/cv_obj');

load('../data/customerr/bigram_cr.mat');
dataset = 'CR';
gram = 'Bi';
masterCV;

load('../data/customerr/unigram_cr.mat');
dataset = 'CR';
gram = 'Uni';
masterCV;

%% subjectivity
params.CVNUM = 10;
params.doCV = 1;

load('../data/subj/bigram_subj.mat');
load('../data/subj/cv_obj', 'cv_obj');

% cv_obj = cvpartition(labels,'kfold',10);
% save('../data/subj/cv_obj', 'cv_obj');
% permute labels randomly as a sanity check
% labels = labels(randperm(length(labels)));

dataset = 'subj';
gram = 'Bi';
masterCV;

load('../data/subj/unigram_subj.mat');
dataset = 'subj';
gram = 'Uni';
masterCV;

%% RT2k
params.CVNUM = 10;
params.doCV = 1;

load('../data/rt2k/cv_obj');

load('../data/rt2k/bigram_rt2k.mat');
dataset = 'RT-2k';
gram = 'Bi';
masterCV;

load('../data/rt2k/unigram_rt2k.mat');
dataset = 'RT-2k';
gram = 'Uni';
masterCV;

%% long movie reviews (large IMDB): fixed 25k/25k train/test split, no CV

params.CVNUM = 1;
params.doCV = 0;
firsthalf = false(1,50000);
firsthalf(1:25000) = true;
train_ind = firsthalf;
test_ind = ~firsthalf;

% bigram
load('../data/mrl/bigram_mrl_striponlybi.mat');
dataset = 'IMDB';
gram = 'Bi';
masterCV;

% unigram
load('../data/mrl/unigram_mrl2.mat');
dataset = 'IMDB';
gram = 'Uni';
masterCV;

%% 20NG atheism vs. religion
params.CVNUM = 2;
params.doCV = 0;
params.Cbisvm = 0.1;

load('../data/20ng/bigram_ng20_atheisms_strip_noheader.mat');
% Splitting training/testing data randomly
% cv_obj = cvpartition(labels,'kfold',2);
% save('../data/20ng/cv_obj_alt.atheism.mat', 'cv_obj')
load('../data/20ng/cv_obj_alt.atheism.mat', 'cv_obj');
train_ind = cv_obj.training(1);
test_ind = cv_obj.test(1);

dataset = 'AthR';
gram = 'Bi';
masterCV;

load('../data/20ng/unigram_ng20_atheisms_strip_noheader.mat');
dataset = 'AthR';
gram = 'Uni';
masterCV;

%% 20NG windows vs. graphics
params.CVNUM = 2;
params.doCV = 0;

load('../data/20ng/bigram_ng20_windows_strip_noheader.mat');
% Splitting training/testing data randomly
% cv_obj = cvpartition(labels,'kfold',2);
% save('../data/20ng/cv_obj_windows.mat', 'cv_obj')
load('../data/20ng/cv_obj_windows.mat', 'cv_obj');

train_ind = cv_obj.training(1);
test_ind = cv_obj.test(1);
dataset = 'XGraph';
gram = 'Bi';
masterCV;

load('../data/20ng/unigram_ng20_windows_strip_noheader.mat');
dataset = 'XGraph';
gram = 'Uni';
masterCV;

%% 20NG baseball vs. crypt
params.CVNUM = 2;
params.doCV = 0;

load('../data/20ng/bigram_ng20_baseball_strip_noheader.mat');
% Splitting training/testing data randomly
% cv_obj = cvpartition(labels,'kfold',2);
% save('../data/20ng/cv_obj_baseball.mat', 'cv_obj')
load('../data/20ng/cv_obj_baseball.mat', 'cv_obj');

train_ind = cv_obj.training(1);
test_ind = cv_obj.test(1);

dataset = 'BbCrypt';
gram = 'Bi';
masterCV;

load('../data/20ng/unigram_ng20_baseball_strip_noheader.mat');
dataset = 'BbCrypt';
gram = 'Uni';
masterCV;

% Persist all accumulated results and print the summary table.
save(['allresults_' datestr(now)], 'allresults');
printTable(allresults)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
% masterCV: evaluate MNB, SVM, and NBSVM on the currently loaded dataset.
%
% Expects the caller's workspace to provide: allSNumBi, labels, wordsbi,
% dataset, gram, params, allresults, and either cv_obj (when
% params.doCV) or train_ind/test_ind (when not).  For each model it
% points trainfuncp/testfuncp at the right train/test functions, runs
% the CV or trainTest script, logs via writeResults, and records the
% mean accuracy in the allresults map.
params.dictsize = length(wordsbi);
params.numcases = length(labels);

cdataset = dataset;
c = 1;
% Reset the per-dataset result buffer when the dataset changes.
if ~exist('pdataset', 'var') || ~strcmp(cdataset, pdataset)
    currentres = zeros(2,3);
    pdataset = cdataset;
end

% Default to cross-validation unless the caller opted out.
if ~isfield(params, 'doCV')
    params.doCV = 1;
end

%% Multinomial Naive Bayes
trainfuncp = @(allSNumBi, labels, params) trainMNB(allSNumBi, labels, params);
testfuncp = @(model, allSNumBi, labels, params) testMNB(model, allSNumBi, labels, params);

if params.doCV
    CV;
else
    trainTest;
end

% BUG FIX: this line previously logged the MNB run under the '-SVM-'
% label, mislabeling it in the results log; the allresults key below
% and the SVM section's own writeResults show '-MNB-' is intended.
writeResults(allcounts, params, [dataset '-MNB-' gram]);
allresults([dataset, '-MNB-', gram]) = mean(allcounts);
% trainfuncp = @(allSNumBi, labels, params) trainBNB(allSNumBi, labels, params);
% testfuncp = @(model, allSNumBi, labels, params) testBNB(model, allSNumBi, labels, params);
%
% if params.doCV
%     CV;
% else
%     trainTest;
% end
%
% writeResults(allcounts, params, [dataset '-BNB-' gram]);
% allresults([dataset, '-BNB-', gram]) = mean(allcounts);

%% SVM with bigram features
trainfuncp = @(allSNumBi, labels, params) trainbisvm(allSNumBi, labels, params);
testfuncp = @(model, allSNumBi, labels, params) testbisvm(model, allSNumBi, labels, params);

if params.doCV
    CV;
else
    trainTest;
end

writeResults(allcounts, params, [dataset '-SVM-' gram]);
allresults([dataset, '-SVM-', gram]) = mean(allcounts);

%% NBSVM (SVM with NB log-count ratio features)
trainfuncp = @(allSNumBi, labels, params) trainMNBSVM(allSNumBi, labels, params);
testfuncp = @(model, allSNumBi, labels, params) testMNBSVM(model, allSNumBi, labels, params);
if params.doCV
    CV;
else
    trainTest;
end

writeResults(allcounts, params, [dataset '-NBSVM-' gram]);
allresults([dataset, '-NBSVM-', gram]) = mean(allcounts);
Oops, something went wrong.