# Document Classifier Analysis
#### Word Count, Word Presence, TF-IDF, Bi-Gram, Wang - Manning, LSA, GloVe, % of training data 

Raj Sundhar Ravichandran, CS 533, Spring 2018.

This notebook helps to analyse various Machine Learning classification algorithms with different features on both similar and different documents.

In [1]:
from __future__ import print_function
import os
import nltk
import re
import itertools
import vocabulary
import newsreader
import numpy as np
import scipy
import sklearn
from sklearn.preprocessing import StandardScaler
try:
    import cPickle as pickle
except:
    import pickle
    
    
vocab_file, vocab_file_type = "20news-bydate-vocab.pkl", "pickle"

embedding_file, embedding_dimensions, embedding_cache = \
    "glove6B50/glove.6B.50d.txt", 50, "20news-bydate-embedding.npz"

all_data, train_dir, dev_dir, test_dir = \
    "20-news-bydate", "20-news-bydate/20news-bydate-train/", None, "20-news-bydate/20news-bydate-test/"

has_bad_metadata = True

### Build Vocabulary and Word Embeddings

In [2]:
made_vocabulary = False
if made_vocabulary :
    v = vocabulary.Vocabulary.load(vocab_file, file_type=vocab_file_type)
else: 
    tokens = newsreader.all_textfile_tokens(all_data, strip_metadata=has_bad_metadata)                                            
    v = vocabulary.Vocabulary.from_iterable(tokens, file_type=vocab_file_type)
    v.save(vocab_file)
v.stop_growth()

made_embedding = False
if made_embedding :
    e = newsreader.load_sparse_csr(embedding_cache)
else: 
    e = newsreader.build_sparse_embedding(v, embedding_file, embedding_dimensions)
    newsreader.save_sparse_csr(embedding_cache, e)

140907 words were not in glove


### Class Types to make comparison

In [3]:
class1, class2 = "rec.sport.hockey", "comp.windows.x"

In [4]:
targets = []
def selected(name) :
    if not targets:
        return True
    if any(t.startswith(name) for t in targets) :
        return True
    return False

### Build DataManager with Number of Counts for Each Words

In [5]:
def use_default_features(vocab) :
    return lambda data: vocab

def count_features(features, gen_tokens) :
    for t in gen_tokens :
        r = features.add(t)
        if r :
            yield r    
            
count_data = newsreader.DataManager(train_dir + class1,
                                       train_dir + class2,
                                       test_dir + class1,
                                       test_dir + class2,
                                       use_default_features(v),
                                       count_features,
                                       dev_dir + class1 if dev_dir else None,
                                       dev_dir + class2 if dev_dir else None,
                                       strip_metadata=has_bad_metadata)

count_data.initialize(build_cache=True)

### Build DataManager with Presence/Absence (0/1) for each Word

In [6]:
def make_boolean_features(feature_counter) :
    def collect_features(features, gen_tokens) :
        seen = set()
        for f in feature_counter(features, gen_tokens) :
            seen.add(f)
        for f in seen :
            yield f
    return collect_features

boolean_data = newsreader.DataManager(train_dir + class1,
                                         train_dir + class2,
                                         test_dir + class1,
                                         test_dir + class2,
                                         use_default_features(v),
                                         make_boolean_features(count_features),
                                         dev_dir + class1 if dev_dir else None,
                                         dev_dir + class2 if dev_dir else None,
                                         strip_metadata=has_bad_metadata)

boolean_data.initialize(build_cache=True)

### Function to set up Experiments

In [7]:
class Experiment(object) :
    '''Organize the process of getting data, building a classifier,
    and exploring new representations'''
    
    def __init__(self, data, comment, classifier, cdesc) :
        'set up the problem of learning a classifier from a data manager'
        self.data = data
        self.comment = comment
        self.classifier = classifier
        self.cdesc = cdesc
        self.initialized = False
        
    def initialize(self) :
        'materialize the training data, dev data and test data as matrices'
        if not self.initialized :
            self.train_X, self.train_y = self.data.training_data()
            self.dev_X, self.dev_y = self.data.dev_data()
            self.test_X, self.test_y = self.data.test_data()
            self.initialized = True
        
    def fit_and_validate(self, report=True) :
        'train the classifier and assess predictions on dev data'
        if not self.initialized :
            self.initialize()
        self.classifier.fit(self.train_X, self.train_y)
        self.dev_predictions = self.classifier.predict(self.dev_X)
        self.accuracy = sklearn.metrics.accuracy_score(self.dev_y, self.dev_predictions)
        if report :
            print("{}\nclassified by {}\naccuracy {}".format(self.comment, self.cdesc, self.accuracy))
            
    def xval(self, folds=20, report=True) :
        accuracies = []
        for i in range(folds) :
            self.fit_and_validate(report=False)
            accuracies.append(self.accuracy)
        if report :
            msg = "{}\nclassified by {}\naverage accuracy {} (std {})"
            print(msg.format(self.comment, self.cdesc, 
                             sum(accuracies)/folds,
                             np.std(accuracies)))
    
    @classmethod
    def transform(cls, expt, operation, description, classifier, cdesc) :
        'use operation to transform the data from expt and set up new classifier'
        if not expt.initialized :
            expt.initialize()
        result = cls(expt.data, expt.comment + '\n' + description, classifier, cdesc)
        result.train_X, result.train_y = operation(expt.train_X, expt.train_y, 'train')
        result.dev_X, result.dev_y = operation(expt.dev_X, expt.dev_y, 'dev')
        result.test_X, result.test_y = operation(expt.test_X, expt.test_y, 'test')
        result.initialized = True
        return result

### Calculation of Wang - Manning Weights

[Based on ACL 2012 paper by Sida Wang and Chris Manning][1]

[1]:http://aclweb.org/anthology/P/P12/P12-2018.pdf

In [8]:
def wang_manning_weights(expt) :
    Xyes = expt.train_X[expt.train_y ==1, :]
    Xno = expt.train_X[expt.train_y != 1, :] 
    yesrates = np.log((Xyes.getnnz(axis=0) + 1.) / Xyes.shape[1])
    norates = np.log((Xno.getnnz(axis=0) + 1.) / Xno.shape[1])
    W = scipy.sparse.diags(yesrates - norates, 0)
    return lambda X, y, c: (X.dot(W), y)

### TF-IDF Weight Calculations

In [9]:
def idf_weights(expt) :
    idf = np.log((expt.train_X.shape[1] + 1.) / (expt.train_X.getnnz(axis=0) + 1.))
    W = scipy.sparse.diags(idf, 0)
    return lambda X, y, c: (X.dot(W), y)

### Include Glove Embeddings and SVD Dimensionality Reduction

In [10]:
def add_embeddings(expt, embeddings, scale=True, stack=True) :
    extra_features = expt.train_X.shape[1] - embeddings.shape[0]
    if extra_features > 0 :
        Z = scipy.sparse.csr_matrix((extra_features, embeddings.shape[1]))
        W = scipy.sparse.vstack([embeddings, Z])
    else: 
        W = embeddings
    if scale :
        scaler = StandardScaler(with_mean=False)
        scaler.fit(expt.train_X.dot(W))
    def operation(X, y, s) :
        if scale:
            new_features = scaler.transform(X.dot(W))
        else :
            new_features = X.dot(W)
        if stack :
            all_features = scipy.sparse.hstack([X, new_features]).tocsr()
        else :
            all_features = new_features
        return (all_features, y)
    return operation

def dimensionality_reduction(expt, dimensions) :
    _, _, wrt = scipy.sparse.linalg.svds(expt.train_X, k=dimensions, 
                                         return_singular_vectors='vh')
    return add_embeddings(expt, np.transpose(wrt), stack=False)

### Include and Calculate bigram feature Counts

In [11]:
def use_bigram_features(data) :
    f = vocabulary.Vocabulary.load(vocab_file, file_type=vocab_file_type)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    word_fd = nltk.FreqDist(data.all_train_tokens())
    bigram_fd = nltk.FreqDist(nltk.bigrams(data.all_train_tokens()))
    finder = nltk.collocations.BigramCollocationFinder(word_fd, bigram_fd)
    finder.apply_freq_filter(5)
    pmi = finder.score_ngrams(bigram_measures.pmi)
    collocations = [ (x,y) for x,y in pmi if y > 0 ]
    for (w1, w2), _ in collocations:
        f.add(w1 + " " + w2)
    f.stop_growth()
    return f

def count_bigram_features(features, gen_tokens) :
    prev = None
    for t in gen_tokens :
        r = features.add(t)
        if r :
            yield r
            if prev :
                r = features.add(prev + " " + t)
                if r : 
                    yield r
            prev = t
            
bigram_data = newsreader.DataManager(train_dir + class1,
                                       train_dir + class2,
                                       test_dir + class1,
                                       test_dir + class2,
                                       use_bigram_features,
                                       make_boolean_features(count_bigram_features),
                                       dev_dir + class1 if dev_dir else None,
                                       dev_dir + class2 if dev_dir else None,
                                       strip_metadata=has_bad_metadata)

bigram_data.initialize(build_cache=True)

### Learning with only certain percentage of the total train data

In [12]:
def limit_training(percent) :
    def operation(X, y, s) :
        if s != 'train' :
            return (X, y)
        data_to_take = int(X.shape[0] * percent)
        indices = np.random.choice(X.shape[0], 
                                      size=data_to_take,
                                      replace=False)
        return (X[indices,:], y[indices])
    return operation

# Start of Experiments

### Logistic Regression with Number of Counts for Each Words

In [13]:
if selected("expt_10_"):
    expt_10_ = Experiment(count_data,
                       "{}: {} vs {}, using word count features".format(all_data, class1, class2),
                       sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                       "logistic regression")
    expt_10_.initialize()
    expt_10_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word count features
classified by logistic regression
average accuracy 0.8772727272727273 (std 0.017647884050157735)


### Logistic Regression with Booalean value - Presence/Absence for Each Words

In [14]:
if selected("expt_11_") :
    expt_11_ = Experiment(boolean_data,
                         "{}: {} vs {}, using word presence/absence features".format(all_data, class1, class2),
                         sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                         "logistic regression")
    expt_11_.initialize()
    expt_11_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word presence/absence features
classified by logistic regression
average accuracy 0.9265151515151514 (std 0.010285561542634498)


### Logistic Regression with TF-IDF Counts

In [15]:
if selected("expt_12_") :
    expt_12_ = Experiment.transform(expt_10_,
                             idf_weights(expt_10_),
                             "features weighted by inverse document frequency",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                             "logistic regression")
    expt_12_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word count features
features weighted by inverse document frequency
classified by logistic regression
average accuracy 0.8772727272727272 (std 0.01698502194020227)


### Bi-Gram 

In [16]:
if selected("expt_13_") :
    expt_13_ = Experiment(bigram_data,
                   "{}: {} vs {}, using word and bigram presence/absence features".format(all_data, class1, class2),
                   sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                   "logistic regression")
    expt_13_.initialize()
    expt_13_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word and bigram presence/absence features
classified by logistic regression
average accuracy 0.9247474747474748 (std 0.009569341071462834)


### Number of Counts + Wang - Manning Weights

In [17]:
if selected("expt_10_100_") :
    expt_10_100_ = Experiment.transform(expt_10_,
                             wang_manning_weights(expt_10_),
                             "features weighted by evidence they give of class",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                             "logistic regression")
    expt_10_100_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word count features
features weighted by evidence they give of class
classified by logistic regression
average accuracy 0.9330808080808082 (std 0.011722721604479965)


### Boolean (Presence/Absence) + Wang - Manning Weights

In [18]:
if selected("expt_11_101_") :
    expt_11_101_ = Experiment.transform(expt_11_,
                             wang_manning_weights(expt_11_),
                             "features weighted by evidence they give of class",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                             "logistic regression")
    expt_11_101_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word presence/absence features
features weighted by evidence they give of class
classified by logistic regression
average accuracy 0.9436868686868685 (std 0.00559558580963644)


### TF-IDF + Wang - Manning Weights

In [19]:
if selected("expt_12_102_") :
    expt_12_102_ = Experiment.transform(expt_12_,
                             wang_manning_weights(expt_12_),
                             "features weighted by evidence they give of class",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                             "logistic regression")
    expt_12_102_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word count features
features weighted by inverse document frequency
features weighted by evidence they give of class
classified by logistic regression
average accuracy 0.9358585858585858 (std 0.015410248790295317)


### Bi-Gram + Wang - Manning Weights

In [20]:
if selected("expt_13_103_") :
    expt_13_103_ = Experiment.transform(expt_13_,
                             wang_manning_weights(expt_13_),
                             "features weighted by evidence they give of class",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                             "logistic regression")
    expt_13_103_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word and bigram presence/absence features
features weighted by evidence they give of class
classified by logistic regression
average accuracy 0.9502525252525252 (std 0.004863979869762186)


### Number of Counts + GloVe

In [21]:
if selected("expt_10_200_") :
    expt_10_200_ = Experiment.transform(expt_10_,
                             add_embeddings(expt_10_, e),
                             "enriched via word embeddings",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                            "logistic regression")
    expt_10_200_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word count features
enriched via word embeddings
classified by logistic regression
average accuracy 0.8722222222222221 (std 0.01447126139585293)


### Boolean (Presence/Absence) + GloVe

In [22]:
if selected("expt_11_201_") :
    expt_11_201_ = Experiment.transform(expt_11_,
                             add_embeddings(expt_11_, e),
                             "enriched via word embeddings",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                            "logistic regression")
    expt_11_201_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word presence/absence features
enriched via word embeddings
classified by logistic regression
average accuracy 0.9404040404040405 (std 0.011360830181012126)


### TF-IDF + GloVe

In [23]:
if selected("expt_12_202_") :
    expt_12_202_ = Experiment.transform(expt_12_,
                             add_embeddings(expt_12_, e),
                             "enriched via word embeddings",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                            "logistic regression")
    expt_12_202_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word count features
features weighted by inverse document frequency
enriched via word embeddings
classified by logistic regression
average accuracy 0.8770202020202019 (std 0.020345136999554963)


### Bi-Gram + GloVe

In [24]:
if selected("expt_13_203_") :
    expt_13_203_ = Experiment.transform(expt_13_,
                             add_embeddings(expt_13_, e),
                             "enriched via word embeddings",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                            "logistic regression")
    expt_13_203_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word and bigram presence/absence features
enriched via word embeddings
classified by logistic regression
average accuracy 0.9371212121212121 (std 0.008522493449841514)


### Number of Counts + LSA (SVD)

In [25]:
if selected ("expt_10_300_") :
    expt_10_300_ = Experiment.transform(expt_10_,
                             dimensionality_reduction(expt_10_, 100),
                             "transformed via LSA(100)",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                           "logistic regression")
    expt_10_300_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word count features
transformed via LSA(100)
classified by logistic regression
average accuracy 0.9030303030303033 (std 0.008541179053175654)


### Boolean (Presence/Absence) + LSA (SVD)

In [26]:
if selected ("expt_11_301_") :
    expt_11_301_ = Experiment.transform(expt_11_,
                             dimensionality_reduction(expt_11_, 100),
                             "transformed via LSA(100)",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                           "logistic regression")
    expt_11_301_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word presence/absence features
transformed via LSA(100)
classified by logistic regression
average accuracy 0.9265151515151515 (std 0.0064825745722189064)


### TF-IDF + LSA (SVD)

In [27]:
if selected ("expt_12_302_") :
    expt_12_302_ = Experiment.transform(expt_12_,
                             dimensionality_reduction(expt_12_, 100),
                             "transformed via LSA(100)",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                           "logistic regression")
    expt_12_302_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word count features
features weighted by inverse document frequency
transformed via LSA(100)
classified by logistic regression
average accuracy 0.9098484848484848 (std 0.009619190381182268)


### Bi-Gram + LSA (SVD)

In [28]:
if selected ("expt_13_303_") :
    expt_13_303_ = Experiment.transform(expt_13_,
                             dimensionality_reduction(expt_13_, 100),
                             "transformed via LSA(100)",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                           "logistic regression")
    expt_13_303_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word and bigram presence/absence features
transformed via LSA(100)
classified by logistic regression
average accuracy 0.9156565656565657 (std 0.005321037248915526)


### Number of Counts + GloVe + LSA (SVD)

In [29]:
if selected ("expt_10_200_100_") :
    expt_10_200_100_ = Experiment.transform(expt_10_200_,
                             dimensionality_reduction(expt_10_200_, 100),
                             "transformed via LSA(100)",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                           "logistic regression")
    expt_10_200_100_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word count features
enriched via word embeddings
transformed via LSA(100)
classified by logistic regression
average accuracy 0.906060606060606 (std 0.00836007341275094)


### Boolean (Presence/Absence) + GloVe + LSA (SVD)

In [30]:
if selected ("expt_11_201_101_") :
    expt_11_201_101_ = Experiment.transform(expt_11_201_,
                             dimensionality_reduction(expt_11_201_, 100),
                             "transformed via LSA(100)",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                           "logistic regression")
    expt_11_201_101_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word presence/absence features
enriched via word embeddings
transformed via LSA(100)
classified by logistic regression
average accuracy 0.9315656565656567 (std 0.00297722881882618)


### TF-IDF + GloVe + LSA (SVD)

In [31]:
if selected ("expt_12_202_102_") :
    expt_12_202_102_ = Experiment.transform(expt_12_202_,
                             dimensionality_reduction(expt_12_202_, 100),
                             "transformed via LSA(100)",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                           "logistic regression")
    expt_12_202_102_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word count features
features weighted by inverse document frequency
enriched via word embeddings
transformed via LSA(100)
classified by logistic regression
average accuracy 0.9136363636363635 (std 0.009961152991573735)


### BiGram + GloVe + LSA (SVD)

In [32]:
if selected ("expt_13_203_103_") :
    expt_13_203_103_ = Experiment.transform(expt_13_203_,
                             dimensionality_reduction(expt_13_203_, 100),
                             "transformed via LSA(100)",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                           "logistic regression")
    expt_13_203_103_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word and bigram presence/absence features
enriched via word embeddings
transformed via LSA(100)
classified by logistic regression
average accuracy 0.9234848484848485 (std 0.004000752403473432)


### Number of Counts + GloVe + Wang - Manning Weights

In [33]:
if selected("expt_10_200_200_") :
    expt_10_200_200_ = Experiment.transform(expt_10_200_,
                             wang_manning_weights(expt_10_200_),
                             "features weighted by evidence they give of class",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                             "logistic regression")
    expt_10_200_200_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word count features
enriched via word embeddings
features weighted by evidence they give of class
classified by logistic regression
average accuracy 0.9237373737373739 (std 0.011394458760281294)


### Boolean (Presence/Absence) + GloVe + Wang - Manning Weights

In [34]:
if selected("expt_11_201_201_") :
    expt_11_201_201_ = Experiment.transform(expt_11_201_,
                             wang_manning_weights(expt_11_201_),
                             "features weighted by evidence they give of class",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                             "logistic regression")
    expt_11_201_201_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word presence/absence features
enriched via word embeddings
features weighted by evidence they give of class
classified by logistic regression
average accuracy 0.9436868686868685 (std 0.004863979869762164)


### TF-IDF + GloVe + Wang - Manning Weights

In [35]:
if selected("expt_12_202_202_") :
    expt_12_202_202_ = Experiment.transform(expt_12_202_,
                             wang_manning_weights(expt_12_202_),
                             "features weighted by evidence they give of class",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                             "logistic regression")
    expt_12_202_202_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word count features
features weighted by inverse document frequency
enriched via word embeddings
features weighted by evidence they give of class
classified by logistic regression
average accuracy 0.9308080808080808 (std 0.014293910806146363)


### BiGram + GloVe + Wang - Manning Weights

In [36]:
if selected("expt_13_203_203_") :
    expt_13_203_203_ = Experiment.transform(expt_13_203_,
                             wang_manning_weights(expt_13_203_),
                             "features weighted by evidence they give of class",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                             "logistic regression")
    expt_13_203_203_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word and bigram presence/absence features
enriched via word embeddings
features weighted by evidence they give of class
classified by logistic regression
average accuracy 0.9477272727272726 (std 0.005362818328685694)


### Number of Counts + x% of train data

In [37]:
if selected("expt_10_400_") :
    expt_10_400_ = Experiment.transform(expt_10_,
                             limit_training(0.1),
                             "considering 10% training data",
                            sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                            "logistic regression")
    expt_10_400_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word count features
considering 10% training data
classified by logistic regression
average accuracy 0.781060606060606 (std 0.040633031647807745)


### Boolean (Presence/Abscence) + x% of train data

In [38]:
if selected("expt_11_401_") :
    expt_11_401_ = Experiment.transform(expt_11_,
                             limit_training(0.1),
                             "considering 10% training data",
                            sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                            "logistic regression")
    expt_11_401_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word presence/absence features
considering 10% training data
classified by logistic regression
average accuracy 0.8803030303030303 (std 0.027896897938669653)


### TF-IDF + x% of train data

In [39]:
if selected("expt_12_402_") :
    expt_12_402_ = Experiment.transform(expt_12_,
                             limit_training(0.1),
                             "considering 10% training data",
                            sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                            "logistic regression")
    expt_12_402_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word count features
features weighted by inverse document frequency
considering 10% training data
classified by logistic regression
average accuracy 0.709848484848485 (std 0.037129800422028326)


### BiGram + x% of train data

In [40]:
if selected("expt_13_403_") :
    expt_13_403_ = Experiment.transform(expt_13_,
                             limit_training(0.1),
                             "considering 10% training data",
                            sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                            "logistic regression")
    expt_13_403_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word and bigram presence/absence features
considering 10% training data
classified by logistic regression
average accuracy 0.8699494949494951 (std 0.024372284760826392)


### Number of Counts  + x% of train data + Wang - Manning Weights

In [41]:
if selected("expt_10_400_100_") :
    expt_10_400_100_ = Experiment.transform(expt_10_400_,
                             wang_manning_weights(expt_10_400_),
                             "features weighted by evidence they give of class",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                             "logistic regression")
    expt_10_400_100_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word count features
considering 10% training data
features weighted by evidence they give of class
classified by logistic regression
average accuracy 0.8957070707070709 (std 0.024393207425703917)


### Boolean (Presence/Absence)  + x% of train data + Wang - Manning Weights

In [42]:
if selected("expt_11_401_101_") :
    expt_11_401_101_ = Experiment.transform(expt_11_401_,
                             wang_manning_weights(expt_11_401_),
                             "features weighted by evidence they give of class",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                             "logistic regression")
    expt_11_401_101_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word presence/absence features
considering 10% training data
features weighted by evidence they give of class
classified by logistic regression
average accuracy 0.8984848484848487 (std 0.019091977950945246)


### TF-IDF + x% of train data + Wang - Manning Weights

In [43]:
if selected("expt_12_402_102_") :
    expt_12_402_102_ = Experiment.transform(expt_12_402_,
                             wang_manning_weights(expt_12_402_),
                             "features weighted by evidence they give of class",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                             "logistic regression")
    expt_12_402_102_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word count features
features weighted by inverse document frequency
considering 10% training data
features weighted by evidence they give of class
classified by logistic regression
average accuracy 0.8767676767676769 (std 0.030193404399262883)


### Bi-Gram + x% of train data + Wang - Manning Weights

In [44]:
if selected("expt_13_403_103_") :
    expt_13_403_103_ = Experiment.transform(expt_13_403_,
                             wang_manning_weights(expt_13_403_),
                             "features weighted by evidence they give of class",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                             "logistic regression")
    expt_13_403_103_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word and bigram presence/absence features
considering 10% training data
features weighted by evidence they give of class
classified by logistic regression
average accuracy 0.8838383838383839 (std 0.02856997095703222)


### Number of Counts + x% of train data + GloVe

In [45]:
if selected("expt_10_400_200_") :
    expt_10_400_200_ = Experiment.transform(expt_10_400_,
                             add_embeddings(expt_10_400_, e),
                             "enriched via word embeddings",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                            "logistic regression")
    expt_10_400_200_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word count features
considering 10% training data
enriched via word embeddings
classified by logistic regression
average accuracy 0.8055555555555557 (std 0.0266530804079975)


### Boolean (Presence/Absence) + x% of train data + GloVe

In [46]:
if selected("expt_11_401_201_") :
    expt_11_401_201_ = Experiment.transform(expt_11_401_,
                             add_embeddings(expt_11_401_, e),
                             "enriched via word embeddings",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                            "logistic regression")
    expt_11_401_201_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word presence/absence features
considering 10% training data
enriched via word embeddings
classified by logistic regression
average accuracy 0.9194444444444446 (std 0.012212931605397716)


### TF-IDF + x% of train data + GloVe

In [47]:
if selected("expt_12_402_202_") :
    expt_12_402_202_ = Experiment.transform(expt_12_402_,
                             add_embeddings(expt_12_402_, e),
                             "enriched via word embeddings",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                            "logistic regression")
    expt_12_402_202_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word count features
features weighted by inverse document frequency
considering 10% training data
enriched via word embeddings
classified by logistic regression
average accuracy 0.7101010101010101 (std 0.03890528317691062)


### Bi-Gram + x% of train data + GloVe

In [48]:
if selected("expt_13_403_203_") :
    expt_13_403_203_ = Experiment.transform(expt_13_403_,
                             add_embeddings(expt_13_403_, e),
                             "enriched via word embeddings",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                            "logistic regression")
    expt_13_403_203_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word and bigram presence/absence features
considering 10% training data
enriched via word embeddings
classified by logistic regression
average accuracy 0.901767676767677 (std 0.022949252648457704)


### Number of Counts  + x% of train data + GloVe + Wang - Manning Weights

In [49]:
if selected("expt_10_400_100_100_") :
    expt_10_400_100_100_ = Experiment.transform(expt_10_400_200_,
                             wang_manning_weights(expt_10_400_200_),
                             "features weighted by evidence they give of class",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                             "logistic regression")
    expt_10_400_100_100_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word count features
considering 10% training data
enriched via word embeddings
features weighted by evidence they give of class
classified by logistic regression
average accuracy 0.8866161616161617 (std 0.03065764729424961)


### Boolean (Presence/Absence)  + x% of train data + GloVe + Wang - Manning Weights

In [50]:
if selected("expt_11_401_101_101_") :
    expt_11_401_101_101_ = Experiment.transform(expt_11_401_201_,
                             wang_manning_weights(expt_11_401_201_),
                             "features weighted by evidence they give of class",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                             "logistic regression")
    expt_11_401_101_101_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word presence/absence features
considering 10% training data
enriched via word embeddings
features weighted by evidence they give of class
classified by logistic regression
average accuracy 0.9010101010101013 (std 0.020786985482077438)


### TF-IDF + x% of train data + GloVe + Wang - Manning Weights

In [51]:
if selected("expt_12_402_102_102_") :
    expt_12_402_102_102_ = Experiment.transform(expt_12_402_202_,
                             wang_manning_weights(expt_12_402_202_),
                             "features weighted by evidence they give of class",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                             "logistic regression")
    expt_12_402_102_102_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word count features
features weighted by inverse document frequency
considering 10% training data
enriched via word embeddings
features weighted by evidence they give of class
classified by logistic regression
average accuracy 0.8707070707070705 (std 0.023548572175504587)


### Bi-Gram + x% of train data + GloVe + Wang - Manning Weights

In [52]:
if selected("expt_13_403_103_103_") :
    expt_13_403_103_103_ = Experiment.transform(expt_13_403_203_,
                             wang_manning_weights(expt_13_403_203_),
                             "features weighted by evidence they give of class",
                             sklearn.linear_model.SGDClassifier(loss="log",
                                       penalty="elasticnet",
                                       max_iter=50),
                             "logistic regression")
    expt_13_403_103_103_.xval()

20-news-bydate: rec.sport.hockey vs comp.windows.x, using word and bigram presence/absence features
considering 10% training data
enriched via word embeddings
features weighted by evidence they give of class
classified by logistic regression
average accuracy 0.8916666666666668 (std 0.024766793119504703)
