naive_bayes

In [25]:
import numpy as np
from time import time
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.extmath import density
from sklearn import metrics

from sklearn.externals import joblib

# Category filter handed to the dataset loader; left as None here.
categories = None

# Parts of each post passed to the loader's `remove` option.
remove = ('headers', 'footers', 'quotes')

# The train and test subsets are fetched with identical options; only the
# `subset` argument differs between the two calls.
_fetch_options = dict(categories=categories, shuffle=True,
                      random_state=42, remove=remove)
data_train = fetch_20newsgroups(subset='train', **_fetch_options)
data_test = fetch_20newsgroups(subset='test', **_fetch_options)

target_names = data_train.target_names

# The dataset ships pre-split, so only the label arrays need to be pulled out.
y_train = data_train.target
y_test = data_test.target

print("Extracting features from the training data using a sparse vectorizer")

# The vectorizer is fitted on the training documents only; the test documents
# are transformed further down with that same fitted vectorizer.
vectorizer = TfidfVectorizer(sublinear_tf=True,
                             max_df=0.5,
                             stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)

_shape_report = "n_samples: %d, n_features: %d"

print(_shape_report % X_train.shape)
print()

X_test = vectorizer.transform(data_test.data)

print(_shape_report % X_test.shape)
print()

# Feature names reported by the vectorizer, converted to a NumPy array when
# the returned list is non-empty.
feature_names = vectorizer.get_feature_names()
feature_names = np.asarray(feature_names) if feature_names else feature_names


def trim(s):
    """Shorten ``s`` so it fits on an 80-column terminal line.

    Strings of 80 characters or fewer are returned unchanged. Longer strings
    are cut to their first 77 characters and suffixed with "...", giving a
    result that is exactly 80 characters long.
    """
    if len(s) > 80:
        return s[:77] + "..."
    return s


# #############################################################################
# Benchmark classifiers
def benchmark(clf, model_path='filename.pkl'):
    """Fit, persist and evaluate a single classifier, printing a report.

    The classifier is trained on the module-level ``X_train`` / ``y_train``
    and evaluated on the module-level ``X_test`` / ``y_test``. Timing,
    accuracy, a per-class classification report (labelled with the
    module-level ``target_names``) and the confusion matrix are printed.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. If it also has a
        ``coef_`` attribute, its dimensionality and density are printed.
    model_path : str, optional
        Where the fitted classifier is dumped with joblib. Defaults to
        ``'filename.pkl'``; pass a distinct path per classifier to avoid
        overwriting an earlier dump.

    Returns
    -------
    tuple
        ``(clf_descr, score, train_time, test_time)`` where ``clf_descr`` is
        the text of ``str(clf)`` before the first ``'('``, ``score`` is the
        accuracy on the test set, and the two times are wall-clock seconds
        spent in ``fit`` and ``predict`` respectively.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    # Persist the fitted model once, via joblib. This happens outside both
    # timed sections so it does not distort the reported train/test times.
    joblib.dump(clf, model_path)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    # Only estimators exposing a coefficient matrix get these two lines.
    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    print(metrics.classification_report(y_test, pred,
                                        target_names=target_names))

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    print()
    # Keep only the text before the first '(' of the estimator's string form.
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time

# Placeholder for benchmark() return tuples; nothing below appends to it
# (the only append is in the commented-out BernoulliNB line).
results = list()

# Train sparse Naive Bayes classifiers
print(80 * '=')
print("Naive Bayes")

# Bare call kept as the final expression of the cell, so its return value is
# not stored anywhere.
benchmark(MultinomialNB(alpha=0.01))
# results.append(benchmark(BernoulliNB(alpha=.01)))



Extracting features from the training data using a sparse vectorizer
n_samples: 11314, n_features: 101322
()
n_samples: 7532, n_features: 101322
()
Naive Bayes
________________________________________________________________________________
Training: 
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
train time: 0.126s
test time:  0.045s
accuracy:   0.696
dimensionality: 101322
density: 1.000000
classification report:
                          precision    recall  f1-score   support

             alt.atheism       0.56      0.43      0.49       319
           comp.graphics       0.65      0.71      0.68       389
 comp.os.ms-windows.misc       0.75      0.46      0.57       394
comp.sys.ibm.pc.hardware       0.59      0.72      0.65       392
   comp.sys.mac.hardware       0.71      0.69      0.70       385
          comp.windows.x       0.79      0.75      0.77       395
            misc.forsale       0.81      0.72      0.76       390
               rec.autos       0.76    

('MultinomialNB',
 0.69649495485926716,
 0.12599992752075195,
 0.04499983787536621)