# Baseline

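Attempt to run the classification task using a "traditional" pipeline: TF-IDF weighted bag-of-words features fed to a Naive Bayes classifier and to a linear SVM trained with SGD.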

In [1]:
from __future__ import division, print_function
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import logging
%matplotlib inline

In [2]:
# Directory handed to scikit-learn as the download/cache location for the corpus.
DATA_DIR = "../data"

# VOCAB_SIZE caps the number of bag-of-words features kept by the vectorizer;
# NUM_CLASSES is the number of target categories (not referenced in the cells
# visible here).
VOCAB_SIZE, NUM_CLASSES = 40000, 20

# Install the default root logging handler so library log records are shown.
logging.basicConfig()

## Data Generation

In [3]:
# Load every post (train and test subsets together) of the 20 Newsgroups
# corpus, caching it under DATA_DIR. The shuffle is seeded so document order
# is the same on every run.
# NOTE(review): no `remove=` argument is passed, so posts are used exactly as
# scikit-learn delivers them -- confirm keeping message metadata is intended
# for this baseline.
ng_data = fetch_20newsgroups(
    data_home=DATA_DIR,
    subset='all',
    random_state=42,
    shuffle=True,
)

# Report the corpus size as a quick sanity check.
num_docs = len(ng_data.data)
print(num_docs)

18846


In [4]:
# Labels: one integer class id per document.
y = np.array(ng_data.target)

# Step 1: raw term counts, keeping at most VOCAB_SIZE features.
cvec = CountVectorizer(max_features=VOCAB_SIZE)
Xc = cvec.fit_transform(ng_data.data)

# Step 2: re-weight the counts with TF-IDF.
# NOTE(review): both transformers are fit on the full corpus, before the
# train/test split performed in a later cell, so the vocabulary and IDF
# weights are computed with test documents included.
tfidf = TfidfTransformer()
X = tfidf.fit_transform(Xc)

# Feature matrix is (documents, features); labels are (documents,).
print(X.shape, y.shape)

(18846, 40000) (18846,)


In [5]:
# Hold out 30% of the documents for evaluation.
# random_state is fixed so that "Restart & Run All" reproduces the same
# partition (and therefore the same accuracy figures below); without it the
# split is reshuffled differently on every execution.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, train_size=0.7, random_state=42
)
print(Xtrain.shape, ytrain.shape, Xtest.shape, ytest.shape)

(13192, 40000) (13192,) (5654, 40000) (5654,)


## Naive Bayes Classifier

In [6]:
# Multinomial Naive Bayes on the TF-IDF features; fit() returns the fitted
# estimator, so construction and training are combined in one expression.
clf = MultinomialNB().fit(Xtrain, ytrain)

# Score the held-out documents.
ytest_ = clf.predict(Xtest)
nb_accuracy = accuracy_score(ytest, ytest_)
print("accuracy: {:.3f}".format(nb_accuracy))

accuracy: 0.859


## Support Vector Machine Classifier

In [7]:
# Linear SVM trained with stochastic gradient descent.
# loss="hinge" is SGDClassifier's default and is what makes this an SVM; it is
# spelled out so the code matches the section title.
# random_state is fixed because SGD shuffles the training data each epoch;
# without a seed the reported accuracy changes from run to run.
clf = SGDClassifier(loss="hinge", random_state=42)
clf.fit(Xtrain, ytrain)

# Score the held-out documents.
ytest_ = clf.predict(Xtest)
print("accuracy: {:.3f}".format(accuracy_score(ytest, ytest_)))

accuracy: 0.918
