# Baseline

Attempt the classification task using a "traditional" pipeline: bag-of-words TF-IDF features fed to Naive Bayes and linear SVM classifiers.

In [1]:
from __future__ import division, print_function
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import logging
%matplotlib inline

In [2]:
# Install the default handler on the root logger so library log records are shown.
logging.basicConfig()

# Directory passed to scikit-learn as the dataset download/cache location.
DATA_DIR = "../data"

# Upper bound on the number of vocabulary terms kept by the count vectorizer.
VOCAB_SIZE = 40000
# Number of target categories (presumably the 20 newsgroups; not referenced
# by the cells shown here).
NUM_CLASSES = 20

## Data Generation

In [3]:
# Load the complete 20 Newsgroups corpus (train and test subsets together),
# using DATA_DIR as the cache location. The documents are shuffled with a
# fixed seed so their order is the same on every run.
# NOTE(review): `remove` is left at its default, so headers, footers and
# quoted replies stay in the text; confirm that is intended for this baseline.
ng_data = fetch_20newsgroups(
    data_home=DATA_DIR,
    subset='all',
    random_state=42,
    shuffle=True,
)
num_docs = len(ng_data.data)
print(num_docs)

18846


In [25]:
# Build the feature matrix: raw term counts restricted to the VOCAB_SIZE most
# frequent terms, then re-weighted with TF-IDF.
# NOTE(review): the vocabulary and the IDF weights are fitted on *all*
# documents before the train/test split below, so test-set statistics leak
# into the features. For an unbiased estimate, split the raw documents first
# and call fit_transform on the training part only (transform on the test part).
cvec = CountVectorizer(max_features=VOCAB_SIZE)
tfidf = TfidfTransformer()
Xc = cvec.fit_transform(ng_data.data)
X = tfidf.fit_transform(Xc)
# Report the container type only; printing a row of the sparse matrix would
# dump every non-zero (index, weight) pair of that document.
print(type(X))
# Integer class label for each document, aligned with the rows of X.
y = np.array(ng_data.target)
print(X.shape, y.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(18846, 40000) (18846,)


In [5]:
# Hold out 30% of the documents for evaluation. The split is seeded so the
# partition, and therefore every accuracy reported below, is reproducible
# across kernel restarts.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, train_size=0.7, random_state=42)
print(Xtrain.shape, ytrain.shape, Xtest.shape, ytest.shape)

(13192, 40000) (13192,) (5654, 40000) (5654,)


## Naive Bayes Classifier

In [6]:
# Multinomial Naive Bayes on the TF-IDF features; fit() returns the estimator
# itself, so construction and training can be chained.
clf = MultinomialNB().fit(Xtrain, ytrain)
# Predicted labels for the held-out documents.
ytest_ = clf.predict(Xtest)
nb_accuracy = accuracy_score(ytest, ytest_)
print("accuracy: {:.3f}".format(nb_accuracy))

accuracy: 0.863


## Support Vector Machine Classifier

In [7]:
# Linear SVM trained with stochastic gradient descent. The hinge loss is
# SGDClassifier's default and is spelled out here because it is what makes
# this model an SVM. SGD shuffles the training data between epochs, so the
# estimator is seeded to make the reported accuracy reproducible.
clf = SGDClassifier(loss="hinge", random_state=42)
clf.fit(Xtrain, ytrain)
# Predicted labels for the held-out documents.
ytest_ = clf.predict(Xtest)
print("accuracy: {:.3f}".format(accuracy_score(ytest, ytest_)))

accuracy: 0.918


In [30]:
# Toy example showing what the CountVectorizer -> TfidfTransformer pipeline
# produces on a four-sentence corpus. CountVectorizer and TfidfTransformer are
# already imported in the notebook's import cell, so they are not re-imported.
corpus = ['This is the first document.',
          'This is the second second document.',
          'And the third one.',
          'Is this the first document?']
vectorizer = CountVectorizer(max_features=50)
transformer = TfidfTransformer()

# Stored under its own name so the fitted `tfidf` transformer created for the
# newsgroup features is not overwritten by this demo matrix.
demo_tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# when available and fall back on older releases. tolist() turns the returned
# array into plain Python strings so the printed list looks the same either way.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_name = vectorizer.get_feature_names_out().tolist()
else:
    feature_name = vectorizer.get_feature_names()

# Sparse printout: one "(document row, vocabulary index)  weight" line per
# non-zero entry, followed by the vocabulary in index order.
print(demo_tfidf)
print(feature_name)


  (0, 8)	0.4387767428592343
  (0, 6)	0.35872873824808993
  (0, 3)	0.4387767428592343
  (0, 2)	0.5419765697264572
  (0, 1)	0.4387767428592343
  (1, 8)	0.2723014675233404
  (1, 6)	0.22262429232510395
  (1, 5)	0.8532257361452786
  (1, 3)	0.2723014675233404
  (1, 1)	0.2723014675233404
  (2, 7)	0.5528053199908667
  (2, 6)	0.2884767487500274
  (2, 4)	0.5528053199908667
  (2, 0)	0.5528053199908667
  (3, 8)	0.4387767428592343
  (3, 6)	0.35872873824808993
  (3, 3)	0.4387767428592343
  (3, 2)	0.5419765697264572
  (3, 1)	0.4387767428592343
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
