# Baseline

Attempt the classification task using a "traditional" pipeline: bag-of-words counts, TF-IDF weighting, and a linear classifier (Naive Bayes, then a linear SVM).

In [2]:
from __future__ import division, print_function
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import logging
%matplotlib inline

In [17]:
# Install the default root-logger handler so library log records are shown.
logging.basicConfig()

# Directory handed to the dataset fetcher as its download/cache location.
DATA_DIR = "../data"

# Upper bound on the number of vocabulary terms kept by the count vectorizer.
VOCAB_SIZE = 40000
# Presumably the number of newsgroup categories in the dataset -- not used
# by the cells visible here; TODO confirm against later cells.
NUM_CLASSES = 20

## Data Generation

In [18]:
# Load both the train and test portions of 20 Newsgroups in one shuffled
# bunch; the fixed random_state keeps the document order repeatable.
ng_data = fetch_20newsgroups(
    data_home=DATA_DIR,
    subset='all',
    random_state=42,
    shuffle=True,
)

# Number of raw documents in the corpus, printed as a sanity check.
num_docs = len(ng_data.data)
print(num_docs)

18846


In [19]:
# Integer class labels, one per document.
y = np.array(ng_data.target)

# Stage 1: raw term counts, vocabulary capped at VOCAB_SIZE terms.
cvec = CountVectorizer(max_features=VOCAB_SIZE)
Xc = cvec.fit_transform(ng_data.data)

# Stage 2: re-weight the counts with TF-IDF.
tfidf = TfidfTransformer()
X = tfidf.fit_transform(Xc)

# Show the non-zero entries of the first document, then the overall shapes.
print(X[0,:])
print(X.shape, y.shape)

  (0, 29802)	0.0229335667118623
  (0, 29359)	0.07229351306358578
  (0, 29231)	0.023420284027698857
  (0, 29089)	0.04100598982152166
  (0, 28840)	0.0715131532669709
  (0, 28824)	0.027997641243580357
  (0, 28386)	0.040641656982876036
  (0, 26981)	0.04352831896562306
  (0, 26920)	0.08217024456107144
  (0, 26732)	0.05426803993938286
  (0, 26729)	0.04155072621954815
  (0, 26674)	0.02855770615528225
  (0, 26613)	0.15827546781424923
  (0, 26598)	0.07190066656091562
  (0, 25920)	0.047020362381520096
  (0, 25679)	0.014804828580768283
  (0, 25379)	0.08166688190726255
  (0, 24905)	0.03205952306046901
  (0, 24516)	0.04434883441927549
  (0, 24393)	0.0781067894289065
  (0, 24382)	0.039721644015765
  (0, 23951)	0.040964551928748004
  (0, 23909)	0.14099793385384446
  (0, 23361)	0.07291696638547596
  (0, 22502)	0.1161938725019107
  :	:
  (0, 10248)	0.05369216248809362
  (0, 9935)	0.0419653895304571
  (0, 9103)	0.1020476893513173
  (0, 8887)	0.12257153973018296
  (0, 8885)	0.17393588261421533
  (0, 7716

In [5]:
# Hold out 30% of the documents for evaluation. The split is seeded so that
# the accuracies reported by the classifier cells below are reproducible
# across kernel restarts (an unseeded split changes on every run).
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, train_size=0.7, random_state=42)

# Sanity check: the two parts must partition the full dataset.
assert Xtrain.shape[0] + Xtest.shape[0] == X.shape[0]
print(Xtrain.shape, ytrain.shape, Xtest.shape, ytest.shape)

(13192, 40000) (13192,) (5654, 40000) (5654,)


## Naive Bayes Classifier

In [6]:
# Multinomial Naive Bayes on the TF-IDF features; fit() returns the fitted
# estimator, so construction and training are chained.
clf = MultinomialNB().fit(Xtrain, ytrain)

# Predicted labels for the held-out documents (trailing underscore = predicted).
ytest_ = clf.predict(Xtest)
print("accuracy: {:.3f}".format(
    accuracy_score(ytest, ytest_)))

accuracy: 0.863


## Support Vector Machine Classifier (linear SVM trained with SGD)

In [7]:
# SGDClassifier with its default hinge loss is a linear SVM fitted by
# stochastic gradient descent. The training data is shuffled between epochs,
# so the estimator is seeded to make the reported accuracy reproducible.
clf = SGDClassifier(random_state=42)
clf.fit(Xtrain, ytrain)

# Predicted labels for the held-out documents.
ytest_ = clf.predict(Xtest)
print("accuracy: {:.3f}".format(accuracy_score(ytest, ytest_)))

accuracy: 0.918


In [24]:
# Toy example showing what CountVectorizer and TfidfTransformer produce.
# Both classes are already imported in the notebook's import cell, so the
# duplicate in-cell imports are not repeated here.
corpus = ['This is the first document document.',
          'This is the second second second document. gg',
          'And the third one.',
          'Is this the first document?']
vectorizer = CountVectorizer(max_features=20)

# Fit the vocabulary once and reuse the resulting count matrix for both the
# printout and the TF-IDF step (the corpus is vectorized only one time).
counts = vectorizer.fit_transform(corpus)

# Legend for the sparse printout that follows. In English:
# "(i-th sentence, j-th word in feature_name)   occurs n times in the i-th sentence"
print('(第i句話, 在feature_name裡的第j個字)     在第i句話裡出現n次')
print(counts)
print('------------')

transformer = TfidfTransformer()
# NOTE: this rebinds the notebook-level name `tfidf` (the TfidfTransformer
# created in the vectorization cell) to a sparse matrix; re-run that cell
# before re-running anything that expects the transformer.
tfidf = transformer.fit_transform(counts)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    # Convert the returned array to plain strings so it prints as a list.
    feature_name = [str(term) for term in vectorizer.get_feature_names_out()]
else:
    feature_name = vectorizer.get_feature_names()
print (tfidf.shape)
print (feature_name)


(第i句話, 在feature_name裡的第j個字)     在第i句話裡出現n次
  (0, 1)	2
  (0, 2)	1
  (0, 7)	1
  (0, 4)	1
  (0, 9)	1
  (1, 3)	1
  (1, 6)	3
  (1, 1)	1
  (1, 7)	1
  (1, 4)	1
  (1, 9)	1
  (2, 5)	1
  (2, 8)	1
  (2, 0)	1
  (2, 7)	1
  (3, 1)	1
  (3, 2)	1
  (3, 7)	1
  (3, 4)	1
  (3, 9)	1
------------
(4, 10)
['and', 'document', 'first', 'gg', 'is', 'one', 'second', 'the', 'third', 'this']
