In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import numpy as np

In [2]:
# Five newsgroups used as the target classes for this experiment.
categories = [
    'rec.autos',
    'sci.electronics',
    'comp.graphics',
    'alt.atheism',
    'sci.space',
]

# Fetch the train split first, then the test split, with identical settings.
# `remove` asks the loader to strip headers, footers and quoted replies from
# each post; the fixed random_state keeps the shuffle order reproducible.
data_train, data_test = (
    fetch_20newsgroups(
        subset=split_name,
        categories=categories,
        shuffle=True,
        random_state=2017,
        remove=('headers', 'footers', 'quotes'),
    )
    for split_name in ('train', 'test')
)

In [3]:
# Class labels for each split, taken straight from the loaded bunches.
y_train, y_test = data_train.target, data_test.target

In [4]:
# TF-IDF feature extractor shared by every classifier in this notebook.
vectorizer = TfidfVectorizer(
    stop_words='english',   # drop scikit-learn's built-in English stop words
    ngram_range=(1, 2),     # use unigrams and bigrams as features
    max_df=0.5,             # ignore terms present in more than half the documents
    sublinear_tf=True,      # scale term frequency as 1 + log(tf)
    smooth_idf=True,        # add one to document frequencies (avoids zero division)
)

In [5]:
# Fit the vectorizer on the training documents only and build the training
# feature matrix in the same step.
X_train = vectorizer.fit_transform(data_train.data)
# Re-use the already fitted vectorizer for the test documents so both matrices
# share the same feature columns; this line must run after the fit above.
X_test = vectorizer.transform(data_test.data)

In [6]:
# Summarise the training split: corpus size, number of classes and the
# (samples, features) shape of the vectorized matrix, one item per line.
print(
    "Train Dataset",
    "%d documents" % len(data_train.data),
    "%d categories" % len(data_train.target_names),
    "Number of samples: %d, number of features: %d" % X_train.shape,
    sep="\n",
)

Train Dataset
2842 documents
5 categories
Number of samples: 2842, number of features: 219812


In [7]:
# Summarise the test split: corpus size, number of classes and the
# (samples, features) shape of the vectorized matrix, one item per line.
print(
    "Test Dataset",
    "%d documents" % len(data_test.data),
    "%d categories" % len(data_test.target_names),
    "Number of samples: %d, number of features: %d" % X_test.shape,
    sep="\n",
)

Test Dataset
1891 documents
5 categories
Number of samples: 1891, number of features: 219812


## Naive Bayes Classifier

In [8]:
# Naive Bayes classification model:
from sklearn.naive_bayes import MultinomialNB

In [9]:
# Multinomial Naive Bayes with default hyperparameters, trained on the TF-IDF
# features. `fit` hands back the estimator, so construction and training chain.
clf_NB = MultinomialNB().fit(X_train, y_train)

In [10]:
# Predict labels for the training matrix first, then the test matrix.
y_train_pred, y_test_pred = (
    clf_NB.predict(features) for features in (X_train, X_test)
)

In [11]:
# Accuracy of the Naive Bayes predictions on each split; comparing the two
# numbers shows how much the model overfits the training data.
print(
    "Train accuracy score:",
    metrics.accuracy_score(y_true=y_train, y_pred=y_train_pred),
)
print(
    "Test accuracy score:",
    metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred),
)

Train accuracy score: 0.975721323012
Test accuracy score: 0.804336329984


In [12]:
# Per-class precision, recall, F1 and support for the Naive Bayes predictions,
# training split first and test split second.
print(
    "Classification Report(Train)",
    metrics.classification_report(y_true=y_train, y_pred=y_train_pred),
    "Classification Report(Test)",
    metrics.classification_report(y_true=y_test, y_pred=y_test_pred),
    sep="\n",
)

Classification Report(Train)
             precision    recall  f1-score   support

          0       1.00      0.97      0.98       480
          1       0.99      0.97      0.98       584
          2       0.90      1.00      0.95       594
          3       1.00      0.97      0.98       591
          4       1.00      0.97      0.98       593

avg / total       0.98      0.98      0.98      2842

Classification Report(Test)
             precision    recall  f1-score   support

          0       0.93      0.70      0.80       319
          1       0.79      0.87      0.83       389
          2       0.76      0.89      0.82       396
          3       0.87      0.69      0.77       393
          4       0.75      0.85      0.80       394

avg / total       0.82      0.80      0.80      1891



## Support Vector Machines

In [13]:
from sklearn.svm import SVC

In [14]:
# RBF-kernel support vector classifier trained on the TF-IDF features.
# `fit` hands back the estimator, so construction and training chain.
svm_cls = SVC(C=1000, kernel='rbf', gamma=1.0).fit(X_train, y_train)

In [15]:
# Predict labels for the training matrix first, then the test matrix.
# NOTE: these names are rebound here, replacing the Naive Bayes predictions.
y_train_pred, y_test_pred = (
    svm_cls.predict(features) for features in (X_train, X_test)
)

In [16]:
# Accuracy of the SVM predictions on each split; comparing the two numbers
# shows how much the model overfits the training data.
print(
    "Train accuracy score:",
    metrics.accuracy_score(y_true=y_train, y_pred=y_train_pred),
)
print(
    "Test accuracy score:",
    metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred),
)

Train accuracy score: 0.978184377199
Test accuracy score: 0.797461660497


In [17]:
# Per-class precision, recall, F1 and support for the SVM predictions,
# training split first and test split second.
print(
    "Classification Report(Train)",
    metrics.classification_report(y_true=y_train, y_pred=y_train_pred),
    "Classification Report(Test)",
    metrics.classification_report(y_true=y_test, y_pred=y_test_pred),
    sep="\n",
)

Classification Report(Train)
             precision    recall  f1-score   support

          0       1.00      0.97      0.99       480
          1       1.00      0.97      0.99       584
          2       0.91      1.00      0.95       594
          3       1.00      0.97      0.99       591
          4       1.00      0.97      0.99       593

avg / total       0.98      0.98      0.98      2842

Classification Report(Test)
             precision    recall  f1-score   support

          0       0.88      0.75      0.81       319
          1       0.80      0.84      0.82       389
          2       0.77      0.87      0.82       396
          3       0.78      0.73      0.75       393
          4       0.79      0.79      0.79       394

avg / total       0.80      0.80      0.80      1891



## References

http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html