In [19]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np

In [20]:
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

In [21]:
text = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state = 42)

In [22]:
# len(text.data)
type(text.data)

list

In [23]:
len(text.target)

2257

In [24]:
text.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [25]:
print("\n".join( text.data[0].split('\n')[0:3] ))

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [26]:
print(text.target_names[text.target[0]])

comp.graphics


# Count Sparse Matrix 

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_count = count_vect.fit_transform(text.data)
X_count.shape

(2257, 35788)

# TF-IDF Sparse Matrix

In [28]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_count)
X_train_tfidf.shape

(2257, 35788)

# Naive Bayes Text Classifier

In [30]:
from sklearn.naive_bayes import MultinomialNB

# Training
clf = MultinomialNB().fit(X_train_tfidf, text.target)

# Testing
docs_test = ['God is love', 'OpenGL on the GPU is fast']
X_test_count = count_vect.transform(docs_test)
X_test_tfidf = tfidf_transformer.transform(X_test_count)

print(X_test_tfidf.shape)

predicted = clf.predict(X_test_tfidf)

i=0
for p in predicted:
    print (docs_test[i] + ": " + text.target_names[p])
    i += 1

(2, 35788)
God is love: soc.religion.christian
OpenGL on the GPU is fast: comp.graphics


# Stemmer

In [31]:
from nltk.stem.porter import *
analyzer = CountVectorizer().build_analyzer()

stemmer = PorterStemmer()

documents = ['trying', 'going']
documents = [[stemmer.stem(word) for word in sentence.split(" ")] for sentence in documents]
documents

[[u'tri'], [u'go']]

# Building a pipeline - Naive Bayes

In [32]:
from sklearn.pipeline import Pipeline

# Training
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf = text_clf.fit(text.data, text.target)

# Evaluation
text_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)
predicted = text_clf.predict(text_test.data)
np.mean(predicted == text_test.target)

0.83488681757656458

# SVM Classifier

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

# Training
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42))])

text_clf = text_clf.fit(text.data, text.target)

# Evaluation
text_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)
predicted = text_clf.predict(text_test.data)
np.mean(predicted == text_test.target)

0.9127829560585885

# Performance Metrics


In [41]:
from sklearn import metrics

print(metrics.classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names))
print(metrics.confusion_matrix(twenty_test.target, predicted))

AttributeError: 'list' object has no attribute 'target'

# Grid Search - Parameter Turing

In [42]:
from sklearn.model_selection import GridSearchCV

parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
             }

clf = GridSearchCV(estimator = text_clf, param_grid = parameters, n_jobs = -1)
clf
clf.fit(text.data[0:400], text.target[0:400])



GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...     penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (0.01, 0.001)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [None]:

clf.best_params_

In [None]:
print(clf.predict(['card']))