# News Article Text Classification

## Data Import

In [1]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the training split of the 20 Newsgroups corpus (downloaded on first use),
# with the documents shuffled.
twenty_train = fetch_20newsgroups(shuffle=True, subset="train")

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [2]:
# Inspect the type of the object returned by fetch_20newsgroups.
type(twenty_train)

sklearn.utils.Bunch

In [3]:
# List the names of the newsgroup categories in the dataset.
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [14]:
# Preview the first five lines of the first training document.
print(*twenty_train.data[0].split("\n")[:5], sep="\n")

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15


## Feature Extraction

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training documents and build the
# document-term count matrix in a single pass.
countvect = CountVectorizer()
X_train_counts = countvect.fit_transform(twenty_train.data)

# (n_documents, n_vocabulary_terms)
X_train_counts.shape

(11314, 130107)

In [18]:
# Check the type of the matrix produced by CountVectorizer.fit_transform.
type(X_train_counts)

scipy.sparse.csr.csr_matrix

In [19]:
# Display the count matrix's repr (shape, dtype, and number of stored elements).
X_train_counts

<11314x130107 sparse matrix of type '<class 'numpy.int64'>'
	with 1787565 stored elements in Compressed Sparse Row format>

In [20]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the IDF weights on the raw counts, then re-weight those same counts.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

# Same shape as the count matrix; only the values change.
X_train_tfidf.shape

(11314, 130107)

## Machine Learning Model (Multinomial Naive Bayes)

In [21]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier directly on the TF-IDF features.
clf = MultinomialNB().fit(X=X_train_tfidf, y=twenty_train.target)


In [23]:
from sklearn.pipeline import Pipeline

# Chain vectorizing, TF-IDF weighting and classification so that raw text can be
# passed straight to fit/predict.
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(nb_steps)
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [24]:
import numpy as np

# Load the held-out test split and score the Naive Bayes pipeline on it.
twenty_test = fetch_20newsgroups(shuffle=True, subset='test')
predicted = text_clf.predict(twenty_test.data)

# Accuracy: fraction of test documents whose predicted label matches the true one.
(predicted == twenty_test.target).mean()

0.7738980350504514

## Support Vector Machine (linear, trained with SGD)

In [30]:
from sklearn.linear_model import SGDClassifier
# Linear SVM (hinge loss) trained with stochastic gradient descent on TF-IDF features.
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21, so passing it
# raises a TypeError on current versions. `max_iter=5` together with `tol=None`
# (no early stopping) runs the same fixed five epochs as the old `n_iter=5`.
text_clf_svm = Pipeline([
    ('vect_svm', CountVectorizer()),
    ('tfidf_svm', TfidfTransformer()),
    ('clf_svm', SGDClassifier(loss='hinge', alpha=1e-3, max_iter=5, tol=None, random_state=42)),
])
text_clf_svm.fit(twenty_train.data, twenty_train.target)



Pipeline(memory=None,
     steps=[('vect_svm', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        s...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [31]:
# Score the SVM pipeline on the held-out test split.
predicted_svm = text_clf_svm.predict(twenty_test.data)
(predicted_svm == twenty_test.target).mean()

0.8238183749336165

## Random Forest Classifier

In [34]:
from sklearn.ensemble import RandomForestClassifier
# Random forest on TF-IDF features.
# `max_features='auto'` was deprecated in scikit-learn 1.1 and removed in 1.3.
# For classifiers it meant sqrt(n_features), so 'sqrt' is the equivalent,
# version-independent spelling.
text_clf_rf = Pipeline([
    ('vect_rf', CountVectorizer()),
    ('tfidf_rf', TfidfTransformer()),
    ('clf_rf', RandomForestClassifier(
        n_estimators=300,
        max_features='sqrt',
        criterion="entropy",
        random_state=0,
        oob_score=False,
    )),
])
text_clf_rf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect_rf', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...stimators=300, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False))])

In [35]:
# Score the random-forest pipeline on the held-out test split.
predicted_rf = text_clf_rf.predict(twenty_test.data)
(predicted_rf == twenty_test.target).mean()

0.6992830589484864

## GridSearchCV for the Naive Bayes Classifier

In [36]:
from sklearn.model_selection import GridSearchCV
# Search space for the Naive Bayes pipeline. Keys use the
# "<pipeline step>__<parameter>" form so each one targets a single step.
parameter = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}


In [37]:
# Run the search over every combination in `parameter`, fitting on the training
# split; fit() returns the fitted search object itself.
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameter).fit(
    twenty_train.data, twenty_train.target
)

In [38]:
# Hyper-parameter combination that achieved the best cross-validated score.
# (A bare `gs_clf.best_score_` line used to precede this one, but only the last
# expression of a cell is displayed, so its value was silently discarded.)
gs_clf.best_params_

{'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [39]:
# Best mean cross-validated score found by the search. The search was fitted on
# the training split only, so this is not directly comparable to the test-set
# accuracies computed with twenty_test.
gs_clf.best_score_

0.9067526957751458

In [None]:
import nltk
from nltk.stem.snowball import SnowballStemmer

# Fetch only the corpus that is needed. A bare `nltk.download()` opens the
# interactive downloader and blocks until it is closed, which prevents the
# notebook from running unattended (Restart & Run All).
# `ignore_stopwords=True` below makes the stemmer read NLTK's stopword list.
nltk.download('stopwords', quiet=True)

# English Snowball stemmer that leaves stopwords unstemmed.
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant that stems each token with the module-level ``stemmer``."""

    def build_analyzer(self):
        """Return a callable that runs the parent analyzer and stems every token it yields."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # Stemming is applied after the parent analyzer has produced its tokens.
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer
    
# Vectorizer that drops English stop words and stems the remaining tokens.
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

# Naive Bayes pipeline on stemmed tokens, with a uniform class prior
# (fit_prior=False). fit() returns the fitted pipeline itself.
text_mnb_stemmed = Pipeline([
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('mnb', MultinomialNB(fit_prior=False)),
]).fit(twenty_train.data, twenty_train.target)

predicted_mnb_stemmed = text_mnb_stemmed.predict(twenty_test.data)

# Accuracy on the held-out test split.
(predicted_mnb_stemmed == twenty_test.target).mean()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
