In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.datasets import fetch_20newsgroups
data=fetch_20newsgroups()
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [2]:
train=fetch_20newsgroups(subset='train',categories=data.target_names)
test=fetch_20newsgroups(subset='test',categories=data.target_names)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
vector=CountVectorizer()
X_train=vector.fit_transform(train.data)
X_train.shape

(11314, 130107)

In [4]:
from sklearn.feature_extraction.text import TfidfTransformer
vectorizer=TfidfTransformer()
X_train_tfidf=vectorizer.fit_transform(X_train)
X_train_tfidf.shape
X_test=vector.transform(test.data)
X_test_tfidf=vectorizer.transform(X_test)

In [5]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train_tfidf,train.target)
X_test=vector.transform(test.data)
X_test_tfidf=vectorizer.transform(X_test)


In [6]:
predicted = rf_clf.predict(X_test)
predicted

array([ 6,  1,  0, ...,  9,  3, 15])

In [7]:
from sklearn import metrics
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print("accuracy:",accuracy_score(test.target,predicted))
print(metrics.classification_report(test.target, predicted))

accuracy: 0.7391131173659055
              precision    recall  f1-score   support

           0       0.65      0.65      0.65       319
           1       0.60      0.68      0.64       389
           2       0.66      0.63      0.65       394
           3       0.62      0.60      0.61       392
           4       0.63      0.76      0.69       385
           5       0.85      0.67      0.75       395
           6       0.74      0.90      0.81       390
           7       0.74      0.83      0.78       396
           8       0.92      0.88      0.90       398
           9       0.84      0.88      0.86       397
          10       0.88      0.92      0.90       399
          11       0.81      0.92      0.86       396
          12       0.71      0.42      0.53       393
          13       0.86      0.61      0.72       396
          14       0.85      0.84      0.84       394
          15       0.57      0.94      0.71       398
          16       0.66      0.85      0.74       36

In [8]:
print("confusion matrix:",test.target,predicted)

confusion matrix: [ 7  5  0 ...  9  6 15] [ 6  1  0 ...  9  3 15]


In [11]:
import nltk
nltk.download('stopwords')
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w).lower() for w in analyzer(doc)])
    
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_rfc_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()), 
                             ('rfc', RandomForestClassifier())])

text_rfc_stemmed = text_rfc_stemmed.fit(train.data, train.target)

predicted_rfc_stemmed = text_rfc_stemmed.predict(test.data)
print("accuracy:",accuracy_score(test.target,predicted_rfc_stemmed))

[nltk_data] Downloading package stopwords to /home/shanni/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


accuracy: 0.7851832182687202


In [12]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
class LemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(LemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([lemmatizer.lemmatize(w).lower() for w in analyzer(doc)])
    
lemmed_count_vect = LemmedCountVectorizer(stop_words='english')

text_rfc_stemmed = Pipeline([('vect', lemmed_count_vect), ('tfidf', TfidfTransformer()), 
                             ('mnb',RandomForestClassifier())])

text_rfc_stemmed = text_rfc_stemmed.fit(train.data, train.target)

predicted_rfc_stemmed = text_rfc_stemmed.predict(test.data)
print("accuracy:",accuracy_score(test.target,predicted_rfc_stemmed))

[nltk_data] Downloading package wordnet to /home/shanni/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


accuracy: 0.7869091874668083
