In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.datasets import fetch_20newsgroups
data=fetch_20newsgroups()
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [2]:
train=fetch_20newsgroups(subset='train',categories=data.target_names)
test=fetch_20newsgroups(subset='test',categories=data.target_names)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
vector=CountVectorizer()
X_train=vector.fit_transform(train.data)
X_train.shape

(11314, 130107)

In [4]:
from sklearn.feature_extraction.text import TfidfTransformer
vectorizer=TfidfTransformer()
X_train_tfidf=vectorizer.fit_transform(X_train)
X_train_tfidf.shape
X_test=vector.transform(test.data)
X_test_tfidf=vectorizer.transform(X_test)


In [5]:
from sklearn.tree import DecisionTreeClassifier
lr_model= DecisionTreeClassifier()
lr_model=lr_model.fit(X_train_tfidf,train.target)

In [6]:
X_test=vector.transform(test.data)
X_test_tfidf=vectorizer.transform(X_test)
predicted= lr_model.predict(X_test_tfidf)

In [7]:
from sklearn import metrics
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print("accuracy:",accuracy_score(test.target,predicted))


accuracy: 0.5529739776951673


In [8]:
print("confusion matrix:",test.target,predicted)

confusion matrix: [ 7  5  0 ...  9  6 15] [ 4 12 15 ...  0 12 15]


In [9]:
print(classification_report(test.target,predicted,target_names=test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.47      0.48      0.47       319
           comp.graphics       0.45      0.43      0.44       389
 comp.os.ms-windows.misc       0.50      0.55      0.52       394
comp.sys.ibm.pc.hardware       0.45      0.42      0.43       392
   comp.sys.mac.hardware       0.49      0.56      0.52       385
          comp.windows.x       0.49      0.46      0.47       395
            misc.forsale       0.66      0.74      0.70       390
               rec.autos       0.62      0.60      0.61       396
         rec.motorcycles       0.73      0.76      0.75       398
      rec.sport.baseball       0.53      0.57      0.55       397
        rec.sport.hockey       0.68      0.67      0.67       399
               sci.crypt       0.73      0.69      0.71       396
         sci.electronics       0.35      0.33      0.34       393
                 sci.med       0.50      0.43      0.46       396
         

In [10]:
import nltk
nltk.download('stopwords')
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w).lower() for w in analyzer(doc)])
    
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_dtc_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()), 
                             ('svc',DecisionTreeClassifier())])

text_dtc_stemmed = text_dtc_stemmed.fit(train.data, train.target)

predicted_dtc_stemmed = text_dtc_stemmed.predict(test.data)
print("accuracy:",accuracy_score(test.target,predicted_dtc_stemmed))

[nltk_data] Downloading package stopwords to /home/shanni/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


accuracy: 0.5699681359532661


In [None]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
class LemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(LemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([lemmatizer.lemmatize(w).lower() for w in analyzer(doc)])
    
lemmed_count_vect = LemmedCountVectorizer(stop_words='english')

text_dtc_stemmed = Pipeline([('vect', lemmed_count_vect), ('tfidf', TfidfTransformer()), 
                             ('svc', DecisionTreeClassifier())])

text_dtc_stemmed = text_dtc_stemmed.fit(train.data, train.target)

predicted_dtc_stemmed = text_dtc_stemmed.predict(test.data)
print("accuracy:",accuracy_score(test.target,predicted_dtc_stemmed))

[nltk_data] Downloading package wordnet to /home/shanni/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
