In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.datasets import fetch_20newsgroups
data=fetch_20newsgroups()
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [2]:
train=fetch_20newsgroups(subset='train',categories=data.target_names)
test=fetch_20newsgroups(subset='test',categories=data.target_names)

In [3]:
print(len(train.data))

11314


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
vector=CountVectorizer()
X_train=vector.fit_transform(train.data)
X_train.shape


(11314, 130107)

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer
vectorizer=TfidfTransformer()
X_train_tfidf=vectorizer.fit_transform(X_train)
X_train_tfidf.shape


(11314, 130107)

In [6]:
from sklearn.ensemble import GradientBoostingClassifier

clf=GradientBoostingClassifier(n_estimators=50,verbose=2).fit(X_train_tfidf,train.target)

      Iter       Train Loss   Remaining Time 
         1           2.2026            6.69m
         2           1.9756            6.49m
         3           1.8175            6.32m
         4           1.6980            6.17m
         5           1.6014            6.05m
         6           1.5211            5.92m
         7           1.4468            5.79m
         8           1.3821            5.65m
         9           1.3284            5.51m
        10           1.2773            5.39m
        11           1.2324            5.26m
        12           1.1904            5.12m
        13           1.1540            4.98m
        14           1.1184            4.86m
        15           1.0861            4.73m
        16           1.0549            4.60m
        17           1.0259            4.46m
        18           0.9985            4.33m
        19           0.9755            4.19m
        20           0.9503            4.06m
        21           0.9267            3.93m
        2

In [7]:
X_test=vector.transform(test.data)
X_test_tfidf=vectorizer.transform(X_test)
predicted= clf.predict(X_test_tfidf)

In [8]:
from sklearn import metrics
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print("accuracy:",accuracy_score(test.target,predicted))
print("confusion matrix:",test.target,predicted)
print(classification_report(test.target,predicted,target_names=test.target_names))

accuracy: 0.7399097185342538
confusion matrix: [ 7  5  0 ...  9  6 15] [12 12  0 ...  9  3 15]
                          precision    recall  f1-score   support

             alt.atheism       0.80      0.63      0.70       319
           comp.graphics       0.68      0.66      0.67       389
 comp.os.ms-windows.misc       0.70      0.68      0.69       394
comp.sys.ibm.pc.hardware       0.63      0.71      0.67       392
   comp.sys.mac.hardware       0.78      0.78      0.78       385
          comp.windows.x       0.84      0.62      0.71       395
            misc.forsale       0.82      0.84      0.83       390
               rec.autos       0.87      0.72      0.79       396
         rec.motorcycles       0.90      0.86      0.88       398
      rec.sport.baseball       0.92      0.84      0.88       397
        rec.sport.hockey       0.93      0.86      0.89       399
               sci.crypt       0.90      0.81      0.85       396
         sci.electronics       0.29      0.70 

In [9]:
import nltk
nltk.download('stopwords')
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w).lower() for w in analyzer(doc)])
    
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_bgc_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()), 
                             ('bgc',GradientBoostingClassifier(n_estimators=50,verbose=2))])

text_bgc_stemmed = text_bgc_stemmed.fit(train.data, train.target)

predicted_bgc_stemmed = text_bgc_stemmed.predict(test.data)
print("accuracy:",accuracy_score(test.target,predicted_bgc_stemmed))

[nltk_data] Downloading package stopwords to /home/shanni/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


      Iter       Train Loss   Remaining Time 
         1           2.1888            4.78m
         2           1.9583            4.55m
         3           1.8005            4.40m
         4           1.6757            4.27m
         5           1.5768            4.16m
         6           1.4915            4.06m
         7           1.4182            3.97m
         8           1.3564            3.87m
         9           1.2984            3.77m
        10           1.2519            3.67m
        11           1.2024            3.57m
        12           1.1647            3.47m
        13           1.1230            3.37m
        14           1.0882            3.28m
        15           1.0538            3.18m
        16           1.0245            3.10m
        17           0.9960            3.00m
        18           0.9676            2.91m
        19           0.9413            2.82m
        20           0.9180            2.72m
        21           0.8965            2.63m
        2

In [10]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
class LemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(LemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([lemmatizer.lemmatize(w).lower() for w in analyzer(doc)])
    
lemmed_count_vect = LemmedCountVectorizer(stop_words='english')

text_bgc_stemmed = Pipeline([('vect', lemmed_count_vect), ('tfidf', TfidfTransformer()), 
                             ('bgc', GradientBoostingClassifier(n_estimators=50,verbose=2))])

text_bgc_stemmed = text_bgc_stemmed.fit(train.data, train.target)

predicted_bgc_stemmed = text_bgc_stemmed.predict(test.data)
print("accuracy:",accuracy_score(test.target,predicted_bgc_stemmed))

[nltk_data] Downloading package wordnet to /home/shanni/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


      Iter       Train Loss   Remaining Time 
         1           2.1967            4.46m
         2           1.9696            4.36m
         3           1.8066            4.28m
         4           1.6803            4.18m
         5           1.5785            4.10m
         6           1.4942            4.01m
         7           1.4199            3.91m
         8           1.3577            3.82m
         9           1.3011            3.73m
        10           1.2484            3.63m
        11           1.2026            3.54m
        12           1.1635            3.46m
        13           1.1263            3.37m
        14           1.0887            3.29m
        15           1.0554            3.20m
        16           1.0277            3.10m
        17           1.0005            3.01m
        18           0.9722            2.92m
        19           0.9480            2.83m
        20           0.9252            2.74m
        21           0.9005            2.65m
        2