#  Text Classification

Algorithms: Multinomial Naïve Bayes, Logistic Regression, Support Vector Machines, Decision Trees

Feature Extractors: CountVectorizer, TD-IDF

In [1]:
#from pprint import pprint
#from time import time
#import loggin|g

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np

## Load Data Set

In [2]:
categories = [
    'alt.atheism',
    'talk.religion.misc'
]

In [3]:
train_data = fetch_20newsgroups(subset='train',categories=categories)
train_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [4]:
test_data = fetch_20newsgroups(subset='test',categories=categories)

## Pipeline with Feature Extractions

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer())
])

### Multinomial Naïve Bayes

In [9]:
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('mnb', MultinomialNB())
])

parameters = {
'mnb__alpha': [91, 0.1, 0.01, 0.001, 0.0001] 
}

MNB_GS = GridSearchCV(pipeline, parameters, n_jobs = -1, scoring='roc_auc')


In [7]:
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'vect', 'tfidf', 'mnb', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'mnb__alpha', 'mnb__class_prior', 'mnb__fit_prior'])

In [8]:
MNB_GS.fit(train_data.data, train_data.target)

GridSearchCV(estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('mnb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'mnb__alpha': [91, 0.1, 0.01, 0.001, 0.0001]},
             scoring='roc_auc')

In [9]:
MNB_GS.best_estimator_.get_params()

{'memory': None,
 'steps': [('vect', CountVectorizer()),
  ('tfidf', TfidfTransformer()),
  ('mnb', MultinomialNB(alpha=0.01))],
 'verbose': False,
 'vect': CountVectorizer(),
 'tfidf': TfidfTransformer(),
 'mnb': MultinomialNB(alpha=0.01),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'mnb__alpha': 0.01,
 'mnb__class_prior': None,
 'mnb__fit_prior': True}

In [10]:
MNB_GS.best_score_

0.9867602339181287

 ### Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression

LRpipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('lr', LogisticRegression())
])

LRparameters = { 'lr__max_iter': (25,50,100,150),
            'lr__C':(0.5,1.0,3.0,4.0)
           
}
lr_GS = GridSearchCV(LRpipeline, LRparameters, n_jobs = -1,scoring='roc_auc')

In [None]:
LRparameters = { 'lr__max_iter': (25,50,100,150),
            'lr__verbose':(0,1,3,5),
           'lr__C':(0.5,1.0,3.0,4.0)
}

In [6]:
lr_GS.fit(train_data.data, train_data.target)

GridSearchCV(estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('lr', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'lr__C': (0.5, 1.0, 3.0, 4.0),
                         'lr__max_iter': (25, 50, 100, 150)},
             scoring='roc_auc')

In [7]:
lr_GS.best_estimator_.get_params()

{'memory': None,
 'steps': [('vect', CountVectorizer()),
  ('tfidf', TfidfTransformer()),
  ('lr', LogisticRegression(C=4.0, max_iter=25))],
 'verbose': False,
 'vect': CountVectorizer(),
 'tfidf': TfidfTransformer(),
 'lr': LogisticRegression(C=4.0, max_iter=25),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'lr__C': 4.0,
 'lr__class_weight': None,
 'lr__dual': False,
 'lr__fit_intercept': True,
 'lr__intercept_scaling': 1,
 'lr__l1_ratio': None,
 'lr__ma

In [8]:
lr_GS.best_score_

0.9856695906432748

### Support Vector Machines

In [11]:
from sklearn.svm import SVC

SVMpipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('svm', SVC())
])

SVMparameters = { 
            'svm__verbose':[0,1,2,3,4,5],
           'svm__C':[0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0],
            'svm__gamma': [1, 0.1, 0.01, 0.001, 0.0001]
}

svm_GS = GridSearchCV(SVMpipeline, SVMparameters, n_jobs = -1, scoring='roc_auc')



In [12]:
svm_GS.fit(train_data.data, train_data.target)

GridSearchCV(estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('svm', SVC())]),
             n_jobs=-1,
             param_grid={'svm__C': [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0],
                         'svm__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'svm__verbose': [0, 1, 2, 3, 4, 5]},
             scoring='roc_auc')

In [13]:
svm_GS.best_estimator_.get_params()

{'memory': None,
 'steps': [('vect', CountVectorizer()),
  ('tfidf', TfidfTransformer()),
  ('svm', SVC(C=4.0, gamma=0.1, verbose=0))],
 'verbose': False,
 'vect': CountVectorizer(),
 'tfidf': TfidfTransformer(),
 'svm': SVC(C=4.0, gamma=0.1, verbose=0),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'svm__C': 4.0,
 'svm__break_ties': False,
 'svm__cache_size': 200,
 'svm__class_weight': None,
 'svm__coef0': 0.0,
 'svm__decision_function_shape': 'ovr',
 'sv

In [14]:
svm_GS.best_score_

0.9879082602339182

### Decision Trees

In [5]:
from sklearn.tree import DecisionTreeClassifier

DTpipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('dt', DecisionTreeClassifier())
])

DTparameters = {
               'dt__max_depth': [3,5,7,10,14,16,20],
               'dt__max_leaf_nodes': [10,20,30,40,50],
               'dt__min_samples_split': [2,4,8,10],
               }

dt_GS = GridSearchCV(DTpipeline, DTparameters, n_jobs = -1,scoring='roc_auc')


In [6]:
dt_GS.fit(train_data.data, train_data.target)

GridSearchCV(estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('dt', DecisionTreeClassifier())]),
             n_jobs=-1,
             param_grid={'dt__max_depth': [3, 5, 7, 10, 14, 16, 20],
                         'dt__max_leaf_nodes': [10, 20, 30, 40, 50],
                         'dt__min_samples_split': [2, 4, 8, 10]},
             scoring='roc_auc')

In [7]:
dt_GS.best_estimator_.get_params()

{'memory': None,
 'steps': [('vect', CountVectorizer()),
  ('tfidf', TfidfTransformer()),
  ('dt', DecisionTreeClassifier(max_depth=20, max_leaf_nodes=20))],
 'verbose': False,
 'vect': CountVectorizer(),
 'tfidf': TfidfTransformer(),
 'dt': DecisionTreeClassifier(max_depth=20, max_leaf_nodes=20),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'dt__ccp_alpha': 0.0,
 'dt__class_weight': None,
 'dt__criterion': 'gini',
 'dt__max_depth': 20,
 'dt__max_features

In [8]:
dt_GS.best_score_

0.8927054093567252