In [1]:
#Problem Setup/Definition:
import numpy as np
np.random.seed(42)
import random
random.seed(42)
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfTransformer
from string import punctuation
from string import digits
from nltk.corpus import wordnet
from sklearn.feature_extraction import text, stop_words
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.decomposition import NMF
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.svm import LinearSVC
import math 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from tempfile import mkdtemp
from shutil import rmtree
from sklearn.externals.joblib import Memory
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import TruncatedSVD
from nltk import SnowballStemmer
import re
import pandas as pd
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

%matplotlib inline



In [2]:
def my_custom_preprocessor(doc_string):
    # do all data preprocessing here
    
    # Lower case
    doc_string=doc_string.lower()
    
    # Remove Numbers
    remove_digits = str.maketrans('', '', digits)
    doc_string.translate(remove_digits)
    
    # Convert to tokenized form....
    tokens = nltk.tokenize.word_tokenize(doc_string)
    # Iterate through list of tokens (words) and remove all numbers
    tokens = [word for word in tokens if word.isalpha()]
    # Iterate through list of tokens (words) and stem (shorten) each word
    port_stemmer = PorterStemmer()
    tokens = [port_stemmer.stem(words) for words in tokens ]
    
    ###############################
    #### Lemmatize with pos_tag ###
    ###############################
    
    lemmatizer = WordNetLemmatizer()
    
    # Convert between two different tagging schemes
    def change_tags(penntag):
        morphy_tag = {'NN':'n', 'JJ':'a',
                      'VB':'v', 'RB':'r'}
        try:
            return morphy_tag[penntag[:2]]
        except:
            return 'n'
        
    tokens = [lemmatizer.lemmatize(word.lower(), pos=change_tags(tag)) for word, tag in pos_tag(tokens)]
    
    # Rejoin List of tokens and return that single document-string
    return ' '.join(tokens)

###########################
#### RoC Curve Function ###
###########################

def plot_roc(fpr, tpr):
    fig, ax = plt.subplots()

    roc_auc = auc(fpr,tpr)

    ax.plot(fpr, tpr, lw=2, label= 'area under curve = %0.4f' % roc_auc)

    ax.grid(color='0.7', linestyle='--', linewidth=1)

    ax.set_xlim([-0.1, 1.1])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate',fontsize=15)
    ax.set_ylabel('True Positive Rate',fontsize=15)

    ax.legend(loc="lower right")

    for label in ax.get_xticklabels()+ax.get_yticklabels():
        label.set_fontsize(15)

def fit_predict_and_plot_roc(pipe, train_data, train_label, test_data, test_label):
    pipe.fit(train_data, train_label)

    if hasattr(pipe, 'decision_function'):
        prob_score = pipe.decision_function(test_data)
        fpr, tpr, _ = roc_curve(test_label, prob_score)
    else:
        prob_score = pipe.predict_proba(test_data)
        fpr, tpr, _ = roc_curve(test_label, prob_score[:,1])

    plot_roc(fpr, tpr)
    
#####################################################
#### Define Custom stop words for CountVectorizer ###
#####################################################

stop_words_skt = text.ENGLISH_STOP_WORDS
stop_words_en = stopwords.words('english')
combined_stopwords = set.union(set(stop_words_en),set(punctuation),set(stop_words_skt))

# Run stop_words through the same pre-processor as the document-matrix
# This will apply stemmed/lemmatized stop_woirds to stemmed/lemmatized tokenized document lists
def process_stop_words(stop_word_set):
    doc_string = ' '.join(stop_word_set)
    return my_custom_preprocessor(doc_string).split()

################################
#### Estimator Helper Class  ###
################################

class EstimatorSelectionHelper:

    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X,y)
            self.grid_searches[key] = gs    

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params,**d})

        rows = []
        for k in self.grid_searches:
            print(k)
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            for i in range(self.grid_searches[k].cv):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]        
                scores.append(r.reshape(len(params),1))

            all_scores = np.hstack(scores)
            for p, s in zip(params,all_scores):
                rows.append((row(k, s, p)))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

##################################
#### Import Dataset Train/Test ###
##################################

# Only take a specific selection (4) of the 20 available categories
categories = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
                'misc.forsale', 'soc.religion.christian']

# Load a training & test data sets consisting of those 8 categories
train_dataset = fetch_20newsgroups(subset = 'train', categories = categories, shuffle = True, random_state = None)
test_dataset = fetch_20newsgroups(subset = 'test', categories = categories, shuffle = True, random_state = None)

# Categories are mapped 0-4, (0-3) = Comp, (4-7) = Recreation
# 0: comp.sys.ibm.pc.hardware
# 1: comp.sys.mac.hardware
# 2: misc.forsale
# 3: soc.religion.christian

In [3]:
###############################
#### Trim Data Beforehand?? ###
###############################
## Using Discussion section code snippets
wnl = nltk.wordnet.WordNetLemmatizer()

def penn2morphy(penntag):
    """ Converts Penn Treebank tags to WordNet. """
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return 'n'
    
def lemmatize_sent(list_word):
    # Text input is string, returns array of lowercased strings(words).
    return [wnl.lemmatize(word.lower(), pos=penn2morphy(tag)) 
            for word, tag in pos_tag(list_word)]

analyzer = CountVectorizer().build_analyzer()
def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

def stem_rmv_punc(doc):
    return (word for word in lemmatize_sent(analyzer(doc)) if word not in combined_stopwords and not word.isdigit())

In [4]:
###############################
#### Naive Bayes Multiclass ###
###############################
# Define the CountVectorizer = document-term matrix
train_count_vectorizer = CountVectorizer(min_df=3,analyzer=stem_rmv_punc, stop_words='english')
test_count_vectorizer = CountVectorizer(min_df=3,analyzer=stem_rmv_punc, stop_words='english')

# Fit + count the train_doc_term_matrix: setting the vocabulary (word-features) based on words found in the train_dataset
train_doc_term_matrix = train_count_vectorizer.fit_transform(train_dataset.data)

# Fit the test_count_doc_term_matric to train_dataset words; then count the occurence of those words in the test_dataset
test_count_vectorizer.fit(train_dataset.data)
test_count_doc_term_matrix = test_count_vectorizer.transform(test_dataset.data)

# Start TD-DIF Transform process; created TDIF matrix with train_doc_term_matrix vocabulary; apply test_dataset transform
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_doc_term_matrix)
test_tfidf = tfidf_transformer.fit_transform(test_count_doc_term_matrix)

# Dimensionality Reduction: NMF
nmf_settings = NMF(n_components=50, init='random', random_state=0)
reduced_NMF_train_matrix = nmf_settings.fit_transform(train_tfidf)
reduced_NMF_test_matrix = nmf_settings.transform(test_tfidf)

nmf_settings_components = nmf_settings.components_
print("Shape of reduced TRAIN tf-idf matrix after NMF Dimensionality Reduction (top 50 words): " + str(reduced_NMF_train_matrix.shape))
print("Shape of reduced TEST tf-idf matrix after NMF Dimensionality Reduction (top 50 words): " + str(reduced_NMF_train_matrix.shape))
print("\n\n" + '-'*40 + "\n\n")

# Gaussian Classifier: 'train_dataset.target' Now consists of 4 categories (0-3)
train_gaus_model = GaussianNB().fit(reduced_NMF_train_matrix, train_dataset.target)
predict_gaus_model = train_gaus_model.predict(reduced_NMF_test_matrix)




Shape of reduced TRAIN tf-idf matrix after NMF Dimensionality Reduction (top 50 words): (2352, 50)
Shape of reduced TEST tf-idf matrix after NMF Dimensionality Reduction (top 50 words): (2352, 50)


----------------------------------------




In [5]:
# Plot Gaus Model Statistics:
gaus_accuracy = metrics.accuracy_score(test_dataset.target, predict_gaus_model)
gaus_precision = metrics.precision_score(test_dataset.target, predict_gaus_model, average='macro')
gaus_recall = metrics.recall_score(test_dataset.target, predict_gaus_model, average='macro')
gaus_f1 = 2 * (gaus_precision * gaus_recall) / (predict_gaus_model + gaus_recall)
print("############## Naive Bayes: Gaus Stats ############### \n")
print("Accuracy: " + str(gaus_accuracy))
print("Precision: " + str(gaus_precision))
print("Recall: " + str(gaus_recall))
print("F1-score: "+ str(gaus_f1))
print("\nConfusion Matrix: \n\n" + str(metrics.confusion_matrix(test_dataset.target, predict_gaus_model)))
print("\nMulticlass metrics: ")
print(metrics.classification_report(test_dataset.target, predict_gaus_model, target_names=['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale', 'soc.religion.christian']))


############## Naive Bayes: Gaus Stats ############### 

Accuracy: 0.7559105431309904
Precision: 0.7572224492772439
Recall: 0.7542536907935328
F1-score: [0.41473146 0.65114622 0.41473146 ... 0.30426171 0.41473146 0.65114622]

Confusion Matrix: 

[[276  42  66   8]
 [ 86 222  73   4]
 [ 56  34 295   5]
 [  2   2   4 390]]

Multiclass metrics: 
                          precision    recall  f1-score   support

comp.sys.ibm.pc.hardware       0.66      0.70      0.68       392
   comp.sys.mac.hardware       0.74      0.58      0.65       385
            misc.forsale       0.67      0.76      0.71       390
  soc.religion.christian       0.96      0.98      0.97       398

                accuracy                           0.76      1565
               macro avg       0.76      0.75      0.75      1565
            weighted avg       0.76      0.76      0.75      1565



In [6]:
##################################
#### Linear SVC Multiclass 1v1 ###
##################################
# Use OneVsOneClassifier() to get the correct decision_function() return shape
linear_classifier_1v1 = OneVsOneClassifier(SVC(C=100.0, max_iter=5000, kernel='linear',random_state=0))
fitted_linear_classifier_1v1 = linear_classifier_1v1.fit(reduced_NMF_train_matrix, train_dataset.target)
predicted_linear_classifier_1v1 = fitted_linear_classifier_1v1.predict(reduced_NMF_test_matrix)

# Plot Linear SVC 1v1 Statistics:
linear_1v1__accuracy = metrics.accuracy_score(test_dataset.target, predicted_linear_classifier_1v1)
linear_1v1__precision = metrics.precision_score(test_dataset.target, predicted_linear_classifier_1v1, average='macro')
linear_1v1__recall = metrics.recall_score(test_dataset.target, predicted_linear_classifier_1v1, average='macro')
linear_1v1__f1 = 2 * (linear_1v1__precision * linear_1v1__recall) / (predicted_linear_classifier_1v1 + linear_1v1__recall)
print("############## Linear SVC Multiclass 1v1 ############### \n")
print("Accuracy: " + str(linear_1v1__accuracy))
print("Precision: " + str(linear_1v1__precision))
print("Recall: " + str(linear_1v1__recall))
print("F1-score: "+ str(linear_1v1__f1))
print("\nConfusion Matrix: \n\n" + str(metrics.confusion_matrix(test_dataset.target, predicted_linear_classifier_1v1)))
print("\nMulticlass metrics: ")
print(metrics.classification_report(test_dataset.target, predicted_linear_classifier_1v1, target_names=['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale', 'soc.religion.christian']))


############## Linear SVC Multiclass 1v1 ############### 

Accuracy: 0.8440894568690096
Precision: 0.845718313150106
Recall: 0.8431846061332781
F1-score: [0.50161826 0.77376586 0.50161826 ... 0.37109675 0.50161826 1.69143663]

Confusion Matrix: 

[[310  57  25   0]
 [ 74 284  25   2]
 [ 33  15 342   0]
 [  7   3   3 385]]

Multiclass metrics: 
                          precision    recall  f1-score   support

comp.sys.ibm.pc.hardware       0.73      0.79      0.76       392
   comp.sys.mac.hardware       0.79      0.74      0.76       385
            misc.forsale       0.87      0.88      0.87       390
  soc.religion.christian       0.99      0.97      0.98       398

                accuracy                           0.84      1565
               macro avg       0.85      0.84      0.84      1565
            weighted avg       0.85      0.84      0.84      1565



In [7]:
######################################
#### Linear SVC Multiclass 1vrest ###
#####################################
# Use OneVsRestClassifier() to get the correct decision_function() return shape
linear_classifier_1vrest = OneVsRestClassifier(SVC(C=100.0, max_iter=5000, kernel='linear',random_state=0))
fitted_linear_classifier_1vrest = linear_classifier_1vrest.fit(reduced_NMF_train_matrix, train_dataset.target)
predicted_linear_classifier_1vrest = fitted_linear_classifier_1vrest.predict(reduced_NMF_test_matrix)

# Plot Linear SVC 1vrest Statistics:
linear_1vrest_accuracy = metrics.accuracy_score(test_dataset.target, predicted_linear_classifier_1vrest)
linear_1vrest_precision = metrics.precision_score(test_dataset.target, predicted_linear_classifier_1vrest, average='macro')
linear_1vrest_recall = metrics.recall_score(test_dataset.target, predicted_linear_classifier_1vrest, average='macro')
linear_1vrest_f1 = 2 * (linear_1vrest_precision * linear_1vrest_recall) / (predicted_linear_classifier_1vrest + linear_1vrest_recall)
print("############## Linear SVC Multiclass 1vrest ############### \n")
print("Accuracy: " + str(linear_1vrest_accuracy))
print("Precision: " + str(linear_1vrest_precision))
print("Recall: " + str(linear_1vrest_recall))
print("F1-score: "+ str(linear_1vrest_f1))
print("\nConfusion Matrix: \n\n" + str(metrics.confusion_matrix(test_dataset.target, predicted_linear_classifier_1vrest)))
print("\nMulticlass metrics: ")
print(metrics.classification_report(test_dataset.target, predicted_linear_classifier_1vrest, target_names=['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale', 'soc.religion.christian']))


############## Linear SVC Multiclass 1vrest ############### 

Accuracy: 0.853035143769968
Precision: 0.8522017159752234
Recall: 0.8521182260548735
F1-score: [0.50921915 0.78415795 0.50921915 ... 0.37702717 0.50921915 1.70440343]

Confusion Matrix: 

[[309  56  24   3]
 [ 67 288  26   4]
 [ 27  15 347   1]
 [  4   2   1 391]]

Multiclass metrics: 
                          precision    recall  f1-score   support

comp.sys.ibm.pc.hardware       0.76      0.79      0.77       392
   comp.sys.mac.hardware       0.80      0.75      0.77       385
            misc.forsale       0.87      0.89      0.88       390
  soc.religion.christian       0.98      0.98      0.98       398

                accuracy                           0.85      1565
               macro avg       0.85      0.85      0.85      1565
            weighted avg       0.85      0.85      0.85      1565

