In [29]:
import pickle
import numpy as np
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Business Classifier
**Goal:** Create a classifer using the sklearn.datasets fetch_20newsgroups capability.

**Background:** In lieu of having a pre-labeled dataset of business websites versus non-business websites this is the best way to create a similar classification capability. 

**Approach:** I'm going to start by trying out a couple of different classifiers to see which one has the best accuracy scores.

In [30]:
#Populate the training data
newsgroups_train = fetch_20newsgroups(subset='train')

In [31]:
#Let's see what the data actually looks like
newsgroups_train.data[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [32]:
#Create a Term-Frequency, Inverse Document Frequency word vector
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(11314, 130107)

In [33]:
#Let's grab our test data
newsgroups_test = fetch_20newsgroups(subset='test')
vectors_test = vectorizer.transform(newsgroups_test.data)

#Start with Multinomial Naive Bayes
nb_clf = MultinomialNB(alpha=0.1)
nb_clf.fit(vectors, newsgroups_train.target)

pred = nb_clf.predict(vectors_test)

In [34]:
#prints accuracy metrics for this model
def print_accuracy_scores(model, X_test, y_test):
    model_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, model_pred)
    #print(f'Model Accuracy: {round(accuracy, 4)*100}')
    print('Accuracy: %f' % accuracy)
    precision = precision_score(y_test, model_pred, average='macro')
    print('Precision: %f' % precision)
    # recall: tp / (tp + fn)
    recall = recall_score(y_test, model_pred, average='macro')
    print('Recall: %f' % recall)
    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(y_test, model_pred, average='macro')
    print('F1 score: %f' % f1)
    #auc = roc_auc_score(y_test, model_pred)
    #print('ROC AUC: %f' % auc)

In [35]:
print_accuracy_scores(nb_clf, vectors_test, newsgroups_test.target)

Accuracy: 0.826341
Precision: 0.839394
Recall: 0.813973
F1 score: 0.813889


In [38]:
def show_top10_words(classifier, vectorizer, categories):
    feature_names = np.asarray(vectorizer.get_feature_names())
    for i, category in enumerate(categories):
        top10 = np.argsort(classifier.coef_[i])[-10:]
        cat = "\033[0;37;41m" + category + "\033[0m"
        print("%s: %s" % (cat, " ".join(feature_names[top10])))

In [39]:
show_top10_words(nb_clf, vectorizer, newsgroups_train.target_names)

[0;37;41malt.atheism[0m: keith it and you in that is to of the
[0;37;41mcomp.graphics[0m: edu in for it is and graphics of to the
[0;37;41mcomp.os.ms-windows.misc[0m: file for of and edu is it to the windows
[0;37;41mcomp.sys.ibm.pc.hardware[0m: card ide is of it drive and scsi to the
[0;37;41mcomp.sys.mac.hardware[0m: in it is and of edu apple mac to the
[0;37;41mcomp.windows.x[0m: it mit in motif and is of window to the
[0;37;41mmisc.forsale[0m: shipping offer of 00 to and edu the for sale
[0;37;41mrec.autos[0m: that is you it in of and to car the
[0;37;41mrec.motorcycles[0m: dod you it com in of and bike to the
[0;37;41mrec.sport.baseball[0m: that is baseball and of in to he edu the
[0;37;41mrec.sport.hockey[0m: ca game he team and hockey of in to the
[0;37;41msci.crypt[0m: chip that encryption is and clipper key of to the
[0;37;41msci.electronics[0m: for edu you it in is and of to the
[0;37;41msci.med[0m: edu pitt that it in and is to of the
[0;37;41msc

**The main topics for the categories above do not look right. In reading through the documentation I think I will need to remove headers, footers and quotes from the dataset**

In [40]:
#remove headers, footers and quotes from datasets
newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'))

#create a new vectorizer that removes english stop words and remove 
#words that appear in more than 50% of websites.
vectorizer = TfidfVectorizer(max_df=0.5, stop_words='english')
vectors = vectorizer.fit_transform(newsgroups_train.data)

nb_clf = MultinomialNB(alpha=.01)
nb_clf.fit(vectors, newsgroups_train.target)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [41]:
#Generate predictions on test data
vectors_test = vectorizer.transform(newsgroups_test.data)
pred = nb_clf.predict(vectors_test)

In [42]:
print_accuracy_scores(nb_clf, vectors_test, newsgroups_test.target)

Accuracy: 0.784121
Precision: 0.788334
Recall: 0.773601
F1 score: 0.773819


In [43]:
show_top10_words(nb_clf, vectorizer, newsgroups_train.target_names)

[0;37;41malt.atheism[0m: islam atheists say just religion atheism think don people god
[0;37;41mcomp.graphics[0m: looking format 3d know program file files thanks image graphics
[0;37;41mcomp.os.ms-windows.misc[0m: card problem thanks driver drivers use files dos file windows
[0;37;41mcomp.sys.ibm.pc.hardware[0m: monitor disk thanks pc ide controller bus card scsi drive
[0;37;41mcomp.sys.mac.hardware[0m: know monitor does quadra simms thanks problem drive apple mac
[0;37;41mcomp.windows.x[0m: using windows x11r5 use application thanks widget server motif window
[0;37;41mmisc.forsale[0m: asking email sell price condition new shipping offer 00 sale
[0;37;41mrec.autos[0m: don ford new good dealer just engine like cars car
[0;37;41mrec.motorcycles[0m: don just helmet riding like motorcycle ride bikes dod bike
[0;37;41mrec.sport.baseball[0m: braves players pitching hit runs games game baseball team year
[0;37;41mrec.sport.hockey[0m: league year nhl games season players

**Results above look way better after removing stop words and all footers and headers**

**I will now run a few more models to see if I can improve on the accuracy score**

# Model Evaluation

In [44]:
#Random Forest Classifier
rf = RandomForestClassifier()
rf.fit(vectors, newsgroups_train.target);
pred = rf.predict(vectors_test)



In [45]:
print_accuracy_scores(rf, vectors_test, newsgroups_test.target)

Accuracy: 0.575146
Precision: 0.594177
Recall: 0.561490
F1 score: 0.563994


In [46]:
#Decision Tree Classifier
dtclf = DecisionTreeClassifier()
dtclf = dtclf.fit(vectors, newsgroups_train.target)

In [47]:
print_accuracy_scores(dtclf, vectors_test, newsgroups_test.target)

Accuracy: 0.520181
Precision: 0.505519
Recall: 0.508445
F1 score: 0.502841


In [48]:
#LinearSVC
svc = LinearSVC(penalty='l2', dual=False, tol=1e-3)
svc.fit(vectors, newsgroups_train.target)

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
          verbose=0)

In [49]:
#Best results so far
print_accuracy_scores(svc, vectors_test, newsgroups_test.target)

Accuracy: 0.803638
Precision: 0.805141
Recall: 0.793232
F1 score: 0.794410


In [50]:
show_top10_words(svc, vectorizer, newsgroups_train.target_names)

[0;37;41malt.atheism[0m: cruel atheists religion islamic islam nanci deletion bobby motto atheism
[0;37;41mcomp.graphics[0m: cview 3do animation pov tiff 68070 images image 3d graphics
[0;37;41mcomp.os.ms-windows.misc[0m: winqvt w4wg smartdrv mfc ini ax risc win3 cica windows
[0;37;41mcomp.sys.ibm.pc.hardware[0m: irq cmos fastmicro t560i orchid bios 486 gateway ide vlb
[0;37;41mcomp.sys.mac.hardware[0m: c650 duo centris lc adb quadra se powerbook apple mac
[0;37;41mcomp.windows.x[0m: binaries xlib widgets window mit x11r5 xterm server widget motif
[0;37;41mmisc.forsale[0m: pay uhc includes interested asking condition sell shipping offer sale
[0;37;41mrec.autos[0m: autos sho gt toyota dealer vw oil ford cars car
[0;37;41mrec.motorcycles[0m: bmw motorcycles harley riding motorcycle helmet bikes ride dod bike
[0;37;41mrec.sport.baseball[0m: yankees royals ball sox alomar braves cubs phillies stadium baseball
[0;37;41mrec.sport.hockey[0m: ice devils playoffs leafs mas

In [24]:
logreg = LogisticRegression()
logreg.fit(vectors, newsgroups_train.target)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
print_accuracy_scores(logreg, vectors_test, newsgroups_test.target)

Accuracy: 0.792884
Precision: 0.799006
Recall: 0.778382
F1 score: 0.777659


In [26]:
show_top10_words(logreg, vectorizer, newsgroups_train.target_names)

alt.atheism: punishment atheist motto deletion bobby islamic atheists islam religion atheism
comp.graphics: polygon pov cview tiff files format images 3d image graphics
comp.os.ms-windows.misc: win3 risc fonts files drivers driver cica ax file windows
comp.sys.ibm.pc.hardware: bios 486 monitor drive card ide controller bus pc scsi
comp.sys.mac.hardware: nubus powerbook duo simms lc se centris quadra apple mac
comp.windows.x: widgets sun application mit x11r5 xterm widget server motif window
misc.forsale: new interested asking email 00 condition sell shipping offer sale
rec.autos: gt vw auto toyota oil dealer ford engine cars car
rec.motorcycles: motorcycles dog bmw riding helmet motorcycle ride bikes dod bike
rec.sport.baseball: phillies ball cubs pitching stadium hit braves runs year baseball
rec.sport.hockey: puck playoffs leafs players play season nhl team game hockey
sci.crypt: des crypto security chip keys government nsa encryption clipper key
sci.electronics: tv current amp outpu

# Conclusion
After carefully evaluating a few classification models, LinearSVC seems to preform the best. I will now use this to create a class that will be used to run against the common crawl dataset