In [1]:
import joblib
import re
import string

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
# The five newsgroups this notebook classifies between.
categories = [
    "alt.atheism", "misc.forsale", "sci.space",
    "soc.religion.christian", "talk.politics.guns",
]

# Parts of each post that the loader is asked to strip out.
stripped_parts = ("headers", "footers", "quotes")

# subset="all" requests every available post for the chosen categories;
# the notebook performs its own train/test split further down.
news_group_data = fetch_20newsgroups(
    subset="all",
    categories=categories,
    remove=stripped_parts,
)

In [4]:
# Decode the integer labels with the dataset's own `target_names` rather than
# the hand-written `categories` list: the loader assigns label indices in its
# own order, so indexing `categories` is only correct while that list happens
# to be written in the same (alphabetical) order.
target_names = news_group_data["target_names"]

# One row per post: the raw text and its newsgroup name.
df = pd.DataFrame(
    dict(
        text=news_group_data["data"],
        target=[target_names[idx] for idx in news_group_data["target"]],
    )
)

In [5]:
def process_text(text):
    """Normalise a piece of text before it is vectorised.

    The input is coerced with ``str()``, lower-cased, every character in
    ``string.punctuation`` is replaced by a space, and runs of whitespace are
    collapsed to single spaces (leading and trailing whitespace is dropped).

    Returns the cleaned string.
    """
    # Translation table sending each ASCII punctuation character to a space.
    punctuation_to_space = str.maketrans(
        string.punctuation, " " * len(string.punctuation)
    )
    lowered = str(text).lower()
    spaced = lowered.translate(punctuation_to_space)
    # split() without arguments splits on any whitespace run and drops empties.
    return " ".join(spaced.split())

# Add a cleaned copy of every post alongside the raw text.
df["clean_text"] = df["text"].map(process_text)

In [6]:
# Stratified 80/20 split on the target column. The seed is fixed so that the
# same rows land in train and test on every run and the metrics reported
# below can be reproduced after a kernel restart.
df_train, df_test = train_test_split(
    df, test_size=0.20, stratify=df.target, random_state=42
)

In [7]:
# Inspect the frame: raw text, category label and cleaned text side by side.
df

Unnamed: 0,text,target,clean_text
0,At one time there was speculation that the fir...,sci.space,at one time there was speculation that the fir...
1,Apple IIgs\nImagewriter II COLOR printer\nColo...,misc.forsale,apple iigs imagewriter ii color printer color ...
2,\n\nI heard he had asked the FBI to provide hi...,alt.atheism,i heard he had asked the fbi to provide him wi...
3,"Coming from a long line of ""hot tempered"" peop...",soc.religion.christian,coming from a long line of hot tempered people...
4,I'm not sure were this thread has been before ...,talk.politics.guns,i m not sure were this thread has been before ...
...,...,...,...
4663,\nOr perhaps David Koresh didn't listen too we...,talk.politics.guns,or perhaps david koresh didn t listen too well...
4664,Howdy! I'm just posting this for a friend so d...,misc.forsale,howdy i m just posting this for a friend so do...
4665,Great SLR camera (Ricoh) for sale. Has all the...,misc.forsale,great slr camera ricoh for sale has all the ni...
4666,\n\n\n\nYou may want to put Hubble back in the...,sci.space,you may want to put hubble back in the payload...


So far, no new data has been introduced. The existing data has been split into a 'visible' set (the training data) and a 'hidden' set (the test data). The model is fitted on the training data only, and its performance is then evaluated on the held-out test data.

In [8]:
# Bag-of-words counts over unigrams, bigrams and trigrams, with English stop
# words removed.
vec = CountVectorizer(stop_words="english", ngram_range=(1, 3))

# Labels for each split.
y_train, y_test = df_train.target, df_test.target

# The vocabulary is learned from the training split only; the test split is
# encoded with that same fitted vocabulary.
X_train = vec.fit_transform(df_train["clean_text"])
X_test = vec.transform(df_test["clean_text"])

In [9]:
# Train a multinomial Naive Bayes classifier on the training counts
# (fit() returns the estimator itself, so it can be chained).
nb = MultinomialNB().fit(X_train, y_train)

# Score the held-out test split and print per-class precision/recall/F1.
preds = nb.predict(X_test)
report = classification_report(y_test, preds)
print(report)

                        precision    recall  f1-score   support

           alt.atheism       0.93      0.54      0.69       160
          misc.forsale       0.95      0.86      0.91       195
             sci.space       0.88      0.88      0.88       197
soc.religion.christian       0.66      0.95      0.78       200
    talk.politics.guns       0.90      0.87      0.89       182

              accuracy                           0.83       934
             macro avg       0.86      0.82      0.83       934
          weighted avg       0.86      0.83      0.83       934



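This model has an overall accuracy of 83% on the held-out test set (this is a measure of how often its predictions are correct, not a confidence score). Performance is uneven across classes: recall for `alt.atheism` is only 0.54 and precision for `soc.religion.christian` is only 0.66.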

In [10]:
# Persist the fitted classifier and the fitted vectorizer side by side; both
# are loaded again later in the notebook to classify new text.
joblib.dump(nb, "nb.joblib")
# As the last expression, this call's return value is what the cell displays.
joblib.dump(vec, "vec.joblib")

['vec.joblib']

Both files (`nb.joblib` and `vec.joblib`) are saved in the notebook's current working directory, in this case `C:\Users\Admin`. They allow us to reload the previously trained model and its fitted vectorizer without having to re-train from scratch.

In [12]:
# Reload the persisted classifier and vectorizer from the working directory.
# NOTE: joblib files are pickle-based; only load files you created or trust.
nb_saved = joblib.load("nb.joblib")
vec_saved = joblib.load("vec.joblib")

sample_text = ["Space, Stars, Planets and Astronomy!"]
# Process the text in the same way you did when you trained it!
# process_text() works on a single string, so it is applied to each document
# in turn; passing the whole list would clean the list's repr instead.
clean_sample_text = [process_text(doc) for doc in sample_text]
# Vectorize the cleaned documents, not the raw ones, so that inference sees
# exactly the same preprocessing as training did.
sample_vec = vec_saved.transform(clean_sample_text)
nb_saved.predict(sample_vec)

array(['sci.space'], dtype='<U22')

In [11]:
pwd

'C:\\Users\\Admin'