In [1]:
import joblib
import re
import string

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import MultinomialNB

In [4]:
categories = [
    "alt.atheism",
    "misc.forsale",
    "sci.space",
    "soc.religion.christian",
    "talk.politics.guns",
]

In [5]:
news_group_data = fetch_20newsgroups(
    subset="all", remove=("headers", "footers", "quotes"), categories=categories
)

In [11]:
df = pd.DataFrame(
    dict(
        text=news_group_data["data"],
        target=news_group_data["target"]
    )
)

In [12]:
df["target"] = df.target.map(lambda x: categories[x])

In [13]:
def process_text(text):
    text = str(text).lower()
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", " ", text
    )
    text = " ".join(text.split())
    return text

df["clean_text"] = df.text.map(process_text)

In [14]:
df_train, df_test = train_test_split(df, test_size=0.20, stratify=df.target)

In [15]:
vec = CountVectorizer(
    ngram_range=(1, 3), 
    stop_words="english",
)

X_train = vec.fit_transform(df_train.clean_text)
X_test = vec.transform(df_test.clean_text)

y_train = df_train.target
y_test = df_test.target

In [16]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

preds = nb.predict(X_test)
print(classification_report(y_test, preds))

                        precision    recall  f1-score   support

           alt.atheism       0.96      0.53      0.68       160
          misc.forsale       0.98      0.89      0.93       195
             sci.space       0.88      0.87      0.88       197
soc.religion.christian       0.66      0.97      0.79       200
    talk.politics.guns       0.92      0.90      0.91       182

              accuracy                           0.85       934
             macro avg       0.88      0.83      0.84       934
          weighted avg       0.88      0.85      0.84       934



In [17]:
joblib.dump(nb, "nb.joblib")
joblib.dump(vec, "vec.joblib")

['vec.joblib']

In [18]:
nb_saved = joblib.load("nb.joblib")
vec_saved = joblib.load("vec.joblib")

sample_text = ["Space, Stars, Planets and Astronomy!"]
# Process the text in the same way you did when you trained it!
clean_sample_text = process_text(sample_text)
sample_vec = vec_saved.transform(sample_text)
nb_saved.predict(sample_vec)

array(['sci.space'], dtype='<U22')

In [20]:
news_group_data

{'data': ['At one time there was speculation that the first spacewalk \n(Alexei Leonov ?) was a staged fake.\n\nHas any evidence to support or contradict this claim emerged ?\n\nWas this claim perhaps another fevered Cold War hallucination ?\n',
  'Apple IIgs\nImagewriter II COLOR printer\nColor RGB monitor\n3.5" DRIVE\n5.25" drive\nkeyboard\nMouse\nlots of disks\nsome applications\nmost manuals',
  '\n\nI heard he had asked the FBI to provide him with a word processor.  Does\nanyone know if Koresh has requested that it be WordPerfect5.0?  WP5.0 was\nwritten (and is owned) by Mormons, so the theological implications of\nrequesting (or refusing) WP5.0 are profound!',
  'Coming from a long line of "hot tempered" people, I know temper when I see\nit.  One of the tell tale signs/fruits that give non-christians away - is\nwhen their net replies are acrid, angry and sarcastic.  \n\nWe in the net village do have a laugh or two when professed, born again\nchristians verbally attack people who 