<a href="https://colab.research.google.com/github/simon-clematide/colab-notebooks-for-teaching/blob/main/text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Simple Text Classification Pipeline with sklearn
(taken from https://scikit-learn.org/stable/auto_examples/semi_supervised/plot_semi_supervised_newsgroups.html)

In [68]:
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

np.set_printoptions(precision=3)

# Loading dataset containing first five categories
categories = [
        "alt.atheism",
        "comp.graphics",
        "comp.os.ms-windows.misc",
        "comp.sys.ibm.pc.hardware",
        "comp.sys.mac.hardware",
    ]

# https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html
data = fetch_20newsgroups(
    subset="train",
    categories=categories,
)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names), data.target_names)

# labels are already encoded numerically
print("Example:")
print(data.data[0], data.target[0])

# Hyperparameters
sdg_params = dict(alpha=1e-5, penalty="l2", loss="log_loss")
vectorizer_params = dict(ngram_range=(1, 2), min_df=5, max_df=0.8)

# Supervised Pipeline
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        ("clf", SGDClassifier(**sdg_params)),
    ]
)


2823 documents
5 categories ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']
Example:
From: hades@coos.dartmouth.edu (Brian V. Hughes)
Subject: Re: New Apple Ergo-Mouse
Reply-To: hades@Dartmouth.Edu
Organization: Dartmouth College, Hanover, NH
Disclaimer: Personally, I really don't care who you think I speak for.
Moderator: Rec.Arts.Comics.Info
Lines: 19

nwcs@utkvx.utk.edu (Schizophrenia means never being alone) writes:

>Does anyone know how to open up the Apple Ergo-Mouse (ADB Mouse II)?
>Mine lives near a cat (true, really...) and picks up her fur.  From what
>I can tell, it looks like Apple welded it shut.

    You must not have tried very hard. I just opend mine in about 2
seconds. Take a look on the bottom, it has a dial that turns to open
much like the older ADB mouses used to have. It's a bit harder to turn
at first but it is quite simple to open.

>Also, does anyone know about installing FPUs in a Mac LC III?  I'

In [46]:
# inspect pipeline
pipeline

In [64]:
def train_predict_eval(clf, X_train, y_train, X_test, y_test, categories):
    print("Number of training samples:", len(X_train))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("Number of test samples:", len(y_pred))
    print(
        "Micro-averaged F1 score on test set: %0.3f"
        % f1_score(y_test, y_pred, average="micro")
    )
    print(classification_report(y_test, y_pred, target_names=categories))
    print(confusion_matrix(y_test, y_pred, normalize='all',))
    print(y_test, y_pred)
    return y_pred

In [65]:
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y)

print("Supervised SGDClassifier on 100% of the data:")
train_predict_eval(pipeline, X_train, y_train, X_test, y_test, data.target_names)


Supervised SGDClassifier on 100% of the data:
Number of training samples: 2117
Number of test samples: 706
Micro-averaged F1 score on test set: 0.898
                          precision    recall  f1-score   support

             alt.atheism       1.00      0.99      1.00       130
           comp.graphics       0.89      0.90      0.89       146
 comp.os.ms-windows.misc       0.83      0.94      0.88       131
comp.sys.ibm.pc.hardware       0.86      0.81      0.83       149
   comp.sys.mac.hardware       0.94      0.87      0.90       150

                accuracy                           0.90       706
               macro avg       0.90      0.90      0.90       706
            weighted avg       0.90      0.90      0.90       706

[[0.183 0.001 0.    0.    0.   ]
 [0.    0.187 0.013 0.006 0.001]
 [0.    0.004 0.174 0.006 0.001]
 [0.    0.016 0.016 0.17  0.01 ]
 [0.    0.003 0.008 0.017 0.184]]
[0 3 4 3 2 2 0 2 3 2 2 4 4 4 2 1 1 3 3 2 2 4 4 3 3 3 4 0 3 1 2 3 0 2 3 2 0
 4 4 1 4 3 0

array([0, 3, 4, 1, 2, 3, 0, 2, 3, 2, 2, 4, 4, 4, 2, 1, 1, 3, 3, 1, 2, 4,
       4, 3, 3, 3, 4, 0, 3, 1, 2, 3, 0, 2, 3, 2, 0, 4, 4, 1, 4, 3, 0, 1,
       4, 4, 0, 1, 2, 0, 4, 0, 2, 2, 4, 2, 0, 2, 3, 0, 1, 0, 2, 4, 3, 4,
       4, 0, 2, 2, 1, 0, 0, 4, 4, 2, 0, 1, 1, 3, 0, 4, 0, 4, 1, 2, 2, 0,
       3, 4, 0, 2, 1, 0, 0, 1, 1, 1, 1, 3, 2, 2, 4, 2, 3, 3, 2, 4, 2, 0,
       0, 1, 4, 0, 1, 2, 1, 2, 0, 3, 0, 2, 4, 4, 0, 2, 1, 3, 3, 0, 0, 2,
       4, 3, 3, 2, 3, 0, 0, 0, 4, 4, 4, 1, 2, 3, 0, 3, 3, 1, 3, 1, 3, 2,
       2, 3, 2, 3, 3, 3, 4, 3, 2, 3, 2, 2, 1, 2, 0, 3, 2, 0, 2, 3, 3, 4,
       4, 4, 0, 4, 1, 4, 4, 4, 1, 1, 2, 2, 4, 3, 1, 3, 1, 1, 1, 4, 3, 2,
       0, 3, 3, 2, 1, 1, 0, 2, 2, 2, 1, 1, 1, 1, 4, 2, 3, 1, 1, 1, 3, 2,
       0, 3, 3, 3, 4, 2, 2, 3, 2, 0, 1, 3, 2, 0, 1, 1, 4, 0, 0, 0, 4, 1,
       4, 1, 4, 0, 0, 2, 4, 4, 2, 4, 1, 2, 0, 1, 4, 1, 1, 1, 4, 2, 3, 1,
       2, 4, 3, 2, 0, 3, 2, 3, 2, 0, 3, 3, 3, 0, 2, 1, 4, 1, 1, 2, 1, 4,
       0, 2, 3, 3, 0, 1, 3, 1, 0, 4, 2, 1, 2, 3, 3,

Further ideas
 - Confusion Heatmat https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ConfusionMatrixDisplay.html


In [49]:
 -

SyntaxError: ignored