<a href="https://colab.research.google.com/github/simon-clematide/colab-notebooks-for-teaching/blob/main/sklearn_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Simple Text Classification Pipeline with sklearn
(Adapted from the scikit-learn example at <https://scikit-learn.org/stable/auto_examples/semi_supervised/plot_semi_supervised_newsgroups.html>; only the fully supervised pipeline is used here.)

In [26]:
#import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


# Load the training subset of 20 Newsgroups, restricted to five categories.
categories = [
    "alt.atheism",
    "comp.graphics",
    "comp.os.ms-windows.misc",
    "comp.sys.ibm.pc.hardware",
    "comp.sys.mac.hardware",
]
data = fetch_20newsgroups(
    subset="train",
    categories=categories,
)
# `categories` is passed further down as the display names for the integer
# labels in `data.target`. That is only correct when it lists the names in the
# same order as `data.target_names`, so fail loudly if the two ever diverge
# (e.g. after someone reorders or edits the list above).
assert list(data.target_names) == categories, (
    "categories must be listed in the same order as data.target_names: %r"
    % (list(data.target_names),)
)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print()

# Hyperparameters
# random_state pins the shuffling SGD performs between epochs so that a
# Restart & Run All reproduces the same model.
# NOTE: the name `sdg_params` (sic) is kept as-is in case other cells use it.
sdg_params = dict(alpha=1e-5, penalty="l2", loss="log_loss", random_state=42)
vectorizer_params = dict(ngram_range=(1, 2), min_df=5, max_df=0.8)

# Supervised Pipeline: raw text -> n-gram counts -> tf-idf weights -> linear
# classifier trained with SGD.
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        ("clf", SGDClassifier(**sdg_params)),
    ]
)


def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test, categories):
    """Fit ``clf`` on the training data and print test-set metrics.

    The estimator is fitted in place (``clf.fit``), then used to predict
    ``X_test``. Nothing is returned; all results are written to stdout:
    the sample counts, the micro-averaged F1 score, the per-class
    classification report and the confusion matrix.

    Parameters
    ----------
    clf : estimator or pipeline exposing ``fit`` and ``predict``.
    X_train, y_train : training inputs and their labels.
    X_test, y_test : held-out inputs and their labels.
    categories : class display names handed to ``classification_report``
        as ``target_names``.
    """
    print("Number of training samples:", len(X_train))
    clf.fit(X_train, y_train)

    predictions = clf.predict(X_test)
    print("Number of test samples:", len(predictions))

    micro_f1 = f1_score(y_test, predictions, average="micro")
    print("Micro-averaged F1 score on test set: %0.3f" % micro_f1)

    report = classification_report(y_test, predictions, target_names=categories)
    print(report)

    # normalize="all" scales the matrix by the total number of test samples,
    # so the entries are fractions of the whole test set rather than counts.
    matrix = confusion_matrix(y_test, predictions, normalize="all")
    print(matrix)

2823 documents
5 categories



In [27]:
X, y = data.data, data.target
# Fix the seed so the train/test partition (and therefore every metric printed
# below) is the same on each Restart & Run All. The split proportions are left
# at train_test_split's defaults.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# NOTE: "100% of the data" is the header wording inherited from the original
# example; the model is fitted on the training split only and evaluated on the
# held-out test split.
print("Supervised SGDClassifier on 100% of the data:")
eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test, categories)


Supervised SGDClassifier on 100% of the data:
Number of training samples: 2117
Number of test samples: 706
Micro-averaged F1 score on test set: 0.895
                          precision    recall  f1-score   support

             alt.atheism       0.99      0.99      0.99       110
           comp.graphics       0.88      0.88      0.88       151
 comp.os.ms-windows.misc       0.92      0.88      0.90       154
comp.sys.ibm.pc.hardware       0.84      0.82      0.83       145
   comp.sys.mac.hardware       0.87      0.93      0.90       146

                accuracy                           0.90       706
               macro avg       0.90      0.90      0.90       706
            weighted avg       0.90      0.90      0.90       706

[[0.15439093 0.00141643 0.         0.         0.        ]
 [0.         0.18838527 0.00566572 0.01274788 0.00708215]
 [0.         0.00849858 0.19121813 0.00991501 0.00849858]
 [0.00141643 0.01274788 0.00849858 0.16855524 0.01416431]
 [0.         0.004249