# Use Pre-trained CNN as feature extractor

Use MobileNetV3 as a feature extractor via [embetter](https://github.com/koaning/embetter), a scikit-learn-compatible embedding library, and [timm](https://github.com/rwightman/pytorch-image-models). Then train a logistic regression classifier in scikit-learn (fit incrementally with `SGDClassifier`) on the embeddings.

![](images/feature-extractor.png)

In [1]:
import os

# pip install gitpython
from git import Repo

# Fetch the MNIST PNG dataset only when nothing exists at the target path yet,
# so re-running this cell does not attempt a second clone.
dataset_dir = "mnist-pngs"
if not os.path.exists(dataset_dir):
    Repo.clone_from("https://github.com/rasbt/mnist-pngs", dataset_dir)

In [2]:
import os
import pandas as pd

# Write a shuffled copy of each index CSV (train and test). The "filepath"
# column is prefixed with the clone directory so that the stored paths can be
# opened from the notebook's working directory.
for name in ("train", "test"):

    df = pd.read_csv(f"mnist-pngs/{name}.csv")
    # Vectorized string concatenation instead of a per-row Python lambda.
    df["filepath"] = "mnist-pngs/" + df["filepath"]
    # frac=1 returns every row in random order; the fixed seed keeps that
    # order identical from run to run.
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)
    # `index` is documented as a bool: False leaves the row index out of the file.
    df.to_csv(f"mnist-pngs/{name}_shuffled.csv", index=False)

In [3]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDClassifier
from tqdm.notebook import tqdm

# pip install "embetter[vision]"
from embetter.vision import ImageLoader, TimmEncoder


# Feature extractor: load each image from its file path, then embed it with the
# "mobilenetv3_large_100" timm model. Only `transform` is called on this
# pipeline below, so it is used purely as a fixed feature extractor.
embed = make_pipeline(
    ImageLoader(),
    TimmEncoder(name="mobilenetv3_large_100")
)

# Logistic regression fitted with SGD (loss='log_loss'). A fixed random_state
# makes the classifier's own randomness (e.g. shuffling) repeatable across
# Restart & Run All; 123 matches the seed used to shuffle the CSV files.
model = SGDClassifier(loss='log_loss', n_jobs=-1, shuffle=True, random_state=123)

# Number of CSV rows (images) embedded and fitted per partial_fit call.
chunksize = 1000

# Stream the shuffled training index in chunks so only `chunksize` images are
# embedded at a time. total=60 only sets the progress-bar length
# (presumably 60,000 training rows / chunksize -- adjust if either changes).
for df in tqdm(pd.read_csv("mnist-pngs/train_shuffled.csv", chunksize=chunksize, iterator=True), total=60):

    embedded = embed.transform(df["filepath"])
    # `classes` lists all ten digit labels; scikit-learn requires the full
    # label set on the first partial_fit call.
    model.partial_fit(embedded, df["label"], classes=list(range(10)))

  0%|          | 0/60 [00:00<?, ?it/s]

In [4]:
# True labels and model predictions for the training split, collected chunk by
# chunk for the accuracy report.
train_labels, train_predict = [], []

# Read the shuffled index file, like the training and test loops do: its
# "filepath" column already carries the "mnist-pngs/" prefix, so no per-chunk
# path fix-up is needed. Labels and predictions are appended pairwise, so the
# row order does not affect the resulting accuracy.
# total=60 only sets the progress-bar length (presumably 60,000 rows / chunksize).
for df in tqdm(pd.read_csv("mnist-pngs/train_shuffled.csv", chunksize=chunksize, iterator=True), total=60):

    embedded = embed.transform(df["filepath"])
    train_predict.extend(model.predict(embedded))
    train_labels.extend(list(df["label"].values))

  0%|          | 0/60 [00:00<?, ?it/s]

In [5]:
# True labels and model predictions for the test split, filled chunk by chunk.
test_labels = []
test_predict = []

# Stream the shuffled test index so only `chunksize` images are embedded at once.
# total=10 only sets the progress-bar length.
test_reader = pd.read_csv("mnist-pngs/test_shuffled.csv", chunksize=chunksize, iterator=True)
for test_chunk in tqdm(test_reader, total=10):
    test_features = embed.transform(test_chunk["filepath"])
    test_predict += list(model.predict(test_features))
    test_labels += list(test_chunk["label"].values)

  0%|          | 0/10 [00:00<?, ?it/s]

In [6]:
from sklearn.metrics import accuracy_score

# Fraction of correctly classified images on each split, shown to two decimals.
train_accuracy = accuracy_score(train_labels, train_predict)
test_accuracy = accuracy_score(test_labels, test_predict)

print(f"Train accuracy: {train_accuracy:.2f}")
print(f"Test accuracy: {test_accuracy:.2f}")

Train accuracy: 0.92
Test accuracy: 0.92
