In [6]:
%cat dvc.yaml

# DVC pipeline definition with a single stage, `multiclass`.
stages:
  multiclass:
    # Command that DVC runs for this stage.
    cmd: python multiclass_pipe.py
    # Declared inputs: the pipeline script itself and the feather dataset
    # (the dataset path is relative to this file's directory).
    deps:
      - multiclass_pipe.py
      - ../../../data/mtg.feather
    # Declared output artifact.
    # NOTE(review): presumably the serialized model written by the script -- confirm.
    outs:
      - multiclass_example.sav
    # Parameter subtree tracked for this stage; the values live under
    # `preprocessing.ngrams` in params.yaml.
    params:
      - preprocessing.ngrams
    # Metrics file for this stage; `cache: false` marks it as not stored in the DVC cache.
    metrics:
      - metrics.json:
          cache: false


In [7]:
%cat params.yaml

# Parameters read by multiclass_pipe.py as params["preprocessing"]["ngrams"].
preprocessing:
  ngrams:
    # NOTE(review): presumably the smallest/largest n-gram sizes used for text
    # vectorization -- confirm against how the script consumes `ngrams`.
    smallest: 1
    largest: 2


In [9]:
%cat metrics.json

{"precision":0.796564367,"recall":0.7966616085,"f1-score":0.7964695552,"support":2636.0}

In [10]:
!dvc exp diff exp-1fbbd

Path          Metric     exp-1fbbd    workspace    Change
metrics.json  f1-score   0.75669      0.79647      0.039775
metrics.json  precision  0.75665      0.79656      0.03991
metrics.json  recall     0.75683      0.79666      0.039833

Path         Param                         exp-1fbbd    workspace    Change
params.yaml  preprocessing.ngrams.largest  1            2            1

In [5]:
%cat multiclass_pipe.py

import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import numpy as np
import yaml
import pandas as pd
import pickle

# Load the experiment parameters from params.yaml in the working directory.
with open("params.yaml", mode="r") as fd:
    params = yaml.safe_load(fd)

# Pull out the n-gram settings stored under the "preprocessing" section.
preprocessing_params = params["preprocessing"]
ngrams = preprocessing_params["ngrams"]

# Alternatives of the token pattern, tried left to right at each position.
_TOKEN_ALTERNATIVES = (
    r"(?:\#[\w\d]+\b)",  # '#' followed by one or more word characters
    r"(?:\b\w[\/\&]\w)\b",  # two word characters joined by '/' or '&', e.g. "w/o"
    r"(?:\b\w[\w\'\d]+)\b",  # words of 2+ characters; apostrophes allowed after the first
    r"(?:\{\w\})",  # mana
    r"(?:[+-]\d\d?(?:/[+-]\d\d?)?)",  # tokens
)

# Compiled tokenizer: the alternatives above joined with '|' into one pattern.
tokenize = re.compile("|".join(_TOKEN_ALTERNATIVES))

df = (
    pd.read_feather("../../../data/mtg.feather")
    .drop_duplicates(  # <-- the path above will need to change for your notebook location
        subset=["name"]
    )
    .assign(
        color_singles=lambda df: df["color_identity"]
        .where(df["color_identity"].str.len() == 1, "")
        .str[0]
    )[["text", "flavor_text", "color_singles"]]
    .dropna()
