In [2]:
import spacy

In [21]:
import pandas as pd
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 6
df = pd.read_csv("workout_buddy/train_data.csv")
df

Unnamed: 0,Question,Category
0,What are some effective exercises for building strong glutes?,EXERCISE
1,How often should I change my workout routine for optimal results?,TRAINING
2,What's the proper form for a barbell squat?,EXERCISE
...,...,...
227,Details on recent tricep overhead extensions: 12kg x 12 x 3,ACTION
228,Recent seated leg curls: 35kg x 15 x 3 details?,ACTION
229,Outline recent dumbbell hammer curls: 10kg x 12 x 3,ACTION


In [25]:
from typing import Set, List, Tuple
import spacy

nlp = spacy.load("en_core_web_md")

for index, row in df.iterrows():
    print(row.Question)

# l = set(df.Category)
# print(l)

What are some effective exercises for building strong glutes?
How often should I change my workout routine for optimal results?
What's the proper form for a barbell squat?
Can you recommend a high-intensity interval training (HIIT) workout?
How do I prevent lower back pain during deadlifts?
What's a good pre-workout meal for energy?
What's the best way to increase my running endurance?
Can you suggest a workout plan for fat loss?
What's the recommended rest time between sets for muscle hypertrophy?
What are some ways to stay motivated to exercise regularly?
How do I target the inner chest during my chest workouts?
What's the importance of warming up before a workout?
Can you recommend exercises for improving flexibility?
How do I avoid overtraining and burnout?
What's the difference between free weights and machines for strength training?
What's the best time of day to work out for maximum benefits?
How can I incorporate more protein into my vegetarian diet for muscle building?
What's 

In [27]:
from spacy.tokens import DocBin

def convert(df: pd.DataFrame, outfile:str):
    nlp = spacy.blank("en")
    db = DocBin()

    categories = list(set(df.Category))

    for _, row in df.iterrows():
        doc = nlp.make_doc(row.Question)
        doc.cats = {category: 0 for category in categories}
        doc.cats[row.Category] = 1
        db.add(doc)

    db.to_disk(outfile)



def make_docs(data: List[Tuple[str, str]], target_file: str, cats: Set[str]):
    nlp = spacy.load("en_core_web_md")
    docs = DocBin()
    # Use nlp.pipe to efficiently process a large number of text inputs,
    # the as_tuple arguments enables giving a list of tuples as input and
    # reuse it in the loop, here for the labels
    for doc, label in nlp.pipe(data, as_tuples=True):
        # Encode the labels (assign 1 the subreddit)
        for cat in cats:
            doc.cats[cat] = 1 if cat == label else 0
        docs.add(doc)
    docs.to_disk(target_file)
    return docs

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(df["Question"].values, df["Category"].values, test_size=0.3)
categories = list(set(df.Category))

make_docs(list(zip(X_train, y_train)), "train.spacy", cats=categories)
make_docs(list(zip(X_valid, y_valid)), "valid.spacy", cats=categories)

<spacy.tokens._serialize.DocBin at 0x27540db7690>

In [30]:
from spacy.cli.train import train as spacy_train
config_path = "spacy_textcat/config.cfg"
output_model_path = "output/workout_buddy"
spacy_train(
    config_path,
    output_path=output_model_path,
    overrides={
        "paths.train": "train.spacy",
        "paths.dev": "valid.spacy",
    },
)

[38;5;2m✔ Created output directory: output\workout_buddy[0m
[38;5;4mℹ Saving to output directory: output\workout_buddy[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.19        0.00    0.00
 11     200         16.14       57.48    0.57
 26     400          2.99       72.94    0.73
 43     600          1.12       82.98    0.83
 65     800          0.56       83.34    0.83
 91    1000          0.32       82.89    0.83
123    1200          0.19       82.89    0.83
162    1400          0.13       84.15    0.84
209    1600          0.08       84.15    0.84
265    1800          0.06       84.92    0.85
332    2000          0.04       84.24    0.84
417    2200          0.03       83.75    0.84
517    2400          0.02       83.75    0.84
617    2600          0.01       83.