# Demo notebook for spacy models
SSC, April 2023

This notebook demonstrates how to use the spaCy NLP modelling methods.
If you prepare your data on your Google Drive, you can import from and export to it directly.

In [None]:
# Please ignore this cell: extra install steps that are only executed when running the notebook on Google Colab
# flake8-noqa-cell
import os
if 'google.colab' in str(get_ipython()) and not os.path.isdir('Test_Data'):
    # we're running on colab and we haven't already downloaded the test data
    # first install pinned version of setuptools (latest version doesn't seem to work with this package on colab)
    !pip install setuptools==61 -qqq
    # install the moralization package
    !pip install git+https://github.com/ssciwr/moralization.git -qqq

    # download test data sets
    !wget https://github.com/ssciwr/moralization/archive/refs/heads/test_data.zip -q
    !mkdir -p data && unzip -qq test_data.zip && mv -f moralization-test_data/*_Data ./data/. && rm -rf moralization-test_data test_data.zip
    !spacy download de_core_news_sm
    from google.colab import drive
    drive.mount('/content/drive')

In [None]:
from moralization import DataManager, SpacyModelManager

### Import training data using DataManager

If you need more information about raised warnings, enable debug logging:

```python
import logging
logging.getLogger().setLevel(logging.DEBUG)
```

In [None]:
import logging

# Switch the root logger to DEBUG so that the details behind raised warnings
# become visible in the cell outputs.
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)

In [None]:

# Train on the small test dataset.
# NOTE(review): the Colab setup cell unpacks the datasets under ./data/, so this
# relative path presumably has to be adjusted when running there - confirm.
data_manager = DataManager("../../moralization_data/Test_Data/")
# To train on the full dataset instead, use:
# data_manager = DataManager("/content/data/All_Data/XMI_11")

In [None]:
# List every entry of data_manager.doc_dict by title; len(doc) is reported as
# the document's token count.
for title, doc in data_manager.doc_dict.items():
    print(f"  - {title}: {len(doc)} tokens")

### Create a new spaCy model using SpacyModelManager

- `model_path`: where the SpacyModelManager will store the model and data
- `base_config_file`: spaCy model base config file
  - Base configs can be autogenerated here: https://spacy.io/usage/training#quickstart
  - If not provided, a default with `Language = "german"` and `Components = "spancat"` is used
- `overwrite_existing_files`: enable this to delete any existing files in `model_path`

In [None]:
# Create a model stored under the "my_model" path; overwrite_existing_files=True
# deletes any files that already exist there.
my_model = SpacyModelManager("my_model", overwrite_existing_files=True)

In [None]:
# Show the string representation of the newly created model.
print(my_model)

### Edit metadata

- `metadata` is a dictionary of metadata for the model
- Initially empty apart from the default `pipeline` name and `0.0.0` version
- This metadata will also be used to generate the Model Card on Hugging Face

In [None]:
# Inspect the initial metadata (a bare last expression is displayed by the notebook).
my_model.metadata

In [None]:
# Fill in the descriptive metadata entries one after another, in a fixed order.
for field, value in (
    ("name", "test_pipeline"),
    ("version", "0.1.0"),
    ("description", "A test pipeline for ModelManager testing purposes"),
    ("author", "Liam Keegan"),
    ("email", "liam@keegan.ch"),
    ("url", "https://github.com/ssciwr/moralization"),
    ("license", "MIT"),
):
    my_model.metadata[field] = value

In [None]:
# Display the metadata again to see the values that were just assigned.
my_model.metadata

In [None]:
# Print the model once more now that its metadata has been edited.
print(my_model)

### Train the model

- `data_manager`: the data to be used for training the model
- `overrides`: an optional dictionary of model config settings to override

In [None]:
# Train on the loaded data; the override caps the spaCy config value
# training.max_epochs at 20.
# NOTE(review): check_data_integrity=False presumably skips a data validation
# step before training - confirm against the moralization documentation.
my_model.train(
    data_manager,
    overrides={"training.max_epochs": 20},
    check_data_integrity=False,
)

### Evaluate the model

- `data_manager`: the test data from this data_manager will be used to evaluate the model

In [None]:
# Evaluate the model using the test data provided by data_manager.
my_model.evaluate(data_manager)

### Test the model

In [None]:
# Try the model on a short German example sentence.
my_model.test("Das Wetter ist schön")

### Publish model to Hugging Face

- This requires a Hugging Face [User Access Token](https://huggingface.co/docs/hub/security-tokens)
- You can export this to the `HUGGING_FACE_TOKEN` environment variable and just call `publish()`
- Or directly pass the token with `publish(hugging_face_token="abc123")`

In [None]:
# No token argument is passed, so the token has to be available through the
# HUGGING_FACE_TOKEN environment variable. The return value is kept for display.
urls = my_model.publish()

In [None]:
# Display what publish() returned.
urls

### Load an existing model

An existing model can be loaded from its `model_path`:

In [None]:
# Open the model stored under "my_model" (this time without
# overwrite_existing_files) and print it.
loaded_model = SpacyModelManager("my_model")
print(loaded_model)