Skip to content

Commit

Permalink
RFC: Use stanza model for Finnish
Browse files Browse the repository at this point in the history
  • Loading branch information
rominf committed May 10, 2024
1 parent 82bda29 commit ad4c5a2
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 3 deletions.
8 changes: 6 additions & 2 deletions docker/PythonDockerfileDev
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ RUN apt-get update -y \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Install torch from the PyTorch CPU wheel index (no CUDA build).
# NOTE(review): presumably installed ahead of spacy-stanza so that stanza's
# torch requirement is already satisfied by the CPU wheel — confirm.
# --no-cache-dir matches the other pip install step and keeps pip's download
# cache out of the image layer.
RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu

RUN pip install -U --no-cache-dir \
setuptools \
wheel \
Expand All @@ -22,6 +24,8 @@ RUN pip install -U --no-cache-dir \
bottle \
#spacy
spacy \
#stanza integration for spacy
spacy-stanza \
#chinese reading
pinyin \
#subtitle file parser
Expand All @@ -33,7 +37,6 @@ RUN python3 -m spacy download de_core_news_sm \
&& python3 -m spacy download nb_core_news_sm \
&& python3 -m spacy download es_core_news_sm \
&& python3 -m spacy download nl_core_news_sm \
&& python3 -m spacy download fi_core_news_sm \
&& python3 -m spacy download fr_core_news_sm \
&& python3 -m spacy download it_core_news_sm \
&& python3 -m spacy download sv_core_news_sm \
Expand All @@ -48,5 +51,6 @@ RUN python3 -m spacy download de_core_news_sm \
&& python3 -m spacy download pt_core_news_sm \
&& python3 -m spacy download ro_core_news_sm \
&& python3 -m spacy download sl_core_news_sm \
&& python3 -m spacy download xx_ent_wiki_sm
&& python3 -m spacy download xx_ent_wiki_sm \
&& python3 -c 'import stanza; stanza.download("fi")'

3 changes: 2 additions & 1 deletion tools/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import shutil
import subprocess
from newspaper import Article
import spacy_stanza

# create empty spaCy models
multi_nlp = None
Expand Down Expand Up @@ -122,7 +123,7 @@ def getTokenizerDoc(language, words):
if language == 'finnish':
global finnish_nlp
if finnish_nlp == None:
finnish_nlp = spacy.load("fi_core_news_sm", disable = ['ner', 'parser'])
finnish_nlp = spacy_stanza.load_pipeline("fi", processors="tokenize,lemma")
finnish_nlp.add_pipe("custom_sentence_splitter", first=True)
doc = finnish_nlp(words)

Expand Down

0 comments on commit ad4c5a2

Please sign in to comment.