In [2]:
import spacy

# Load spaCy's small Danish pipeline.
nlp = spacy.load('da_core_news_sm')

# Keep a handle on the pipeline's tokenizer component.
tokenizer = nlp.tokenizer

# Show the tokens of a sample sentence as a list.
[token for token in tokenizer('Politiet har givet borgerne råd')]

[Politiet, har, givet, borgerne, råd]

In [15]:
def tokenizer(text):
    """Tokenize ``text`` into filtered lemmas using the module-level spaCy pipeline ``nlp``.

    A token's lemma is kept when all of the following hold:

    * the lemma is at least 3 characters long,
    * the token's POS tag is PROPN, ADJ, NOUN or VERB,
    * the lemma is not in the stop word list (spaCy defaults plus custom ones).

    Args:
        text: The raw string to process (e.g. a single tweet).

    Returns:
        list[str]: The lemmas of the retained tokens, in document order.
    """
    custom_stops = {""}  # context-specific stop words
    # A set gives constant-time membership checks for every token below.
    stop_words = set(nlp.Defaults.stop_words) | custom_stops

    # POS tags to keep: proper nouns, adjectives, nouns and verbs.
    pos_tags = {'PROPN', 'ADJ', 'NOUN', 'VERB'}
    min_lemma_length = 3  # lemmas shorter than this are skipped

    return [
        word.lemma_
        for word in nlp(text)
        if len(word.lemma_) >= min_lemma_length
        and word.pos_ in pos_tags
        and word.lemma_ not in stop_words
    ]

# Try the custom tokenizer on a sample sentence.
tokenizer('Politiet har givet borgerne råd')

['politi', 'give', 'borger', 'råd']

In [16]:
# Three short Danish example sentences used as the corpus for the vectorizers.
texts = [
    'Vi er utroligt beærede',
    'Vi vil gerne dele prisen med alle',
    'Det skal vi have gjort op med.',
]

In [17]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words: build the vectorizer, then fit it on the corpus and count terms per document.
vectorizer = CountVectorizer()
transformed_documents = vectorizer.fit_transform(texts)

# Convert the fitted result to an array.
transformed_documents_as_array = transformed_documents.toarray()

# Wrap the counts in a data frame with one column per vocabulary term.
count_df = pd.DataFrame(
    data=transformed_documents_as_array,
    columns=vectorizer.get_feature_names_out(),
)

count_df

Unnamed: 0,alle,beærede,dele,det,er,gerne,gjort,have,med,op,prisen,skal,utroligt,vi,vil
0,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0
1,1,0,1,0,0,1,0,0,1,0,1,0,0,1,1
2,0,0,0,1,0,0,1,1,1,1,0,1,0,1,0


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF: build the vectorizer, then fit it on the corpus and weight terms per document.
vectorizer = TfidfVectorizer()
transformed_documents = vectorizer.fit_transform(texts)

# Convert the fitted result to an array.
transformed_documents_as_array = transformed_documents.toarray()

# Wrap the weights in a data frame with one column per vocabulary term.
tfidf_df = pd.DataFrame(
    data=transformed_documents_as_array,
    columns=vectorizer.get_feature_names_out(),
)

tfidf_df

Unnamed: 0,alle,beærede,dele,det,er,gerne,gjort,have,med,op,prisen,skal,utroligt,vi,vil
0,0.0,0.546454,0.0,0.0,0.546454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.546454,0.322745,0.0
1,0.410747,0.0,0.410747,0.0,0.0,0.410747,0.0,0.0,0.312384,0.0,0.410747,0.0,0.0,0.242594,0.410747
2,0.0,0.0,0.0,0.410747,0.0,0.0,0.410747,0.410747,0.312384,0.410747,0.0,0.410747,0.0,0.242594,0.0


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF with the custom tokenizer function instead of the default regex-based one.
# token_pattern=None: the default pattern is not used once a tokenizer is supplied,
# and passing None avoids scikit-learn's warning about the ignored parameter.
# NOTE(review): by default TfidfVectorizer lowercases the text before it reaches the
# tokenizer (lowercase=True) — confirm this is intended, since the tokenizer filters on POS tags.
vectorizer = TfidfVectorizer(tokenizer=tokenizer, token_pattern=None)
transformed_documents = vectorizer.fit_transform(texts)  # fit on the corpus and transform it

# Convert the fitted result to an array.
transformed_documents_as_array = transformed_documents.toarray()

# Wrap the weights in a data frame with one column per retained lemma.
tfidf_df = pd.DataFrame(transformed_documents_as_array, columns=vectorizer.get_feature_names_out())

tfidf_df

Unnamed: 0,beære,dele,pris
0,1.0,0.0,0.0
1,0.0,0.707107,0.707107
2,0.0,0.0,0.0
