In [1]:
from collections import Counter
import pandas as pd
import spacy

nlp = spacy.load("pl_core_news_md")

In [2]:
corpus = ["Przetwarzanie tekstu to świetna zabawa", 
          "Lubię pracować z tekstem", 
          "Python świetnie nadaje się do przetwarzania tekstu"]
doc_corpus = list(nlp.pipe(corpus, disable=["ner"]))
norm_corpus = [[token.lemma_ for token in doc if token.is_alpha] for doc in doc_corpus]

In [3]:
stopwords = set(pd.read_csv("https://raw.githubusercontent.com/bieli/stopwords/master/polish.stopwords.txt", 
                            header=None).values[:,0])
sw_norm_corpus = [[token for token in doc if token not in stopwords] for doc in norm_corpus]

In [4]:
def make_dtm(corpus, uniq_tokens):
    out = []
    for text in corpus:
        c = Counter(text)
        dt = []
        for token in uniq_tokens:
            dt.append(c.get(token, 0))
        out.append(dt)
    return out

uniq_tokens = list(set([word for text in sw_norm_corpus for word in text]))
bow_dtm = make_dtm(sw_norm_corpus, uniq_tokens)

# Reprezentacja tekstu

In [5]:
pd.DataFrame(bow_dtm, columns=uniq_tokens)

Unnamed: 0,lubić,zabawa,pracować,tekst,świetnie,python,nadawać,przetwarzanie,świetny
0,0,1,0,1,0,0,0,1,1
1,1,0,1,1,0,0,0,0,0
2,0,0,0,1,1,1,1,1,0


```
pip install scikit-learn
```

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
corpus

['Przetwarzanie tekstu to świetna zabawa',
 'Lubię pracować z tekstem',
 'Python świetnie nadaje się do przetwarzania tekstu']

In [8]:
vect = CountVectorizer()

In [9]:
dtm = vect.fit_transform(corpus)

In [10]:
dtm

<3x14 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [11]:
dtm.toarray()

array([[0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0],
       [0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1]])

In [12]:
pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,do,lubię,nadaje,pracować,przetwarzania,przetwarzanie,python,się,tekstem,tekstu,to,zabawa,świetna,świetnie
0,0,0,0,0,0,1,0,0,0,1,1,1,1,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0
2,1,0,1,0,1,0,1,1,0,1,0,0,0,1


In [13]:
sw_norm_corpus

[['przetwarzanie', 'tekst', 'świetny', 'zabawa'],
 ['lubić', 'pracować', 'tekst'],
 ['python', 'świetnie', 'nadawać', 'przetwarzanie', 'tekst']]

In [14]:
vect = CountVectorizer()

In [15]:
dtm = vect.fit_transform(sw_norm_corpus)

AttributeError: 'list' object has no attribute 'lower'

In [16]:
vect = CountVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x)

In [17]:
dtm = vect.fit_transform(sw_norm_corpus)

In [18]:
pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,lubić,nadawać,pracować,przetwarzanie,python,tekst,zabawa,świetnie,świetny
0,0,0,0,1,0,1,1,0,1
1,1,0,1,0,0,1,0,0,0
2,0,1,0,1,1,1,0,1,0


## TF-IDF

**tf** - term frequency -> BoW <br>
**idf** - inverse document frequency -> $idf(t) = 1 + log(\frac{1 + n_{doc}}{1 + df(t)})$

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
vect = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x)

In [21]:
dtm = vect.fit_transform(sw_norm_corpus)

In [22]:
pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,lubić,nadawać,pracować,przetwarzanie,python,tekst,zabawa,świetnie,świetny
0,0.0,0.0,0.0,0.444514,0.0,0.345205,0.584483,0.0,0.584483
1,0.652491,0.0,0.652491,0.0,0.0,0.385372,0.0,0.0,0.0
2,0.0,0.504611,0.0,0.38377,0.504611,0.298032,0.0,0.504611,0.0
