In [1]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import spacy

# Load the spaCy pipeline that every later cell relies on for lemmas,
# stop-word flags and token vectors. The model name suggests spaCy's
# medium-sized Polish news pipeline; it must be installed beforehand.
nlp = spacy.load("pl_core_news_md")

# Podobieństwo tekstu

## Podobieństwo dokumentów

```
wget https://klejbenchmark.com/static/data/klej_psc.zip
unzip klej_psc.zip -d klej_psc
```

In [2]:
# Read only the first six rows of the tab-separated training file so the
# whole notebook stays quick to run; each row holds an extract, a summary
# and a 0/1 label.
df = pd.read_csv(
    "klej_psc/train.tsv",
    sep="\t",
    nrows=6,
)
df

Unnamed: 0,extract_text,summary_text,label
0,Prywatna spółka KrzysztofaToeplitza od siedmiu...,W piątek w wielu uczelniach odbyły się uroczys...,0
1,"Jeżeli HP nie przyjmie propozycji ARiMR, zakoń...",Doszło do konfliktu pomiędzy Agencją Restruktu...,1
2,W latach 90. wielkość rosyjskiego Produktu Kra...,"Trybunał Konstytucyjny orzekł, że posłom i sen...",0
3,"Mit o potopie jest prastary, sięga czasów, gdy...",Dwójka amerykańskich geofizyków przedstawiła s...,1
4,Po zakończeniu obrad każdy poseł ma prawo wygł...,74 posłów AWS-u podpisało wniosek o wotum nieu...,0
5,"Zębami, pazurami i dolarami bronią się związki...",Od kilkunastu lat poziom nauczania w amerykańs...,1


In [3]:
# Parse both text columns with the spaCy pipeline (NER disabled) and keep
# the resulting Doc objects: corpus 1 = extracts, corpus 2 = summaries.
doc_corpus1, doc_corpus2 = (
    list(nlp.pipe(df[column], disable=["ner"]))
    for column in ("extract_text", "summary_text")
)
# Labels, in the same row order as the two corpora.
flags = list(df["label"])

In [4]:
def _content_lemmas(doc):
    """Return the lemmas of the alphabetic, non-stop-word tokens of ``doc``."""
    return [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]


# Normalised corpora: one list of lemmas per document.
norm_corpus1 = [_content_lemmas(doc) for doc in doc_corpus1]
norm_corpus2 = [_content_lemmas(doc) for doc in doc_corpus2]

### Jaccard

In [5]:
def jaccard(doc1, doc2):
    """Return the Jaccard similarity of two token collections.

    Both arguments are converted to sets, so token order and repetitions
    are ignored. The score is ``|A & B| / |A | B|`` and lies in ``[0, 1]``.

    Args:
        doc1: Iterable of hashable tokens (e.g. a list of lemmas).
        doc2: Iterable of hashable tokens.

    Returns:
        float: Size of the intersection divided by the size of the union,
        or ``0.0`` when both inputs are empty.
    """
    tokens1 = set(doc1)
    tokens2 = set(doc2)
    union = tokens1 | tokens2
    if not union:
        # Both documents are empty (e.g. everything was filtered out as
        # stop words): there is no shared content, and dividing by the
        # empty union would raise ZeroDivisionError.
        return 0.0
    return len(tokens1 & tokens2) / len(union)

In [6]:
# Toy example: the two token lists share only "kot" out of five distinct
# tokens, so the expected score is 1 / 5.
x = ["kot", "w", "butach"]
y = ["kot", "bez", "butów"]
jaccard(x, y)

0.2

In [7]:
# Jaccard score of every extract against the summary from the same row.
jaccard_sim = list(map(jaccard, norm_corpus1, norm_corpus2))

In [8]:
# Show each row's label next to its Jaccard score.
list(zip(flags, jaccard_sim))

[(0, 0.013333333333333334),
 (1, 0.09230769230769231),
 (0, 0.0),
 (1, 0.09375),
 (0, 0.04225352112676056),
 (1, 0.15384615384615385)]

In [9]:
# Pick the last normalised extract as the query for a one-vs-all comparison.
selected_doc = norm_corpus1[-1]

In [10]:
# Compare the selected extract with every summary.
[jaccard(selected_doc, summary_lemmas) for summary_lemmas in norm_corpus2]

[0.010869565217391304,
 0.0,
 0.0,
 0.011235955056179775,
 0.010869565217391304,
 0.15384615384615385]

### Cosinus

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# The corpora are already lists of lemmas, so the vectorizer must neither
# preprocess nor tokenise them again: both hooks are identity functions.
# token_pattern=None states explicitly that the default regex is unused;
# leaving it at its default alongside a custom tokenizer makes scikit-learn
# emit a UserWarning on fit.
vect = TfidfVectorizer(
    preprocessor=lambda tokens: tokens,
    tokenizer=lambda tokens: tokens,
    token_pattern=None,
)

In [13]:
# Fit TF-IDF on both corpora at once; the extracts come first in the
# stacked matrix, followed by the summaries.
dtm = vect.fit_transform([*norm_corpus1, *norm_corpus2])

In [14]:
# Pairwise cosine similarity between all rows of the TF-IDF matrix
# (extracts and summaries together).
all_sim = cosine_similarity(dtm)

In [15]:
# Render the TF-IDF similarity matrix as a table for inspection.
pd.DataFrame(all_sim)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.0,0.009419,0.0,0.0,0.008452,0.013616,0.12508,0.0,0.009996,0.033292,0.016871
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.167795,0.025417,0.0,0.0,0.025701
2,0.009419,0.0,1.0,0.019392,0.0,0.037528,0.014121,0.0,0.0,0.026859,0.017264,0.029915
3,0.0,0.0,0.019392,1.0,0.0,0.0,0.017623,0.0,0.0,0.160124,0.0,0.043089
4,0.0,0.0,0.0,0.0,1.0,0.010556,0.01327,0.0,0.023082,0.0,0.064894,0.0
5,0.008452,0.0,0.037528,0.0,0.010556,1.0,0.020415,0.0,0.0,0.018604,0.009738,0.349213
6,0.013616,0.0,0.014121,0.017623,0.01327,0.020415,1.0,0.0,0.0,0.0,0.027931,0.0
7,0.12508,0.167795,0.0,0.0,0.0,0.0,0.0,1.0,0.012815,0.034417,0.032073,0.010112
8,0.0,0.025417,0.0,0.0,0.023082,0.0,0.0,0.012815,1.0,0.0,0.042587,0.0
9,0.009996,0.0,0.026859,0.160124,0.0,0.018604,0.0,0.034417,0.0,1.0,0.0,0.039137


In [16]:
# The similarity matrices are built from the extracts followed by the
# summaries, so document i's extract is row i and its summary is row
# n_pairs + i. Deriving the offset from the corpus size keeps the pairing
# correct if the number of loaded rows changes.
n_pairs = len(norm_corpus1)
idxs = [(i, n_pairs + i) for i in range(n_pairs)]
idxs

[(0, 6), (1, 7), (2, 8), (3, 9), (4, 10), (5, 11)]

In [17]:
# Pull the (extract, summary) entry for each row out of the full matrix.
cos_sim = [all_sim[row, col] for row, col in idxs]

In [18]:
# Show each row's label next to its TF-IDF cosine similarity.
list(zip(flags, cos_sim))

[(0, 0.013615789834667096),
 (1, 0.1677947851062898),
 (0, 0.0),
 (1, 0.16012365568505488),
 (0, 0.06489428267554186),
 (1, 0.34921255708390936)]

In [19]:
# spaCy's built-in document similarity for each (extract, summary) pair.
spacy_sim = [
    extract_doc.similarity(summary_doc)
    for extract_doc, summary_doc in zip(doc_corpus1, doc_corpus2)
]

In [20]:
# Show each row's label next to its spaCy similarity score.
list(zip(flags, spacy_sim))

[(0, 0.8191757812128119),
 (1, 0.827396244184051),
 (0, 0.8624696855667517),
 (1, 0.8993076697228352),
 (0, 0.7899082251076535),
 (1, 0.9379730011285272)]

In [21]:
def _content_vectors(doc):
    """Return the vectors of the alphabetic, non-stop-word tokens of ``doc`` that have one."""
    return [
        token.vector
        for token in doc
        if token.is_alpha and not token.is_stop and token.has_vector
    ]


# One list of token vectors per document, using the same token filter as
# the lemma-based corpora plus the requirement that a vector exists.
vects1 = [_content_vectors(doc) for doc in doc_corpus1]
vects2 = [_content_vectors(doc) for doc in doc_corpus2]

In [22]:
# Collapse each document's token vectors into one vector by averaging
# element-wise. A document with no vectors would average to nan, so every
# document is expected to contribute at least one token vector.
avg_vects1 = [np.mean(token_vectors, axis=0) for token_vectors in vects1]
avg_vects2 = [np.mean(token_vectors, axis=0) for token_vectors in vects2]

In [23]:
# Pairwise cosine similarity of the averaged vectors, extracts first and
# summaries second (the same row layout as the TF-IDF matrix).
all_vect_sim = cosine_similarity([*avg_vects1, *avg_vects2])

In [24]:
# Render the averaged-vector similarity matrix as a table for inspection.
pd.DataFrame(all_vect_sim)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.727065,0.559986,0.456411,0.546333,0.564693,0.654282,0.635772,0.707998,0.552629,0.569953,0.610395
1,0.727065,1.0,0.712642,0.470924,0.629901,0.717068,0.805417,0.825911,0.842312,0.674826,0.728975,0.782894
2,0.559986,0.712642,1.0,0.689905,0.613194,0.721103,0.733334,0.484476,0.749476,0.739505,0.75306,0.745628
3,0.456411,0.470924,0.689905,1.0,0.63464,0.535061,0.53817,0.268442,0.524662,0.804749,0.495772,0.592939
4,0.546333,0.629901,0.613194,0.63464,1.0,0.672023,0.679972,0.441707,0.691945,0.668908,0.659147,0.688095
5,0.564693,0.717068,0.721103,0.535061,0.672023,1.0,0.850444,0.514165,0.835729,0.709892,0.645347,0.910529
6,0.654282,0.805417,0.733334,0.53817,0.679972,0.850444,1.0,0.627489,0.884137,0.721638,0.66625,0.894828
7,0.635772,0.825911,0.484476,0.268442,0.441707,0.514165,0.627489,1.0,0.666621,0.495134,0.523677,0.606073
8,0.707998,0.842312,0.749476,0.524662,0.691945,0.835729,0.884137,0.666621,1.0,0.724549,0.743104,0.895896
9,0.552629,0.674826,0.739505,0.804749,0.668908,0.709892,0.721638,0.495134,0.724549,1.0,0.596961,0.772272


In [25]:
# Pull the (extract, summary) entry for each row out of the full matrix.
vect_sim = [all_vect_sim[row, col] for row, col in idxs]

In [26]:
# Show each row's label next to its averaged-vector cosine similarity.
list(zip(flags, vect_sim))

[(0, 0.6542816),
 (1, 0.8259107),
 (0, 0.7494765),
 (1, 0.80474937),
 (0, 0.6591472),
 (1, 0.910529)]

In [27]:
# Display the spaCy scores once more, presumably so they can be read
# side by side with the averaged-vector scores from the previous cell.
list(zip(flags, spacy_sim))

[(0, 0.8191757812128119),
 (1, 0.827396244184051),
 (0, 0.8624696855667517),
 (1, 0.8993076697228352),
 (0, 0.7899082251076535),
 (1, 0.9379730011285272)]