In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
vect = CountVectorizer(binary=True)
corpus = [
    "I have a German Shepherd",
    "German Shepherd is from Germany",
    "Germans love gossiping",
]
vect.fit(corpus)


CountVectorizer(binary=True)

In [3]:
vocab = vect.vocabulary_
for key in sorted(vocab.keys()):
    print("{}:{}".format(key, vocab[key]))


from:0
german:1
germans:2
germany:3
gossiping:4
have:5
is:6
love:7
shepherd:8


In [4]:
print(vect.transform(["Germany has German Shepherd"]).toarray())


[[0 1 0 1 0 0 0 0 1]]


In [5]:
similarity = cosine_similarity(
    vect.transform(["Germany has German Shepherd"]).toarray(),
    vect.transform(["Germany has Berlin as capital"]).toarray(),
)
print(similarity)


[[0.57735027]]


In [6]:
vectorizer = TfidfVectorizer(binary=True)


In [20]:
e1 = ("Far out, there pealed a whisper of frightful laughter. And again,"
    "closer this time. No human being ever laughed like that—there was no mirth in it,"
    " only hatred and horror and soul-destroying terror. Kane halted. He was not afraid, "
    "but for the second he was almost unnerved. Then, stabbing through that awesome laughter, "
    "came the sound of a scream that was undoubtedly human. Kane started forward, increasing his gait."
    " He cursed the illusive lights and flickering shadows which veiled the moor in the rising moon and "
    "made accurate sight impossible. The laughter continued, growing louder, as did the screams."
    "Then sounded faintly the drum of frantic human feet. Kane broke into a run."
)
e2 = (
    "Then the noise of a terrible and short struggle came clearly through the abysmal silence of the"
    " fen and the footfalls began again, but stumbling and uneven. The screaming continued, "
    "but with a gasping gurgle. The sweat stood cold on Kane's forehead and body. This was "
    "heaping horror on horror in an intolerable manner."
)
e3 = (
    "The moon was up now and the light was better. Kane bent above the body, which lay stark "
    "in its unnamable mutilation, and he shuddered—a rare thing for him, who had seen the deeds of "
    "the Spanish Inquisition and the witch-finders."
)
e4 = (
    "Of one thing Kane was sure: there would be no hunting of him across the dreary moors, "
    "no screaming and fleeing to be dragged down again and again. If he must die he would die "
    "in his tracks, his wounds in front."
)
e5 = (
    "\"Aye, aye!\" muttered old Ezra hurriedly; \"a bad thing, a bad thing! Yet why do you tell this thing to me?\""
)
e6 = (
    "On the instant, at Kane's sharp order, two brawny villagers sprang forward and seized the miser. "
    "They twisted the dagger from his withered hand, and pinioned his arms, shuddering as their fingers "
    "encountered his clammy flesh."
)
e7 = (
    "The sun was sliding down the horizon and old Ezra stared at it with bulging eyes—stared as "
    "if he could not gaze enough. Far out on the moors reared up the great oak tree, like a gibbet, "
    "now only a decaying shell. There Solomon Kane halted."
)
e8 = (
    "Kane spoke a few words to an agile villager. The youth clambered up the rotting bole of the tree and"
    "from a crevice, high up, dragged something that fell with a clatter at the feet of the miser. Ezra went "
    "limp with a terrible shriek."
)
e9 = (
    "They walked away across the fen, and Kane flung a last look at the grotesque form bound to the tree, "
    "seeming in the uncertain light like a great fungus growing to the bole. And suddenly the miser screamed hideously:"
)
e10 = (
    "Nothing could be seen. The moor was an ocean of shadows and the tall grass about them bent in long waves "
    "before the faint wind, breaking the deathly stillness with breathless murmurings."
)


In [21]:
edit_corpus = [e1, e2, e3, e4, e5, e6, e7, e8, e9, e10]
vectorizer.fit(edit_corpus)


TfidfVectorizer(binary=True)

In [22]:
vocabulary = vectorizer.vocabulary_
for key in sorted(vocabulary.keys()):
    print("{}:{}".format(key, vocabulary[key]))


about:0
above:1
abysmal:2
accurate:3
across:4
afraid:5
again:6
agile:7
almost:8
an:9
and:10
andfrom:11
arms:12
as:13
at:14
away:15
awesome:16
aye:17
bad:18
be:19
before:20
began:21
being:22
bent:23
better:24
body:25
bole:26
bound:27
brawny:28
breaking:29
breathless:30
broke:31
bulging:32
but:33
came:34
clambered:35
clammy:36
clatter:37
clearly:38
closer:39
cold:40
continued:41
could:42
crevice:43
cursed:44
dagger:45
deathly:46
decaying:47
deeds:48
destroying:49
did:50
die:51
do:52
down:53
dragged:54
dreary:55
drum:56
encountered:57
enough:58
ever:59
eyes:60
ezra:61
faint:62
faintly:63
far:64
feet:65
fell:66
fen:67
few:68
finders:69
fingers:70
fleeing:71
flesh:72
flickering:73
flung:74
footfalls:75
for:76
forehead:77
form:78
forward:79
frantic:80
frightful:81
from:82
front:83
fungus:84
gait:85
gasping:86
gaze:87
gibbet:88
grass:89
great:90
grotesque:91
growing:92
gurgle:93
had:94
halted:95
hand:96
hatred:97
he:98
heaping:99
hideously:100
high:101
him:102
his:103
horizon:104
horror:105
h

In [23]:
new1 = (
  "Semantic Segmentation is a computer vision task in which the goal is to categorize each pixel in an image"
  "into a class or object. The goal is to produce a dense pixel-wise segmentation map of an image, where each pixel"
)
new2 = (
 "is assigned to a specific class or object. Some example benchmarks for this task are Cityscapes," 
 "PASCAL VOC and ADE20K. Models are usually evaluated with the Mean Intersection-Over-Union (Mean IoU) and Pixel Accuracy metrics."
)


In [24]:
for i in range(10):
    e = edit_corpus[i]
    print(
        cosine_similarity(
            vectorizer.transform([e]).toarray(), vectorizer.transform([new1]).toarray()
        )
    )


[[0.11320528]]
[[0.15002674]]
[[0.18507694]]
[[0.16023786]]
[[0.07796325]]
[[0.02067132]]
[[0.01930123]]
[[0.18295401]]
[[0.11987149]]
[[0.17389661]]


In [25]:
for i in range(10):
    e = edit_corpus[i]
    print(
        cosine_similarity(
            vectorizer.transform([e]).toarray(), vectorizer.transform([new2]).toarray()
        )
    )


[[0.12518641]]
[[0.15492589]]
[[0.13550272]]
[[0.10736392]]
[[0.17454684]]
[[0.04498294]]
[[0.09286443]]
[[0.13017592]]
[[0.10634185]]
[[0.1060747]]
