# Compression

In [1]:
import gzip

# Sample inputs for the compression-size comparisons in this section.
txt_1, txt_2 = (
    "hello world",                    # short phrase
    "some text some text some text",  # one fragment repeated three times
)

In [2]:
# Size in bytes of txt_1 after gzip compression (the text on its own).
len(gzip.compress(txt_1.encode("utf-8")))

31

In [3]:
# Size in bytes of txt_2 after gzip compression (the text on its own).
len(gzip.compress(txt_2.encode("utf-8")))

33

In [4]:
# Compressed size of the two different texts joined by a single space.
len(gzip.compress(f"{txt_1} {txt_2}".encode("utf-8")))

43

In [5]:
# Compressed size of txt_1 joined with a copy of itself (single space between).
len(gzip.compress(f"{txt_1} {txt_1}".encode("utf-8")))

34

In [6]:
# Compressed size of txt_2 joined with a copy of itself (single space between).
len(gzip.compress(f"{txt_2} {txt_2}".encode("utf-8")))

33

# Tie breaking

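The original code always selects the lowest label in case of a tie, regardless of the order of the neighbors. `max` returns the first maximal element it encounters, and here it iterates over `set(top_k_class)`, which discards the original list order (for the small integer labels used here, CPython iterates the set in ascending order):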

In [7]:
# Two neighbor labels with one vote each, i.e. a tie; label 0 comes first.
top_k_class = [0, 1]
# Majority vote as in the original code: the label with the highest count
# among the distinct labels. The recorded output for this tie is 0.
max(set(top_k_class), key=top_k_class.count)

0

In [8]:
# Same tie as before, but with label 1 listed first.
top_k_class = [1, 0]
# Majority vote as in the original code. The recorded output is still 0,
# so the order of the list does not decide the tie.
max(set(top_k_class), key=top_k_class.count)

0

In [9]:
# Three-way tie: every label has exactly one vote, label 1 is listed first.
top_k_class = [1, 0, 2]
# Majority vote as in the original code. The recorded output is 0, the
# lowest label, not the first label in the list.
max(set(top_k_class), key=top_k_class.count)

0

We can prevent this using `Counter`: `most_common()` orders labels with equal counts by the order in which they were first encountered (Python 3.7+), so the first label in the list wins a tie. If the labels are sorted by distance, this ensures that a tie is resolved in favor of the closest neighbor, which is a more reasonable choice than always selecting the lowest label:

In [10]:
from collections import Counter

In [11]:
# Tie between two labels; label 0 appears first in the list.
top_k_class = [0, 1]

# Ask only for the single most common (label, count) pair and keep its label;
# among equal counts the first-encountered label is returned.
Counter(top_k_class).most_common(1)[0][0]

0

In [12]:
# Same tie, but now label 1 appears first in the list.
top_k_class = [1, 0]

# Ask only for the single most common (label, count) pair and keep its label;
# among equal counts the first-encountered label is returned.
Counter(top_k_class).most_common(1)[0][0]

1

In [16]:
# Three-way tie: each label has one vote, label 1 appears first.
top_k_class = [1, 2, 0]

# Ask only for the single most common (label, count) pair and keep its label;
# among equal counts the first-encountered label is returned.
Counter(top_k_class).most_common(1)[0][0]

1

In [14]:
# No tie here: label 2 has two votes, labels 1 and 0 have one each.
top_k_class = [1, 0, 2, 2]

# Ask only for the single most common (label, count) pair and keep its label;
# a strict majority wins regardless of its position in the list.
Counter(top_k_class).most_common(1)[0][0]

2

### Count vectors

In [19]:
import numpy as np

# Count vectors for two texts (both have identical counts in this example).
text_1 = np.array([0.0, 3.0, 1.0])
text_2 = np.array([0.0, 3.0, 1.0])

# Normalize each vector so that its entries sum to 1.
text_1 = text_1 / text_1.sum()
text_2 = text_2 / text_2.sum()

print(text_1, text_2, sep="\n")

# Add the two normalized vectors, then renormalize the sum for display.
added = text_1 + text_2

print(added / added.sum())

[0.   0.75 0.25]
[0.   0.75 0.25]
[0.   0.75 0.25]
