<a href="https://colab.research.google.com/github/sayanbanerjee32/danish_nlp/blob/main/sentence_similarity_collab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Danish Sentence similarity

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# import os
# os.chdir('D:/DataScienceWorkSpace/danish_sentence_similarty/src')

In [2]:
# data fie path in local
data_file = 'https://raw.githubusercontent.com/lassehjorthmadsen/data-science-assignment/master/data/sentences.csv'
#'../data/sentences.csv'

In [3]:
# read the sentences in a dataframe
sentence_df = pd.read_csv(data_file)
sentence_df.head()

Unnamed: 0,id,text
0,10-12-176,"Vanddamp er en usynlig gas, der forekommer i s..."
1,10-13-182,Er der nogen herinde der har erfaring med at k...
2,10-14-29,Ved ikke lige hvordan disse er i størrelsen?
3,10-16-39,Dog kan jeg godt lide pang farver;)
4,10-17-297,Pengene bliver dog ofte først udbetalt efter 5...


In [4]:
# target sentence for finding similar senences
target_sentence = sentence_df.loc[sentence_df['id']=='7-21-440','text']
target_sentence.values[0]

'Ifølge Dansk Kennelklub angriber muskelhunde dyr og mennesker cirka hver 14. dag.'

English translation for this sentence - __According to the Danish Kennel Club, muscular dogs attack animals and humans approximately every 14 days.__

## Option 1: Toenization > cleaning > Count vectorizer / TF-IDF vectorizer > Similarity

In [7]:
!pip install clean-text

Collecting clean-text
  Downloading https://files.pythonhosted.org/packages/78/30/7013e9bf37e00ad81406c771e8f5b071c624b8ab27a7984cd9b8434bed4f/clean_text-0.3.0-py3-none-any.whl
Collecting emoji
[?25l  Downloading https://files.pythonhosted.org/packages/24/fa/b3368f41b95a286f8d300e323449ab4e86b85334c2e0b477e94422b8ed0f/emoji-1.2.0-py3-none-any.whl (131kB)
[K     |████████████████████████████████| 133kB 4.9MB/s 
[?25hCollecting ftfy<6.0,>=5.8
[?25l  Downloading https://files.pythonhosted.org/packages/ff/e2/3b51c53dffb1e52d9210ebc01f1fb9f2f6eba9b3201fa971fd3946643c71/ftfy-5.8.tar.gz (64kB)
[K     |████████████████████████████████| 71kB 4.9MB/s 
Building wheels for collected packages: ftfy
  Building wheel for ftfy (setup.py) ... [?25l[?25hdone
  Created wheel for ftfy: filename=ftfy-5.8-cp36-none-any.whl size=45613 sha256=f9b688f9c8806d29db57f523a085af4e05265f85909003268bf24cc8bb3f0f5a
  Stored in directory: /root/.cache/pip/wheels/ba/c0/ef/f28c4da5ac84a4e06ac256ca9182fc34fa57fefff

In [8]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.snowball import DanishStemmer


from cleantext import clean

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [10]:
# nltk.download('punkt')
# nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [35]:
# tokenization function with Danish language Stemmer
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(DanishStemmer().stem(item))
    return stems

# clean text by lowering abd removing email, number, url, currency, punctuation, 
def normalize_text(text):
    return clean(text,
     no_urls=True,                  # replace all URLs with a special token
    no_emails=True,                # replace all email addresses with a special token
    no_phone_numbers=True,         # replace all phone numbers with a special token
    no_numbers=True,               # replace all numbers with a special token
    no_digits=True,                # replace all digits with a special token
    no_currency_symbols=True,      # replace all currency symbols with a special token
    no_punct=True,
    replace_with_punct="",          # instead of removing punctuations you may replace them
    replace_with_url="",
    replace_with_email="",
    replace_with_phone_number="",
    replace_with_number="",
    replace_with_currency_symbol="",
#     replace_with_punct=" ~PUNCT~ ",          # instead of removing punctuations you may replace them
#     replace_with_url=" ~URL~ ",
#     replace_with_email=" ~EMAIL~ ",
#     replace_with_phone_number=" ~PHONE~ ",
#     replace_with_number=" ~NUMBER~ ",
#     replace_with_currency_symbol=" ~CUR~ ",
    lang="de") # german is used as assumed that german would be closeer to Danish that english

# sample output for the target text
tokenize(normalize_text(target_sentence.values[0]))
#normalize_text(target_sentence.values[0])

'iflge dansk kennelklub angriber muskelhunde dyr og mennesker cirka hver dag'

In [12]:
# create a replica of the sentence dataframe for processing
sentence_df_cp_op1 = sentence_df.copy()
sentence_df_cp_op1 = sentence_df_cp_op1.text.apply(normalize_text)
sentence_df_cp_op1

0       vanddamp er en usynlig gas der forekommer i st...
1       er der nogen herinde der har erfaring med at k...
2              ved ikke lige hvordan disse er i strrelsen
3                       dog kan jeg godt lide pang farver
4       pengene bliver dog ofte frst udbetalt efter da...
                              ...                        
4995    i dag i idrt skulle vi sa have bip test hvor j...
4996    p men tnkt nu hvis de bragte mere fra danskspr...
4997                             kathani hejsa allesammen
4998    weeeeeee jeg har mega optur pa de var begge mi...
4999    archon der blev jo ikke sagt noget om hvem der...
Name: text, Length: 5000, dtype: object

### Cosine similarity on count vectorizer

In [36]:
stopwords_da = stopwords.words('danish')
stopwords_da.append('dag')
stopwords_da.append('cirka')
stopwords_da.append('hver')
stopwords_da.append('iflge')
stopwords_da.append('dansk')

In [37]:
# fit count vectorizer with Danish stop words
# use the custom tokenizer with Danish Stemmer
count_vect = CountVectorizer(tokenizer=tokenize, stop_words=stopwords_da)
# fit on cleaned text
count_vect_matrix = count_vect.fit_transform(sentence_df_cp_op1)
# vocab sze 13574
count_vect_matrix.shape

  'stop_words.' % sorted(inconsistent))


(5000, 13570)

In [38]:
# find cosine similarity for the count vector matrix
cosine_matrix_cv = cosine_similarity(count_vect_matrix)

In [39]:
# convert to Data Frame
cosine_df_cv = pd.DataFrame(cosine_matrix_cv, columns = sentence_df['id'], index = sentence_df['id'])

In [40]:
# select the row for target sentence
target_cosine_array = cosine_df_cv.loc['7-21-440',:]
# sort descending
target_cosine_array.sort_values(ascending = False)[:10]

id
7-21-440      1.000000
45-86-368     0.288675
34-2-723      0.235702
40-81-2158    0.218218
36-90-2811    0.204124
35-72-306     0.204124
51-3-3235     0.198030
6-37-308      0.198030
28-58-176     0.192450
42-96-175     0.182574
Name: 7-21-440, dtype: float64

In [41]:
# gather indices for top 10 similar sentences
top_10_similarity = target_cosine_array.sort_values(ascending = False)[:10].index.tolist()

In [42]:
sentence_df.loc[sentence_df.id == top_10_similarity[1],'text'].values[0]

'Hvad er dyrets yndlingsfoder?'

In [43]:
sentence_df.loc[sentence_df.id == top_10_similarity[2],'text'].values[0]

'Hvad er det dog, vi mennesker gør ved dig?'

In [44]:
sentence_df.loc[sentence_df.id == top_10_similarity[3],'text'].values[0]

'Ifølge Henning Otte Hansen må forbrugerne herhjemme nu nok under alle omstændigheder affinde sig med, at maden er relativt dyr:'

In [45]:
sentence_df.loc[sentence_df.id == top_10_similarity[4],'text'].values[0]

'De seneste 10 år har der ifølge Søfartsstyrelsen'

In [46]:
sentence_df.loc[sentence_df.id == top_10_similarity[5],'text'].values[0]

'I livet indførte mennesket som hjerne'

### Cosine similarity on TF-IDF vectorizer

In [None]:
# fit tf-idf vectorizer with Danish stop words
# use the custom tokenizer with Danish Stemmer
tfidf_vect = TfidfVectorizer(tokenizer=tokenize, stop_words=stopwords_da)
# fit on cleaned text
tfidf_vect_matrix = tfidf_vect.fit_transform(sentence_df_cp_op1)
tfidf_vect_matrix.shape

  'stop_words.' % sorted(inconsistent))


(5000, 13571)

In [None]:
# pairwise cosine similarity for tf-idf vector matrix
cosine_matrix_ti = cosine_similarity(tfidf_vect_matrix)

In [None]:
# convert to DF
cosine_df_ti = pd.DataFrame(cosine_matrix_ti, columns = sentence_df['id'], index = sentence_df['id'])

In [None]:
# select row for target sentence
target_cosine_array = cosine_df_ti.loc['7-21-440',:]
# sort descending
target_cosine_array.sort_values(ascending = False)[:10]

id
7-21-440      1.000000
45-86-368     0.205748
38-78-1469    0.191179
34-2-723      0.190701
40-81-2158    0.176440
51-3-3235     0.170464
36-90-2811    0.167261
28-58-176     0.166315
38-92-376     0.160373
51-34-3098    0.154647
Name: 7-21-440, dtype: float64

In [None]:
# Indices for top 10 similr sentences
top_10_similarity = target_cosine_array.sort_values(ascending = False)[:10].index.tolist()

In [None]:
sentence_df.loc[sentence_df.id == top_10_similarity[1],'text'].values[0]

'Hvad er dyrets yndlingsfoder?'

In [None]:
sentence_df.loc[sentence_df.id == top_10_similarity[2],'text'].values[0]

'Men det fik ikke den farlige angriber til at skåne City.'

In [None]:
sentence_df.loc[sentence_df.id == top_10_similarity[3],'text'].values[0]

'Hvad er det dog, vi mennesker gør ved dig?'

In [None]:
sentence_df.loc[sentence_df.id == top_10_similarity[4],'text'].values[0]

'Ifølge Henning Otte Hansen må forbrugerne herhjemme nu nok under alle omstændigheder affinde sig med, at maden er relativt dyr:'

In [None]:
sentence_df.loc[sentence_df.id == top_10_similarity[5],'text'].values[0]

'Men indtil nu er alle teorierne kommet mere eller mindre til kort, da man hele tiden kan finde dyr, der har noget, der ligner menneskelige egenskaber.'

## Option 2: spaCy word embedding vector for sentence similarity

In [None]:
import spacy

In [None]:
# load a spaCy model, depending on language, scale, etc.
nlp = spacy.load("da_core_news_md")

In [None]:
# create replica for cleaned text df from last section
sentence_df_cp_op2 = sentence_df_cp_op1.copy()
sentence_df_cp_op2 = pd.DataFrame(sentence_df_cp_op2.values, columns = ['text'], index = sentence_df['id'])
sentence_df_cp_op2.head() 

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
10-12-176,vanddamp er en usynlig gas der forekommer i st...
10-13-182,er der nogen herinde der har erfaring med at k...
10-14-29,ved ikke lige hvordan disse er i strrelsen
10-16-39,dog kan jeg godt lide pang farver
10-17-297,pengene bliver dog ofte frst udbetalt efter da...


In [None]:
# nlp pipeline for target sentence
base = nlp(sentence_df_cp_op2.loc['7-21-440',:].values[0])

In [None]:
# custom function for similarity scoring between 2 sentences
def calculate_similarity(text2):
    compare = nlp(text2)
    return base.similarity(compare)

In [None]:
# apply on each row
sentence_df_cp_op2['sim_score'] = sentence_df_cp_op2['text'].apply(calculate_similarity)

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# sort descending
sentence_df_cp_op2.sort_values(by = ['sim_score'], ascending = False, inplace = True)

In [None]:
# top 10 similar sentences, index 0 being the sentence itself
top_10 = sentence_df_cp_op2.iloc[:10,:]['text']

In [None]:
top_10[1]

'nar man kommer slbende med vrkende kn og ildrd nse efter timers vandring gr det godt at stte sig til bordet sammen med de andre gster og bliver krset for i en fransk bjerghytte'

In [None]:
top_10[2]

'stadig flere danskere fatter interesse for den kontante sportsgren hvor veltrnede mnd pa langt over kilo kaster sig efter hinanden'

In [None]:
top_10[3]

'det tager i hvert fald en hel dag at overvre indmarchen af de snorlige rkker af musketerer farverige flagkastere grnne skytter krigere med armbrster byens skyttedronning og de forskellige madvogne som helt bogstaveligt smider brd og plser i hovedet pa publikum'

In [None]:
top_10[4]

'og nok stter to unge mnd sig pa fortovscafeen og far en kop kaffe og taler arabisk men det er cafe latte de taler ogsa dansk og de har parkeret en stor kassevogn foran cafeen'

In [None]:
top_10[5]

'hver lrdag klokken mdes en gruppe kinesiske forldre og deres brn i little mermaid chinese culture school der startede som et privat initiativ i mens forldrene sludrer og dyrker tai chi far brnene undervisning i kinesisk sprog og kultur i de lante klasselokaler'

## Option 3: BERT embedding for Danish > Similarity

In [None]:
# BERT based sentence embedding for Danish
from danish_bert_embeddings import DanishBertEmbeddings
embedder = DanishBertEmbeddings()

In [None]:
# sample embedding
embedding = embedder.embed(target_sentence.values[0])

In [None]:
embedding.shape

torch.Size([768])

In [None]:
# convert to embedding for each sentence
sentence_df_embed = sentence_df['text'].apply(embedder.embed)

In [None]:
# convert to np array
sentence_embed_list = [t.numpy() for t in sentence_df_embed]

### Cosine similarity

In [None]:
# pairewise cosine similarity between sentence embeddings
cosine_matrix = cosine_similarity(sentence_embed_list)

In [None]:
# convert to dataframe
cosine_df = pd.DataFrame(cosine_matrix, columns = sentence_df['id'], index = sentence_df['id'])

In [None]:
# select the row for target sentence
target_cosine_array = cosine_df.loc['7-21-440',:]
# sort descending
target_cosine_array.sort_values(ascending = False)[:10]

id
7-21-440      1.000000
38-21-615     0.713109
36-0-2615     0.705893
6-37-308      0.705858
51-3-3235     0.699858
36-7-877      0.694900
40-44-510     0.693219
3-28-67       0.692790
38-59-1673    0.692642
51-34-3098    0.692097
Name: 7-21-440, dtype: float32

In [None]:
# gather indices for top 10 similr sentences
top_10_similarity = target_cosine_array.sort_values(ascending = False)[:10].index.tolist()

In [None]:
sentence_df.loc[sentence_df.id == top_10_similarity[1],'text'].values[0]

'Han er fossilekspert, og foruden at være museumsinspektør ved Geomuseum Faxe er han forsker ved Københavns Universitet:" Over revet var der 200- 400 meter havvand, og Thoracosaurus har- akkurat som nulevende havkrokodiller- jagtet sit bytte i de øvre vandlag.'

In [None]:
sentence_df.loc[sentence_df.id == top_10_similarity[2],'text'].values[0]

'På 24 timer mellem 15. og 16. april 1949 fløj 1. 398 maskiner i alt 12. 849 tons fragt ind til den isolerede storby'

In [None]:
sentence_df.loc[sentence_df.id == top_10_similarity[3],'text'].values[0]

'Hovedårsagen til, at Katrina blev så dyr, at skaderne blev så omfangsrige, er ganske enkelt, at der i dag bor langt flere mennesker i kystområderne end tidligere.'

In [None]:
sentence_df.loc[sentence_df.id == top_10_similarity[4],'text'].values[0]

'Men indtil nu er alle teorierne kommet mere eller mindre til kort, da man hele tiden kan finde dyr, der har noget, der ligner menneskelige egenskaber.'

In [None]:
sentence_df.loc[sentence_df.id == top_10_similarity[5],'text'].values[0]

'Børshandlere er såvidt vides udstyret med samme biologiske profil som alle andre mennesker.'

## Option 4: Sub word tokenization > TF-IDF vercorization > cosine similarity

In [None]:
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/14/67/e42bd1181472c95c8cda79305df848264f2a7f62740995a46945d9797b67/sentencepiece-0.1.95-cp36-cp36m-manylinux2014_x86_64.whl (1.2MB)
[K     |▎                               | 10kB 13.6MB/s eta 0:00:01[K     |▌                               | 20kB 19.1MB/s eta 0:00:01[K     |▉                               | 30kB 12.6MB/s eta 0:00:01[K     |█                               | 40kB 10.8MB/s eta 0:00:01[K     |█▍                              | 51kB 6.9MB/s eta 0:00:01[K     |█▋                              | 61kB 7.4MB/s eta 0:00:01[K     |██                              | 71kB 7.9MB/s eta 0:00:01[K     |██▏                             | 81kB 8.4MB/s eta 0:00:01[K     |██▌                             | 92kB 8.0MB/s eta 0:00:01[K     |██▊                             | 102kB 8.5MB/s eta 0:00:01[K     |███                             | 112kB 8.5MB/s eta 0:00:01[K     |███▎               

In [None]:
import sentencepiece as spm

In [None]:
# file for subword tokenization training
sub_word_file = 'sub_word_training.txt'

In [None]:
# cconsolidate all sentences in a file
all_sentences = '\n'.join(sentence_df['text'])
with open(sub_word_file, 'w') as _file:
    _file.write(all_sentences)

In [None]:
# train sentencepiece model from sub_word_file and makes `m.model` and `m.vocab`
# `m.vocab` is just a reference. not used in the segmentation.
spm.SentencePieceTrainer.train('--input=sub_word_training.txt --model_prefix=m --vocab_size=10000')

In [None]:
# makes segmenter instance and loads the model file (m.model)
sp = spm.SentencePieceProcessor()
sp.load('m.model')

True

In [None]:
# encode: text => id
print(sp.encode_as_pieces(target_sentence.values[0]))
print(sp.encode_as_ids(target_sentence.values[0]))

['▁I', 'følge', '▁Dansk', '▁Kenne', 'l', 'klub', '▁angriber', '▁mu', 'skel', 'hund', 'e', '▁dyr', '▁og', '▁mennesker', '▁cirk', 'a', '▁hver', '▁14', '.', '▁dag', '.']
[52, 458, 360, 3112, 185, 2736, 3224, 2246, 4454, 3534, 11, 1032, 7, 332, 1330, 66, 149, 511, 4, 121, 4]


### Cosine similarity on TF-IDF vectorizer

In [None]:
# fit TF-IDF vectorizer without Danish stop words
# use the sub word tokenizer with Danish Stemmer
tfidf_vect_sw = TfidfVectorizer(tokenizer=sp.encode_as_ids)#, stop_words=stopwords.words('danish'))
# fit on cleaned text
tfidf_vect_matrix_sw = tfidf_vect_sw.fit_transform(sentence_df_cp_op1)
# vocab sze 13574
tfidf_vect_matrix_sw.shape

(5000, 6394)

In [None]:
# find cosine similarity for the count vector matrix
cosine_matrix_ti_sw = cosine_similarity(tfidf_vect_matrix_sw)

In [None]:
# convert to Data Frame
cosine_df_ti_sw = pd.DataFrame(cosine_matrix_ti_sw, columns = sentence_df['id'], index = sentence_df['id'])

In [None]:
# select the row for target sentence
target_cosine_array = cosine_df_ti_sw.loc['7-21-440',:]
# sort descending
target_cosine_array.sort_values(ascending = False)[:10]

id
7-21-440      1.000000
47-2-28       0.214849
51-58-1495    0.178410
34-7-768      0.167955
38-20-1059    0.153800
42-33-69      0.150390
3-55-8        0.148307
6-37-308      0.148256
38-78-1469    0.141425
6-47-994      0.140513
Name: 7-21-440, dtype: float64

In [None]:
# gather indices for top 10 similar sentences
top_10_similarity = target_cosine_array.sort_values(ascending = False)[:10].index.tolist()

In [None]:
sentence_df.loc[sentence_df.id == top_10_similarity[1],'text'].values[0]

'I dag'

In [None]:
sentence_df.loc[sentence_df.id == top_10_similarity[2],'text'].values[0]

'Paa Marken;'

In [None]:
sentence_df.loc[sentence_df.id == top_10_similarity[3],'text'].values[0]

'Hvor forbrugernes valgmuligheder for 20 år siden kunne tælles på en hånd eller to, så kan de i dag vælge mellem cirka 150 forskellige realkreditlån.'

In [None]:
sentence_df.loc[sentence_df.id == top_10_similarity[4],'text'].values[0]

'Jeg ved, at niveauet er højt blandt alle angriberne i Serie A og kræver en toppræstation hver eneste gang.'

In [None]:
sentence_df.loc[sentence_df.id == top_10_similarity[5],'text'].values[0]

'KAN MAN SKÆRE MUGGEN VÆK?'

## Option 5: TOP2VEC for topic clustering with sentence transformer

In [None]:
#!pip install top2vec[sentence_transformers]

Collecting top2vec[sentence_transformers]
  Downloading https://files.pythonhosted.org/packages/6e/b0/7335cdddddd8036c0fc30f8aa6fb2170c7bcaf101fcf384a6dca6aa8dbde/top2vec-1.0.20-py3-none-any.whl
Collecting hdbscan
[?25l  Downloading https://files.pythonhosted.org/packages/22/2f/2423d844072f007a74214c1adc46260e45f034bb1679ccadfbb8a601f647/hdbscan-0.8.26.tar.gz (4.7MB)
[K     |████████████████████████████████| 4.7MB 7.0MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting joblib<1.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/fc/c9/f58220ac44a1592f79a343caba12f6837f9e0c04c196176a3d66338e1ea8/joblib-0.17.0-py3-none-any.whl (301kB)
[K     |████████████████████████████████| 307kB 41.6MB/s 
Collecting sentence-transformers; extra == "sentence_transformers"
[?25l  Downloading https://files.pythonhosted.org/packages/6a/e2/84d6acfcee2d831641497

In [None]:
from top2vec import Top2Vec

In [None]:
# topic modelling using BERT based sentence transformer
topic_model = Top2Vec(list(sentence_df.text),embedding_model='distiluse-base-multilingual-cased',speed="deep-learn")

2021-02-01 15:23:58,104 - top2vec - INFO - Pre-processing documents for training
2021-02-01 15:23:58,470 - top2vec - INFO - Downloading distiluse-base-multilingual-cased model
100%|██████████| 504M/504M [00:55<00:00, 9.12MB/s]
2021-02-01 15:25:05,350 - top2vec - INFO - Creating joint document/word embedding
2021-02-01 15:29:00,761 - top2vec - INFO - Creating lower dimension embedding of documents
2021-02-01 15:29:37,390 - top2vec - INFO - Finding dense areas of documents
2021-02-01 15:29:37,647 - top2vec - INFO - Finding topics


In [None]:
# total number of topics discovered
topic_model.get_num_topics()

39

In [None]:
# num of senteces in topic
topic_sizes, topic_nums = topic_model.get_topic_sizes()
topic_sizes, topic_nums

(array([412, 314, 279, 207, 207, 203, 202, 184, 158, 150, 135, 126, 126,
        126, 122, 122, 115, 115, 113, 110,  99,  98,  97,  93,  92,  91,
         91,  88,  87,  82,  74,  71,  69,  69,  68,  61,  52,  47,  45]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38]))

In [None]:
# 
doc_topic_list = []
for s, n in zip(topic_sizes, topic_nums):
    documents, document_scores, document_ids = topic_model.search_documents_by_topic(topic_num=n, num_docs = s)
    
    for doc, score, doc_id in zip(documents, document_scores, document_ids):
        doc_topic_list.append({"Document_id": doc_id,
                               "Score": score,
                               "text":doc,
                               "topic_id": n})
doc_topic_list[:2]

[{'Document_id': 3906,
  'Score': 0.88263214,
  'text': 'Hansen( DF)',
  'topic_id': 0},
 {'Document_id': 3404, 'Score': 0.87143725, 'text': 'Lao Tzu¹', 'topic_id': 0}]

In [None]:
doc_topc_df = pd.DataFrame(doc_topic_list)
doc_topc_df.sort_values(by = ["Document_id"], ascending = True, inplace= True)
doc_topc_df = doc_topc_df.reset_index(drop=True)
doc_topc_df.head()

Unnamed: 0,Document_id,Score,text,topic_id
0,0,0.171193,"Vanddamp er en usynlig gas, der forekommer i s...",9
1,1,0.164515,Er der nogen herinde der har erfaring med at k...,13
2,2,0.297734,Ved ikke lige hvordan disse er i størrelsen?,3
3,3,0.398476,Dog kan jeg godt lide pang farver;),6
4,4,0.384771,Pengene bliver dog ofte først udbetalt efter 5...,28


In [None]:
doc_topc_df = pd.concat([doc_topc_df, sentence_df[['id']]], axis = 1)


In [None]:
doc_topc_df.loc[doc_topc_df['id']=='7-21-440',:]

Unnamed: 0,Document_id,Score,text,topic_id,id
4659,4659,0.310199,Ifølge Dansk Kennelklub angriber muskelhunde d...,4,7-21-440


In [None]:
topic4_df = doc_topc_df.loc[doc_topc_df['topic_id']==4,:]
topic4_df.sort_values(by = ['Score'], ascending = False, inplace = True) 
topic4_df_sim = topic4_df.loc[(topic4_df['Score'] >= 0.28) & (topic4_df['Score'] <= 0.32),:]
topic4_df_sim.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(11, 5)

In [None]:
topic4_df_sim.text.values

array(['20 pct. af Europas bønder 80 pct. af EU s samlede landbrugsstøtte, og hvis vi ikke får bremset op for den udvikling, ja, så ender det jo med, at der kun er tre gårde tilbage i et sogn.',
       'Enhedslisten har fremsat forslag om, at regeringen skal indgå aftale med Kommunernes Landsforening og Danske Regioner om økologisk indkøb af fødevarer i det offentlige og om omlægning til økologisk produktion på al offentligt ejet jord.',
       'Vores svenske nabo har ratificeret den.',
       'I 900- tallet ankom svenske vikinger for første gang byen Konstantinopel( som de kaldte Miklagård).',
       'Og så ikke mindst et euroforbehold.',
       'Ifølge Dansk Kennelklub angriber muskelhunde dyr og mennesker cirka hver 14. dag.',
       'Indridason er selv nået til bind nummer ni, og nu udkommer bind seks, Manden i søen, som fjerde Erlendur- udgivelse på dansk.',
       'Læreruddannelsen på KDAS, som er del af Professionshøjskolen København, har på hjemmesiden for KLM formuleret fagets

## Option 6: Pre-trained subword token > tf-idf  / count vectorizer > cosine similarity

In [47]:
!pip install bpemb 

Collecting bpemb
  Downloading https://files.pythonhosted.org/packages/91/77/3f0f53856e86af32b1d3c86652815277f7b5f880002584eb30db115b6df5/bpemb-0.3.2-py3-none-any.whl
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/14/67/e42bd1181472c95c8cda79305df848264f2a7f62740995a46945d9797b67/sentencepiece-0.1.95-cp36-cp36m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 7.0MB/s 
Installing collected packages: sentencepiece, bpemb
Successfully installed bpemb-0.3.2 sentencepiece-0.1.95


In [61]:
from bpemb import BPEmb
import re
bpemb_da = BPEmb(lang="da",vs=50000)

In [80]:
def rem_us_token(text):
    return [re.sub(r'\▁','',t) for t in bpemb_da.encode(text)]
rem_us_token(target_sentence.values[0])

['ifølge',
 'dansk',
 'kennel',
 'klub',
 'angriber',
 'muskel',
 'hunde',
 'dyr',
 'og',
 'mennesker',
 'cirka',
 'hver',
 '00.',
 'dag',
 '.']

### Cosine similarity on TF-IDF vectorizer

In [102]:
new_stop_words = [rem_us_token(stop_word)[0] for stop_word in stopwords_da]
# new_stop_words

In [93]:
# fit count vectorizer with Danish stop words
# use the custom tokenizer with Danish Stemmer
tfidf_vect_bpsw = TfidfVectorizer(tokenizer=rem_us_token, stop_words = new_stop_words)
# fit on cleaned text
tfidf_vect_matrix_bpsw = tfidf_vect_bpsw.fit_transform(sentence_df_cp_op1)
# vocab sze 7058
tfidf_vect_matrix_bpsw.shape

(5000, 12937)

In [94]:
# find cosine similarity for the count vector matrix
cosine_matrix_ti_bpsw = cosine_similarity(tfidf_vect_matrix_bpsw)

In [95]:
# convert to Data Frame
cosine_df_ti_bpsw = pd.DataFrame(cosine_matrix_ti_bpsw, columns = sentence_df['id'], index = sentence_df['id'])

In [96]:
# select the row for target sentence
target_cosine_array = cosine_df_ti_bpsw.loc['7-21-440',:]
# sort descending
target_cosine_array.sort_values(ascending = False)[:10]

id
7-21-440      1.000000
34-2-723      0.193555
38-78-1469    0.177428
42-15-38      0.149892
51-34-3098    0.142000
38-32-1285    0.126503
6-37-308      0.125849
36-77-1455    0.123605
38-19-3921    0.120880
34-15-2241    0.119859
Name: 7-21-440, dtype: float64

In [97]:
# gather indices for top 10 similar sentences
top_10_similarity = target_cosine_array.sort_values(ascending = False)[:10].index.tolist()

In [98]:
sentence_df.loc[sentence_df.id == top_10_similarity[1],'text'].values[0]

'Hvad er det dog, vi mennesker gør ved dig?'

In [99]:
sentence_df.loc[sentence_df.id == top_10_similarity[2],'text'].values[0]

'Men det fik ikke den farlige angriber til at skåne City.'

In [100]:
sentence_df.loc[sentence_df.id == top_10_similarity[3],'text'].values[0]

'Derudover er der to hunde, som kan kaldes ind med kort varsel, hvis det bliver nødvendigt.'

In [101]:
sentence_df.loc[sentence_df.id == top_10_similarity[4],'text'].values[0]

'Man kan ikke forvente at mennesker kan tåle at spise planter og dyr fra en fremmed planet, da vi ikke under vores udvikling er blevet udsat for noget sådant.'

In [90]:
sentence_df.loc[sentence_df.id == top_10_similarity[5],'text'].values[0]

'Kunne ikke spørge ham.'

### Cosine similarity on Count vectorizer

In [103]:
# fit count vectorizer with Danish stop words
# use the custom tokenizer with Danish Stemmer
count_vect_bpsw = CountVectorizer(tokenizer=rem_us_token, stop_words = new_stop_words)
# fit on cleaned text
count_vect_matrix_bpsw = count_vect_bpsw.fit_transform(sentence_df_cp_op1)
# vocab sze 7058
count_vect_matrix_bpsw.shape

(5000, 12937)

In [104]:
# find cosine similarity for the count vector matrix
cosine_matrix_count_bpsw = cosine_similarity(count_vect_matrix_bpsw)

In [105]:
# convert to Data Frame
cosine_df_count_bpsw = pd.DataFrame(cosine_matrix_count_bpsw, columns = sentence_df['id'], index = sentence_df['id'])

In [106]:
# select the row for target sentence
target_cosine_array = cosine_df_count_bpsw.loc['7-21-440',:]
# sort descending
target_cosine_array.sort_values(ascending = False)[:10]

id
7-21-440      1.000000
38-32-1285    0.250000
34-2-723      0.204124
28-58-176     0.181902
38-17-2222    0.176777
6-45-1332     0.176777
40-81-2158    0.176777
38-10-1798    0.176777
6-47-638      0.167705
51-34-3098    0.166667
Name: 7-21-440, dtype: float64

In [107]:
# gather indices for top 10 similar sentences
top_10_similarity = target_cosine_array.sort_values(ascending = False)[:10].index.tolist()

In [108]:
sentence_df.loc[sentence_df.id == top_10_similarity[1],'text'].values[0]

'Kunne ikke spørge ham.'

In [109]:
sentence_df.loc[sentence_df.id == top_10_similarity[2],'text'].values[0]

'Hvad er det dog, vi mennesker gør ved dig?'

In [110]:
sentence_df.loc[sentence_df.id == top_10_similarity[3],'text'].values[0]

'Elbiler er for dyre, så længe produktionstallet er lavt, og så længe bilerne er dyre, er de svære at sælge.'

In [111]:
sentence_df.loc[sentence_df.id == top_10_similarity[4],'text'].values[0]

'Måske har du den forkerte læge?'

In [112]:
sentence_df.loc[sentence_df.id == top_10_similarity[5],'text'].values[0]

'Jeg kan kun spørge:'

## Option 7: Byte-pair encoding > embedding vectors > cosine similarity

In [174]:
# get the embedding vectors for each sub-word in a sentence 
bpemb_da_ids = bpemb_da.encode_ids(target_sentence.values[0])
# embedding mean for each sentence with dimension - (1, embedding dim)
bpemb_da.vectors[bpemb_da_ids].mean(axis = 0).shape

(100,)

In [148]:
# apply on the cleaned sentences
bpemb_da_vect_mat = sentence_df_cp_op1.apply(lambda x: bpemb_da.embed(x).mean(axis = 0))
# convert to ndarray
bpemb_da_vect_mat = np.array(bpemb_da_vect_mat.tolist())

  """Entry point for launching an IPython kernel.


In [160]:
# cases where emedding value is NaN
print(np.where(np.isnan(bpemb_da_vect_mat)))
# convert them to 0
bpemb_da_vect_mat_cln = np.nan_to_num(bpemb_da_vect_mat)

(array([ 246,  246,  246, ..., 4211, 4211, 4211]), array([ 0,  1,  2, ..., 97, 98, 99]))


In [162]:
# check few sample values
bpemb_da_vect_mat_cln[4211,1]

0.0

### Cosine similarity

In [163]:
# pairewise cosine similarity between sentence embeddings
cosine_matrix = cosine_similarity(bpemb_da_vect_mat_cln)

In [164]:
# convert to dataframe
cosine_df = pd.DataFrame(cosine_matrix, columns = sentence_df['id'], index = sentence_df['id'])

In [165]:
# select the row for target sentence
target_cosine_array = cosine_df.loc['7-21-440',:]
# sort descending
target_cosine_array.sort_values(ascending = False)[:10]

id
7-21-440      1.000000
7-61-865      0.585707
34-86-122     0.585339
34-89-799     0.579532
6-47-994      0.576989
36-84-3253    0.571422
46-55-17      0.570974
36-75-1782    0.569270
38-13-1403    0.566970
51-87-5225    0.561458
Name: 7-21-440, dtype: float32

In [166]:
# gather indices for top 10 similr sentences
top_10_similarity = target_cosine_array.sort_values(ascending = False)[:10].index.tolist()

In [167]:
sentence_df.loc[sentence_df.id == top_10_similarity[1],'text'].values[0]

'En hund er et rovdyr.'

In [168]:
sentence_df.loc[sentence_df.id == top_10_similarity[2],'text'].values[0]

'De danske håndbold- musketerer bruger ikke kun bold og harpiks.'

In [169]:
sentence_df.loc[sentence_df.id == top_10_similarity[3],'text'].values[0]

'Eller 1350 kroner for hver af de otte deltagere.'

In [170]:
sentence_df.loc[sentence_df.id == top_10_similarity[4],'text'].values[0]

'I dag er der ca.'

In [171]:
sentence_df.loc[sentence_df.id == top_10_similarity[5],'text'].values[0]

'Derudover er der flere danske profiler, som vender hjem fra udlandet.'