In [4]:
import pandas as pd

from helpsk.utility import read_pickle
from helpsk.logging import Timer

In [3]:
# Settings identifying which pre-trained topic-model artifacts to load;
# the values are interpolated into the artifact file names in the loading cell.
ngrams_low, ngrams_high = 1, 3  # lower / upper bound of the n-gram range
num_clusters = 10               # number of topics/clusters

In [37]:
def _topic_artifact_path(algorithm: str, artifact: str) -> str:
    """Build the path of a pickled topic-model artifact.

    The topic count and n-gram range are taken from the notebook-level
    settings (`num_clusters`, `ngrams_low`, `ngrams_high`) so that every
    artifact loaded in this cell belongs to the same model configuration.

    Args:
        algorithm: file-name prefix of the model family, e.g. 'nmf' or 'lda'.
        artifact: file-name suffix: 'vectorizer', 'vectors' or 'model'.

    Returns:
        Path of the corresponding `.pkl` file.
    """
    return (
        '/code/artifacts/models/topics/'
        f'{algorithm}-topics-{num_clusters}-ngrams-{ngrams_low}-{ngrams_high}__{artifact}.pkl'
    )


# NOTE: everything below is unpickled; only load artifacts from a trusted source.
with Timer("Loading Data"):
    path = '/code/artifacts/data/processed/un-general-debates-paragraphs.pkl'
    paragraphs = pd.read_pickle(path)
    # Work on a fixed, reproducible sample of 5,000 paragraphs.
    # NOTE(review): the pre-computed vectors loaded below are presumably row-aligned
    # with this exact sample (same size and seed) -- confirm against the training code.
    paragraphs = paragraphs.sample(5000, random_state=42)

with Timer("Loading TF-IDF vectorizer/model via NMF"):
    _file = _topic_artifact_path('nmf', 'vectorizer')
    tfidf_vectorizer = read_pickle(_file)

    # Previously this path hard-coded `nmf-topics-10-`; it now follows `num_clusters`
    # like the vectorizer and model paths, so the three artifacts cannot drift apart.
    _file = _topic_artifact_path('nmf', 'vectors')
    tfidf_vectors = read_pickle(_file)

    _file = _topic_artifact_path('nmf', 'model')
    tfidf_model = read_pickle(_file)

with Timer("Loading Count vectorizer/model via LDA"):
    _file = _topic_artifact_path('lda', 'vectorizer')
    count_vectorizer = read_pickle(_file)

    _file = _topic_artifact_path('lda', 'vectors')
    count_vectors = read_pickle(_file)

    _file = _topic_artifact_path('lda', 'model')
    count_model = read_pickle(_file)

In [38]:
# Preview the first rows of the sampled paragraphs (columns: year, country, text).
paragraphs.head()

Unnamed: 0,year,country,text
228015,2006,"Palestine, State of","Indeed, I need not reconfirm the fact that, af..."
113526,1988,Turkey,With this understanding we have initiated a di...
224796,2006,Central African Republic,"The recent conference on AIDS, held here at \n..."
130703,1991,Sweden,The Swedish Government strongly supports the e...
59553,1980,Mozambique,76.\tDue to the tolerance shown to South Afric...


# Cosine Similarity

In [39]:
# Display the sampled paragraphs; the rendered output is truncated to the first and last rows.
paragraphs

Unnamed: 0,year,country,text
228015,2006,"Palestine, State of","Indeed, I need not reconfirm the fact that, af..."
113526,1988,Turkey,With this understanding we have initiated a di...
224796,2006,Central African Republic,"The recent conference on AIDS, held here at \n..."
130703,1991,Sweden,The Swedish Government strongly supports the e...
59553,1980,Mozambique,76.\tDue to the tolerance shown to South Afric...
...,...,...,...
140522,1993,Cuba,"The new international economic order, three\ni..."
166859,1996,Mauritania,"It is gratifying to note that, as the Organiza..."
99907,1986,Luxembourg,Our third cause for concern is undoubtedly con...
210583,2003,Luxembourg,As was recently noted by the Secretary-General...


In [40]:
from sklearn.metrics.pairwise import cosine_similarity

# NOTE: to use this for *search*, the cleaning steps need to live in a pipeline that is
# applied to new/unseen text as well as to the training data. A single shared pipeline
# would also guard against inconsistencies, e.g. accidentally using `text_clean`
# instead of `all_lemmas`.

# Use the raw text of the first sampled paragraph as the query example.
example = paragraphs['text'].iat[0]
example

'Indeed, I need not reconfirm the fact that, after \nthe experiences of war and suffering that we have been \nthrough, unless the question of Palestine and that of the \ncontinuing occupation of Palestinian and Arab lands \nsince 1967 is resolved, the elements of tension and \nconflagration will keep the conflict alive and leave the \ndoor wide open to all forms of violence, terrorism, \nregional confrontations and global crises'

In [41]:
# Vectorize the example text with the loaded vectorizer (passed as a one-element list).
example_vector = tfidf_vectorizer.transform([example])
# Show the shape of the resulting vector.
example_vector.shape

(1, 7852)

In [42]:
# calculate cosine similarity between the original vectors (i.e. tfidf_vectors) and our example
example_cosine_sim = cosine_similarity(tfidf_vectors, example_vector)
example_cosine_sim = example_cosine_sim.reshape(1, -1)[0]
example_cosine_sim.shape

(5000,)

In [44]:
# Number of most-similar paragraphs to keep.
top_n = 15

# Attach the similarity scores as a new column on a copy, leaving `paragraphs` untouched.
_temp_sample = paragraphs.assign(similarities=example_cosine_sim)

# Rank by similarity, highest first, and keep the best `top_n` rows.
top_n_examples = _temp_sample.sort_values(by='similarities', ascending=False).head(top_n)

# Sanity check: the best match is expected to be the example itself (similarity ~1).
assert round(top_n_examples['similarities'].iat[0], 4) == 1

In [45]:
# Show the most similar paragraphs together with their similarity scores.
top_n_examples

Unnamed: 0,year,country,text,similarities
228015,2006,"Palestine, State of","Indeed, I need not reconfirm the fact that, af...",1.0
54940,1979,Yemen,49.\tThe international community unanimously r...,0.17164
18124,1974,United Arab Emirates,198.\tOur position on the question of Palestin...,0.169033
92629,1985,Iraq,"The Palestinian question, including the rights...",0.164453
47040,1978,Sao Tome and Principe,108.\tThe position of my Government concerning...,0.156974
263055,2012,Turkey,"For instance, we have time and again declared\...",0.153039
89228,1984,United States,"27.\tBut any economic progress, as well as any...",0.149119
53325,1979,Qatar,"10.\tThe State of Qatar, in fulfilment of what...",0.145091
207437,2002,Zimbabwe,The United Nations is confronted with a volati...,0.138416
207053,2002,"Tanzania, United Republic of",The framework for the resumption of negotiatio...,0.135827


In [46]:
# Best match (rank 0); printed so that embedded newlines render as line breaks.
print(top_n_examples['text'].iat[0])

Indeed, I need not reconfirm the fact that, after 
the experiences of war and suffering that we have been 
through, unless the question of Palestine and that of the 
continuing occupation of Palestinian and Arab lands 
since 1967 is resolved, the elements of tension and 
conflagration will keep the conflict alive and leave the 
door wide open to all forms of violence, terrorism, 
regional confrontations and global crises


In [47]:
# Second-best match (rank 1), i.e. the most similar paragraph other than the top hit.
print(top_n_examples['text'].iat[1])

49.	The international community unanimously recognizes that a just and lasting peace cannot be achieved if it does not include the basic elements that we have just set forth. But the Zionist entity, which professes to want peace, arrogantly and obstinately opposes this unanimous will of the international community and continues its aggression and its occupation of Palestine and other independent and sovereign Arab countries neighbouring Palestine. Moreover, the decision taken on 16 September last by the Council of Ministers of Israel, under which Israelis will be allowed to acquire Arab lands and property on the West Bank and in Jerusalem, again confirms Israel's determination to pursue its policy of Occupation and aggression and to undermine the chances for a just and lasting peace in the area


---