In [10]:
import pandas as pd

from helpsk.utility import read_pickle
from helpsk.logging import Timer

In [11]:
# Settings that select which saved topic-model artifacts get loaded: they fill
# the `topics-N` and `ngrams-LOW-HIGH` slots of the artifact file names.
ngrams_low, ngrams_high = 1, 3
num_clusters = 10

In [12]:
def _topic_artifact(family, kind):
    """Return the pickle path of one saved topic-model artifact.

    `family` is the file-name prefix ('nmf' or 'lda') and `kind` is the
    file-name suffix ('vectorizer', 'vectors' or 'model'). The topic count and
    n-gram range always come from `num_clusters`, `ngrams_low` and
    `ngrams_high`, so the vectorizer, vectors and model of a family can never
    be loaded for different settings.
    """
    return (
        f'/code/artifacts/models/topics/{family}-topics-{num_clusters}'
        f'-ngrams-{ngrams_low}-{ngrams_high}__{kind}.pkl'
    )


# NOTE: everything below is unpickled; only load artifacts produced by this
# project, since unpickling untrusted files can execute arbitrary code.
with Timer("Loading Data"):
    path = '/code/artifacts/data/processed/un_debate_paragraphs.pkl'
    paragraphs = pd.read_pickle(path)
    # Fixed seed so the same 5000 rows are drawn on every run.
    # NOTE(review): the saved vectors are later compared row-for-row with this
    # sample, so presumably they were built from the same draw -- confirm.
    paragraphs = paragraphs.sample(5000, random_state=42)

with Timer("Loading TF-IDF vectorizer/model via NMF"):
    _file = _topic_artifact('nmf', 'vectorizer')
    tfidf_vectorizer = read_pickle(_file)

    # Previously this path hard-coded "10" instead of using num_clusters.
    _file = _topic_artifact('nmf', 'vectors')
    tfidf_vectors = read_pickle(_file)

    _file = _topic_artifact('nmf', 'model')
    tfidf_model = read_pickle(_file)

with Timer("Loading Count vectorizer/model via LDA"):
    _file = _topic_artifact('lda', 'vectorizer')
    count_vectorizer = read_pickle(_file)

    _file = _topic_artifact('lda', 'vectors')
    count_vectors = read_pickle(_file)

    _file = _topic_artifact('lda', 'model')
    count_model = read_pickle(_file)

In [13]:
# Peek at the first five rows of the sampled paragraphs.
paragraphs.head(n=5)

Unnamed: 0,year,country,text
228015,2006,"Palestine, State of","Indeed, I need not reconfirm the fact that, af..."
113526,1988,Turkey,With this understanding we have initiated a di...
224796,2006,Central African Republic,"The recent conference on AIDS, held here at \n..."
130703,1991,Sweden,The Swedish Government strongly supports the e...
59553,1980,Mozambique,76.\tDue to the tolerance shown to South Afric...


# Cosine Similarity - TF-IDF

In [14]:
# Bare expression: the notebook shows the frame's (truncated) repr as the cell output.
paragraphs

Unnamed: 0,year,country,text
228015,2006,"Palestine, State of","Indeed, I need not reconfirm the fact that, af..."
113526,1988,Turkey,With this understanding we have initiated a di...
224796,2006,Central African Republic,"The recent conference on AIDS, held here at \n..."
130703,1991,Sweden,The Swedish Government strongly supports the e...
59553,1980,Mozambique,76.\tDue to the tolerance shown to South Afric...
...,...,...,...
140522,1993,Cuba,"The new international economic order, three\ni..."
166859,1996,Mauritania,"It is gratifying to note that, as the Organiza..."
99907,1986,Luxembourg,Our third cause for concern is undoubtedly con...
210583,2003,Luxembourg,As was recently noted by the Secretary-General...


In [49]:
from sklearn.metrics.pairwise import cosine_similarity

# NOTE: to use this for *search*, new/query text has to go through the same
# cleaning pipeline as the training data. A single shared pipeline would also
# guard against inconsistencies, e.g. accidentally using text_clean instead of
# all_lemmas.
example = paragraphs['text'].iat[0]
example

'Indeed, I need not reconfirm the fact that, after \nthe experiences of war and suffering that we have been \nthrough, unless the question of Palestine and that of the \ncontinuing occupation of Palestinian and Arab lands \nsince 1967 is resolved, the elements of tension and \nconflagration will keep the conflict alive and leave the \ndoor wide open to all forms of violence, terrorism, \nregional confrontations and global crises'

In [50]:
# Vectorize the example with the loaded TF-IDF vectorizer; the text is wrapped
# in a one-element list, so the result has a single row.
example_vector = tfidf_vectorizer.transform([example])
example_vector.shape

(1, 7852)

In [51]:
# calculate cosine similarity between the original vectors (i.e. tfidf_vectors) and our example
example_cosine_sim = cosine_similarity(tfidf_vectors, example_vector)
example_cosine_sim = example_cosine_sim.reshape(1, -1)[0]
example_cosine_sim.shape

(5000,)

In [75]:
top_n = 10
# Attach the scores to a copy so `paragraphs` itself is left untouched.
_temp_sample = paragraphs.assign(similarities=example_cosine_sim)

top_n_examples = (
    _temp_sample
    .sort_values(by='similarities', ascending=False)
    .iloc[:top_n]
)
# Sanity check: the example is itself one of the paragraphs, so the best match
# must score 1 (after rounding).
_best_score = top_n_examples['similarities'].iloc[0]
assert round(_best_score, 4) == 1

In [76]:
# print() so the newlines embedded in the text render as line breaks.
print(top_n_examples['text'].iat[0])

Indeed, I need not reconfirm the fact that, after 
the experiences of war and suffering that we have been 
through, unless the question of Palestine and that of the 
continuing occupation of Palestinian and Arab lands 
since 1967 is resolved, the elements of tension and 
conflagration will keep the conflict alive and leave the 
door wide open to all forms of violence, terrorism, 
regional confrontations and global crises


In [77]:
# Second-ranked row, i.e. the closest paragraph other than the example itself.
print(top_n_examples['text'].iat[1])

49.	The international community unanimously recognizes that a just and lasting peace cannot be achieved if it does not include the basic elements that we have just set forth. But the Zionist entity, which professes to want peace, arrogantly and obstinately opposes this unanimous will of the international community and continues its aggression and its occupation of Palestine and other independent and sovereign Arab countries neighbouring Palestine. Moreover, the decision taken on 16 September last by the Council of Ministers of Israel, under which Israelis will be allowed to acquire Arab lands and property on the West Bank and in Jerusalem, again confirms Israel's determination to pursue its policy of Occupation and aggression and to undermine the chances for a just and lasting peace in the area


In [78]:
# Raise pandas' column-width display option to 300 (a session-wide setting).
pd.set_option('display.max_colwidth', 300)
# The cell's value is a Styler over the columns of interest.
(
    top_n_examples[['year', 'country', 'text', 'similarities']]
    .style
)

Unnamed: 0,year,country,text,similarities
228015,2006,"Palestine, State of","Indeed, I need not reconfirm the fact that, after the experiences of war and suffering that we have been through, unless the question of Palestine and that of the continuing occupation of Palestinian and Arab lands since 1967 is resolved, the elements of tension and conflagration will keep the conflict alive and leave the door wide open to all forms of violence, terrorism, regional confrontations and global crises",1.0
54940,1979,Yemen,"49.	The international community unanimously recognizes that a just and lasting peace cannot be achieved if it does not include the basic elements that we have just set forth. But the Zionist entity, which professes to want peace, arrogantly and obstinately opposes this unanimous will of the international community and continues its aggression and its occupation of Palestine and other independent and sovereign Arab countries neighbouring Palestine. Moreover, the decision taken on 16 September last by the Council of Ministers of Israel, under which Israelis will be allowed to acquire Arab lands and property on the West Bank and in Jerusalem, again confirms Israel's determination to pursue its policy of Occupation and aggression and to undermine the chances for a just and lasting peace in the area",0.17164
18124,1974,United Arab Emirates,"198.	Our position on the question of Palestine is known to all. We have often stated our views regarding the aggressive nature of Zionism and denounced Israel's aggression and occupation of Arab territories. We shall, nevertheless, state in greater detail our views during the discussion of the question of Palestine and other relevant items. We should like, however, to reiterate now our demand for Israel's withdrawal from all occupied territories. We shall continue to extend every possible assistance for the achievement of that goal",0.169033
92629,1985,Iraq,"The Palestinian question, including the rights of the Arab Palestinian people, has been one of the most important issues in the United Nations since its foundation, while the tragedy of this dispersed people remains a pressing matter, the Arabs of Palestine are repeatedly subjected to massacre and extermination. The objective has always been to eliminate this people's heritage, identity and very existence. The Palestinians continue to suffer the most inhuman treatment at the hands of their Zionist oppressors, whose racist measures have included not only expulsion, dispersion, confiscation of property, and so on, but also the establishment of Zionist settlements on Arab lands",0.164453
47040,1978,Sao Tome and Principe,"108.	The position of my Government concerning the Palestinian question, which arose soon after the Second World War, is well known. Israel must be asked to abandon its policy of aggression, occupation and expansion, a policy which keeps alive the existing conflict in the Middle East, which we all deplore. To be more explicit, Israel must be asked to withdraw from the Arab territories occupied since 1967 and to recognize the right of the Palestinian people to a homeland. Peace in that region is closely linked with a change of attitude by Israel, which must henceforward understand that war is in no way beneficial to the development and progress of the world or to understanding among the nations and peoples of the earth. War renders ill service to international peace and security. War is destructive from every point of view",0.156974
263055,2012,Turkey,"For instance, we have time and again declared our support for a two-State solution to the question of Palestine and adopted numerous resolutions to that end. However, we still hope that, one day soon, Palestine will be represented as an equal member in the Assembly",0.153039
89228,1984,United States,"27.	But any economic progress, as well as any movement in the direction of greater understanding between the nations of the world, is, of course, endangered by the prospect of conflict at both the global and the regional levels. In a few minutes I will turn to the menace of conflict on a world-wide scale and discuss the status of negotiations between the United States and the Soviet Union. But permit me first to address the critical problem of regional conflicts, for history displays tragic evidence that it is these conflicts which can set off the sparks leading to world-wide conflagration",0.149119
53325,1979,Qatar,"10.	The State of Qatar, in fulfilment of what was unanimously agreed upon by the Arabs and in the resolutions of Arab Summit Conferences, particularly those of Algiers in 1974, Rabat in 1974 and Baghdad in 1978, and in accordance with the international community's stand, as reflected in the resolutions of the United Nations and the Sixth Conference of Heads of State or Government of Non-Aligned Countries, reaffirms that any peaceful, just and comprehensive settlement of the Arab-Israeli conflict should be based on the following main principles: first, the total withdrawal of Israel from all Arab territories occupied by force in 1967, including Arab Jerusalem; secondly, the recognition of the inalienable rights of the Palestinian people and the exercise of their legitimate rights to self-determination, independence and sovereignty in Palestine, their homeland; thirdly, the right of the PLO, the sole legitimate representative of the Palestinian people, to participate as an independent and equal partner in all international conferences, activities and international forums concerned with the Palestinian question and the Middle East dispute, which means that no other party may have the right to speak on behalf of the Palestinian people; and fourthly, the invalidity of all agreements pertaining to the Palestinian question unless the PLO, as the sole legitimate representative of the Palestinian people, is a principal partner to them, in accordance with General Assembly resolution 33/28 A. Hence, the implications of the Camp David agreements, which are considered by the Government of Qatar as a violation of the Charter of the League of Arab States, international legitimacy and the resolutions of the United Nations in relation to the Palestine question and the occupied Arab and Palestinian territories, should be declared null and void. 
Furthermore, these agreements ignored the legitimate rights of the Palestinian people, including their right to establish their independent State in Palestine, and also completely ignored the question of occupied Arab Jerusalem",0.145091
207437,2002,Zimbabwe,"The United Nations is confronted with a volatile situation in the Middle East that has the potential to engulf the entire subregion. The Palestinian question should be resolved without further delay, as it is causing untold suffering to the people in the occupied 25 territories. Israel must withdraw its forces from Palestinian lands, and the Palestinians must be afforded the opportunity of having a State of their own. The carnage that is going on at present will benefit neither side. Israel must know that its chances for peace and security lie in having a Palestinian State that will live side by side with it in mutual respect between sovereign States",0.138416
207053,2002,"Tanzania, United Republic of","The framework for the resumption of negotiations between Israel and the Government of Palestine already exists. Both sides must create an environment conducive to negotiations. There has to be an end to Israeli occupation of Arab lands and significant movement towards the creation of a viable Palestinian State living side by side with Israel as an imperative to lasting peace in the Middle East. The security concerns of Israel should be addressed and encoded properly in any agreement. In this regard, all relevant Security Council and General Assembly resolutions must be adhered to",0.135827


---

# Cosine Similarity - Embeddings

In [81]:
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model, referenced by name.
# NOTE(review): presumably fetched on first use when not already cached --
# confirm it is available in the runtime environment.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

In [29]:
# Embed all sampled paragraphs in a single encode() call.
_paragraph_texts = paragraphs['text'].tolist()
paragraph_embeddings = model.encode(_paragraph_texts)
paragraph_embeddings.shape

(5000, 384)

In [38]:
# Use the first paragraph as the query. Index the column directly instead of
# converting the whole column to a list just to take its first item.
query_embedding = model.encode(paragraphs['text'].iloc[0])
# Dot-product score of the query against every paragraph embedding; [0] picks
# the row belonging to our single query.
# NOTE(review): a dot product only equals cosine similarity for unit-length
# embeddings -- re-check this if the embedding model is ever swapped.
similarities = util.dot_score(query_embedding, paragraph_embeddings)[0]
similarities

tensor([1.0000, 0.3442, 0.3606,  ..., 0.3995, 0.2873, 0.3785])

---

In [114]:
# Cross-check: sklearn's cosine similarity against the dot-product scores.
cosine_sims = cosine_similarity(paragraph_embeddings, query_embedding.reshape(1, -1))
# The first five values of both should agree.
# NOTE(review): that only holds when the embeddings are unit-length, which
# presumably is the case for this model -- confirm before relying on it.
print(cosine_sims[:5].T)
print(similarities[:5])

[[1.         0.34419763 0.3605953  0.36372387 0.19492552]]
tensor([1.0000, 0.3442, 0.3606, 0.3637, 0.1949])


---

In [70]:
top_n = 10
# Work on a copy so the score column is not added to `paragraphs` itself.
_temp_sample = paragraphs.copy()
_temp_sample['similarities'] = similarities
top_n_examples = (
    _temp_sample
    .sort_values(by='similarities', ascending=False)
    .iloc[:top_n]
)
# Sanity check: the query is itself one of the paragraphs, so the best match
# must score 1 (after rounding).
_best_score = top_n_examples['similarities'].iloc[0]
assert round(_best_score, 4) == 1

In [74]:
# Raise pandas' column-width display option to 300 (a session-wide setting).
pd.set_option('display.max_colwidth', 300)
# The cell's value is a Styler over the columns of interest.
(
    top_n_examples[['year', 'country', 'text', 'similarities']]
    .style
)

Unnamed: 0,year,country,text,similarities
228015,2006,"Palestine, State of","Indeed, I need not reconfirm the fact that, after the experiences of war and suffering that we have been through, unless the question of Palestine and that of the continuing occupation of Palestinian and Arab lands since 1967 is resolved, the elements of tension and conflagration will keep the conflict alive and leave the door wide open to all forms of violence, terrorism, regional confrontations and global crises",1.0
224905,2006,China,"The question of Palestine is at the core of the Middle East issue. The peace process should be re- started in keeping with the relevant resolutions of the United Nations and the principle of land for peace, in the interest of reaching an early and comprehensive settlement of the Middle East issue, including the Palestinian-Israeli conflict, so that the countries involved can live in harmony and the region can enjoy durable peace",0.757447
47040,1978,Sao Tome and Principe,"108.	The position of my Government concerning the Palestinian question, which arose soon after the Second World War, is well known. Israel must be asked to abandon its policy of aggression, occupation and expansion, a policy which keeps alive the existing conflict in the Middle East, which we all deplore. To be more explicit, Israel must be asked to withdraw from the Arab territories occupied since 1967 and to recognize the right of the Palestinian people to a homeland. Peace in that region is closely linked with a change of attitude by Israel, which must henceforward understand that war is in no way beneficial to the development and progress of the world or to understanding among the nations and peoples of the earth. War renders ill service to international peace and security. War is destructive from every point of view",0.735725
136871,1992,Oman,"The problem of Palestine has been the major factor underlying the tragedies and wars witnessed by the peoples of the Middle East region. Peace and stability in this important and delicate region cannot be achieved without finding a solution to this problem, which is the crux of the Arab-Israeli conflict",0.731984
94917,1985,Saudi Arabia,"The problems that our region is facing and the complications that appear on the ground there are in fact, and in the final analysis, only repercussions of the Zionist aggression against Palestine, and the actual result of the accumulation of developments in the Palestine problem. The wars and political and military conflicts through which the area has been living for the past 33 years are but one effect of the failure to solve the basic problem, and are a result of Israeli aggression in the area. The perpetuation of Israeli occupation of Arab territories is living proof of its designs for expansion, its flagrant defiance of United Nations resolutions and its constant disregard for world public opinion and every law and convention",0.726528
143786,1993,Mongolia,"Furthermore, solutions are being found even to the most protracted regional conflicts. The recent historic act of mutual recognition between Israel and the Palestine Liberation Organization (PLO) and the interim peace agreement are a case in point. We hope that this will lead to a just and comprehensive solution to the Arab-Israeli conflict",0.713083
200785,2001,"Palestine, State of","We welcome the positive positions taken by President George W. Bush and other leaders who have called for the establishment of a Palestinian State. We believe this constitutes a significant step towards ending the conflict and establishing peace in the Middle East. I will say candidly to you that reviving and completing the peace process will need a new qualitative push, and after all that has happened it will not be possible to confine ourselves to interim solutions. It is impossible, of course, to achieve another interim agreement, as called for by some. To control the situation on the ground and to get the situation back to the way it existed before 28 September 2000 requires clear political imagination and new hope",0.702671
148474,1994,Congo,"Firmly and resolutely, the dynamic of peace begun in the Middle East is growing stronger and is opening prospects for cooperation to peoples long ruined by war and yet fated by geography and history to live together",0.694066
119687,1989,Democratic Yemen,"This positive initiative has been accompanied by the steadfastness of the Palestinian people's intifadah in the occupied territories. From this rostrum we salute the continued courage of the Palestinian people engaged in an in equal confrontation with the Israeli occupation, it is important to state that Israel's true image has been clearly revealed to world public opinion. Does any doubt still persist about the fact that Israel does not want peace to prevail in the region? Does any doubt still persist about another fact that Israel's policy is based on expansion and colonial settlement? Has it not yet been proved beyond any doubt through clear, tangible, irrefutable evidence, that Israel violates human rights in the occupied territories? Has the world forgotten the crimes committed by Israel against the defenceless Palestinian people, burying them alive and breaking their bones? The international community as a whole calls for terminating the Israeli occupation of Palestine and other Arab territories. It calls for seizing the opportunity to proceed with the preparations for the convening of the International Peace Conference on the Middle East, in accordance with numerous successive resolutions adopted by the General Assembly, with the participation on an equal footing of all parties concerned, including the Palestine Liberation Organization, the sole legitimate representative of the Palestinian people. That is the right approach, agreed upon internationally it would lead to a just political settlement in the Middle East that would guarantee the national inalienable rights of the Palestinian people, foremost among which is their right to self-determination and to the establishment of their independent State",0.692149
207437,2002,Zimbabwe,"The United Nations is confronted with a volatile situation in the Middle East that has the potential to engulf the entire subregion. The Palestinian question should be resolved without further delay, as it is causing untold suffering to the people in the occupied 25 territories. Israel must withdraw its forces from Palestinian lands, and the Palestinians must be afforded the opportunity of having a State of their own. The carnage that is going on at present will benefit neither side. Israel must know that its chances for peace and security lie in having a Palestinian State that will live side by side with it in mutual respect between sovereign States",0.691404


---