In [1]:
import nltk
from gensim import corpora, models, similarities
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

# **load data**

In [2]:
# Read the corpus file and split it into a list of lines (line endings removed).
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default.
# NOTE(review): assumes the file is UTF-8 encoded - confirm against the dataset.
with open("/kaggle/input/word2vec/data", "r", encoding="utf-8") as myfile:
    data = myfile.read().splitlines()

In [3]:
data

['**Morocco and Marrakech: A Tapestry of Tradition and Modernity** Morocco, located at the crossroads of Europe and Africa, is a country drenched in history, mystery, and cultural richness. A testament to the ancient civilizations that once flourished here, this North African kingdom boasts a unique blend of Arab, Berber, and European influences. At the heart of Morocco\'s rich tapestry lies Marrakech, one of its four imperial cities and a vibrant epicenter of tradition and modernity. **Geographical Significance** Morocco is bordered by the Atlantic Ocean to the west, the Mediterranean Sea to the north, Algeria to the east and southeast, and the vast Sahara desert to the south. Its strategic location has historically made it a sought-after territory and a melting pot of cultures, religions, and trade routes. **Marrakech: The Red City** Marrakech, often referred to as "The Red City" due to its distinctive red-hued buildings, stands against the backdrop of the snow-capped Atlas Mountains

# **clean data**

In [4]:
# Remove every literal "**" (markdown bold marker) from each line of `data`
# and collect the cleaned lines, in order, in `mod_data`.
mod_data = []
for w in data:
    # str.replace removes all occurrences of "**" in the line, not just the first
    mod_string = w.replace("**","")
    mod_data.append(mod_string)

**function to remove punctuation**

In [5]:
import string
def remove_punctuation(text):
    """Return `text` with every character listed in `string.punctuation` deleted.

    Characters are removed outright rather than replaced by a space, so a
    hyphenated word is fused together ("sought-after" -> "soughtafter").
    """
    return text.translate(str.maketrans('', '', string.punctuation))

**function to remove stopwords**

In [6]:
from nltk.corpus import stopwords
def remove_stopwords(text, stop_words=None):
    """Return `text` with stop words removed, remaining tokens joined by single spaces.

    Parameters
    ----------
    text : any
        Input text; it is converted with ``str()`` and split on whitespace.
    stop_words : iterable of str, optional
        Words to drop. When omitted, ``stopwords.words('english')`` is used.

    Returns
    -------
    str
        The surviving tokens joined with a single space.

    Notes
    -----
    Matching is case-sensitive: a token is dropped only when it appears
    verbatim in the stop-word collection.
    """
    if stop_words is None:
        # Load the list once per call; the previous version re-read it for
        # every single token inside the comprehension.
        stop_words = stopwords.words('english')
    # A set gives constant-time membership tests instead of a list scan per token.
    stop_set = frozenset(stop_words)
    return " ".join(word for word in str(text).split() if word not in stop_set)

**function for lemmatization**

In [7]:
import spacy
nlp = spacy.load('en_core_web_sm')
def lemmatizing(text):
    """Return `text` with each spaCy token replaced by its lemma.

    The text is run through the `nlp` pipeline loaded above and the lemmas
    of the resulting tokens are joined with single spaces.
    """
    lemmas = (token.lemma_ for token in nlp(text))
    return " ".join(lemmas)

In [8]:
# Strip punctuation from every cleaned line.
# Each element of `mod_data` is passed in turn. The previous loop passed the
# stale `mod_string` variable left over from an earlier cell on every
# iteration, so all entries were the cleaned version of the last line only.
text_no_punc = [remove_punctuation(line) for line in mod_data]

In [9]:
text_no_punc

['Morocco and Marrakech A Tapestry of Tradition and Modernity Morocco located at the crossroads of Europe and Africa is a country drenched in history mystery and cultural richness A testament to the ancient civilizations that once flourished here this North African kingdom boasts a unique blend of Arab Berber and European influences At the heart of Moroccos rich tapestry lies Marrakech one of its four imperial cities and a vibrant epicenter of tradition and modernity Geographical Significance Morocco is bordered by the Atlantic Ocean to the west the Mediterranean Sea to the north Algeria to the east and southeast and the vast Sahara desert to the south Its strategic location has historically made it a soughtafter territory and a melting pot of cultures religions and trade routes Marrakech The Red City Marrakech often referred to as The Red City due to its distinctive redhued buildings stands against the backdrop of the snowcapped Atlas Mountains Established in the 11th century it has r

In [10]:
# Drop stop words from every punctuation-free line, keeping the line order.
text_no_sw = [remove_stopwords(line) for line in text_no_punc]

In [11]:
text_no_sw

['Morocco Marrakech A Tapestry Tradition Modernity Morocco located crossroads Europe Africa country drenched history mystery cultural richness A testament ancient civilizations flourished North African kingdom boasts unique blend Arab Berber European influences At heart Moroccos rich tapestry lies Marrakech one four imperial cities vibrant epicenter tradition modernity Geographical Significance Morocco bordered Atlantic Ocean west Mediterranean Sea north Algeria east southeast vast Sahara desert south Its strategic location historically made soughtafter territory melting pot cultures religions trade routes Marrakech The Red City Marrakech often referred The Red City due distinctive redhued buildings stands backdrop snowcapped Atlas Mountains Established 11th century remained crucial political economic cultural center Morocco Journey Medina Marrakechs old town Medina UNESCO World Heritage site labyrinthine maze narrow alleys bustling souks historical landmarks The Djemaa elFna Square li

In [12]:
# Lemmatize every stop-word-free line, keeping the line order.
text_lemmatized = [lemmatizing(line) for line in text_no_sw]

In [13]:
text_lemmatized

['Morocco Marrakech A Tapestry Tradition Modernity Morocco locate crossroad Europe Africa country drench history mystery cultural richness a testament ancient civilization flourish north african kingdom boast unique blend arab Berber european influence at heart Moroccos rich tapestry lie Marrakech one four imperial city vibrant epicenter tradition modernity geographical Significance Morocco border Atlantic Ocean west Mediterranean Sea north Algeria east southeast vast Sahara desert south its strategic location historically make soughtafter territory melting pot culture religion trade route Marrakech the Red City Marrakech often refer the Red City due distinctive redhue building stand backdrop snowcappe Atlas Mountains establish 11th century remain crucial political economic cultural center Morocco Journey Medina Marrakechs old town Medina UNESCO World Heritage site labyrinthine maze narrow alley bustle souks historical landmark the Djemaa elfna Square lie heart Medina come alive every 

# **Tokenization**

In [14]:
# One token list per lemmatized line, produced by nltk.word_tokenize.
tokenized_corpus = list(map(nltk.word_tokenize, text_lemmatized))

# **initializing the model**

In [15]:
# Train a skip-gram (sg=1) Word2Vec model on the tokenized corpus.
# The seed is pinned and training is limited to a single worker thread so
# that re-running the notebook yields the same vectors; multi-threaded
# training introduces ordering jitter between runs.
# NOTE(review): per the gensim docs, full reproducibility across interpreter
# launches may additionally need PYTHONHASHSEED to be set - confirm if required.
w2v_model = models.Word2Vec(
    tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
    seed=1,
    workers=1,
)

# **saving the model**

In [16]:
# Persist the trained model under the relative path 'w2v_model'
# (resolved against the kernel's current working directory).
w2v_model.save('w2v_model')

# **Loading the model**

In [17]:
# Reload the model from the same relative path it was saved under, so this
# cell no longer depends on the working directory being /kaggle/working.
model = Word2Vec.load('w2v_model')

# **Vectorial Representation**

In [18]:
# Pair every token of the first document with its embedding vector.
# Tokens that are not in the model vocabulary are kept with a None placeholder.
word_vecs = []
for word in tokenized_corpus[0]:
    # key_to_index is a dict, so this is a hash lookup rather than a scan of
    # the index_to_key list for every token.
    if word in model.wv.key_to_index:
        vector = model.wv[word]
        word_vecs.append((word, vector))
    else:
        # Previously appended to the undefined name `word_vectors`, which
        # would raise NameError as soon as an unknown token was met.
        word_vecs.append((word, None))

# **Similarities**

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
import itertools

In [20]:
# Cosine similarity for every unordered pair of (word, vector) entries.
# Keys are (word1, word2) tuples; a pair of words that occurs more than once
# simply overwrites its earlier entry.
similarity_scores = {}
for (word1, vector1), (word2, vector2) in itertools.combinations(word_vecs, 2):
    # word_vecs may hold (word, None) for out-of-vocabulary tokens;
    # cosine_similarity cannot handle None, so such pairs are skipped.
    if vector1 is None or vector2 is None:
        continue
    # cosine_similarity expects 2-D inputs and returns a 1x1 matrix here
    similarity = cosine_similarity([vector1], [vector2])[0][0]
    similarity_scores[(word1, word2)] = similarity

In [21]:
similarity_scores

{('Morocco', 'Marrakech'): 0.0040270034,
 ('Morocco', 'A'): 0.06316205,
 ('Morocco', 'Tapestry'): 0.039247967,
 ('Morocco', 'Tradition'): -0.07628815,
 ('Morocco', 'Modernity'): 0.024023332,
 ('Morocco', 'Morocco'): 1.0000002,
 ('Morocco', 'locate'): 0.044367556,
 ('Morocco', 'crossroad'): -0.21409439,
 ('Morocco', 'Europe'): 0.00017673173,
 ('Morocco', 'Africa'): 0.045989435,
 ('Morocco', 'country'): 0.04639391,
 ('Morocco', 'drench'): -0.21707858,
 ('Morocco', 'history'): -0.099658094,
 ('Morocco', 'mystery'): 0.06707465,
 ('Morocco', 'cultural'): -0.014020264,
 ('Morocco', 'richness'): 0.030593462,
 ('Morocco', 'a'): -0.2237588,
 ('Morocco', 'testament'): -0.00065753167,
 ('Morocco', 'ancient'): 0.19023006,
 ('Morocco', 'civilization'): -0.0016548485,
 ('Morocco', 'flourish'): -0.053755783,
 ('Morocco', 'north'): 0.16563228,
 ('Morocco', 'african'): 0.101218745,
 ('Morocco', 'kingdom'): 0.026761828,
 ('Morocco', 'boast'): -0.044352293,
 ('Morocco', 'unique'): 0.16011287,
 ('Morocco'

# **Contextual words**

In [22]:
window_size = 2
# Map each token of the first document to the tokens lying within
# `window_size` positions on either side of it (the token itself excluded).
# NOTE: keys are tokens, so for a token that occurs more than once only the
# window around its last occurrence is kept.
contextual_words = {
    word: tokenized_corpus[0][max(0, i - window_size):i]
    + tokenized_corpus[0][i + 1:i + 1 + window_size]
    for i, word in enumerate(tokenized_corpus[0])
}

In [23]:
contextual_words    

{'Morocco': ['in', 'case', 'Marrakech', 'value'],
 'Marrakech': ['case', 'Morocco', 'value', 'man'],
 'A': ['Morocco', 'Marrakech', 'Tapestry', 'Tradition'],
 'Tapestry': ['Marrakech', 'A', 'Tradition', 'Modernity'],
 'Tradition': ['A', 'Tapestry', 'Modernity', 'Morocco'],
 'Modernity': ['Tapestry', 'Tradition', 'Morocco', 'locate'],
 'locate': ['Modernity', 'Morocco', 'crossroad', 'Europe'],
 'crossroad': ['Morocco', 'locate', 'Europe', 'Africa'],
 'Europe': ['locate', 'crossroad', 'Africa', 'country'],
 'Africa': ['crossroad', 'Europe', 'country', 'drench'],
 'country': ['Europe', 'Africa', 'drench', 'history'],
 'drench': ['Africa', 'country', 'history', 'mystery'],
 'history': ['the', 'convergence', 'culture', 'architecture'],
 'mystery': ['drench', 'history', 'cultural', 'richness'],
 'cultural': ['political', 'economic', 'center', 'Morocco'],
 'richness': ['mystery', 'cultural', 'a', 'testament'],
 'a': ['cultural', 'richness', 'testament', 'ancient'],
 'testament': ['richness', 