In [24]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
# Fonction de prétraitement du texte
def preprocess_text(text):

    # Suppression de la ponctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenization
    tokens = word_tokenize(text)

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]

    # Suppression des mots vides
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    return tokens

In [26]:
# Lecture du fichier texte
file_path = "/content/text"
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()
print(text)

**Morocco and Marrakech: A Tapestry of Tradition and Modernity** Morocco, located at the crossroads of Europe and Africa, is a country drenched in history, mystery, and cultural richness. A testament to the ancient civilizations that once flourished here, this North African kingdom boasts a unique blend of Arab, Berber, and European influences. At the heart of Morocco's rich tapestry lies Marrakech, one of its four imperial cities and a vibrant epicenter of tradition and modernity. **Geographical Significance** Morocco is bordered by the Atlantic Ocean to the west, the Mediterranean Sea to the north, Algeria to the east and southeast, and the vast Sahara desert to the south. Its strategic location has historically made it a sought-after territory and a melting pot of cultures, religions, and trade routes. **Marrakech: The Red City** Marrakech, often referred to as "The Red City" due to its distinctive red-hued buildings, stands against the backdrop of the snow-capped Atlas Mountains. E

In [27]:
# Prétraitement du texte
preprocessed_text = preprocess_text(text)

#Affichage du texte et liste
print(preprocessed_text)
text = ' '.join(preprocessed_text)
print("\n\n")
print(text)


['morocco', 'marrakech', 'tapestry', 'tradition', 'modernity', 'morocco', 'located', 'crossroad', 'europe', 'africa', 'country', 'drenched', 'history', 'mystery', 'cultural', 'richness', 'testament', 'ancient', 'civilization', 'flourished', 'north', 'african', 'kingdom', 'boast', 'unique', 'blend', 'arab', 'berber', 'european', 'influence', 'heart', 'morocco', 'rich', 'tapestry', 'lie', 'marrakech', 'one', 'four', 'imperial', 'city', 'vibrant', 'epicenter', 'tradition', 'modernity', 'geographical', 'significance', 'morocco', 'bordered', 'atlantic', 'ocean', 'west', 'mediterranean', 'sea', 'north', 'algeria', 'east', 'southeast', 'vast', 'sahara', 'desert', 'south', 'strategic', 'location', 'ha', 'historically', 'made', 'soughtafter', 'territory', 'melting', 'pot', 'culture', 'religion', 'trade', 'route', 'marrakech', 'red', 'city', 'marrakech', 'often', 'referred', 'red', 'city', 'due', 'distinctive', 'redhued', 'building', 'stand', 'backdrop', 'snowcapped', 'atlas', 'mountain', 'estab

In [28]:
# Entraînement du modèle Word2Vec
model = Word2Vec([preprocessed_text], vector_size=100, window=5, min_count=1, workers=4)
print("le modele est :\n",model)

le modele est :
 Word2Vec<vocab=210, vector_size=100, alpha=0.025>


In [32]:
# Répondre aux questions

# 1- Extraire la représentation vectorielle d'un mot
word_representation = model.wv['city']
print("Représentation vectorielle du mot 'city ':\n", word_representation)
print("\n")


Représentation vectorielle du mot 'city ':
 [ 6.6840606e-05  3.1188640e-03 -6.8116994e-03 -1.3588533e-03
  7.6577305e-03  7.2906408e-03 -3.6394231e-03  2.7852328e-03
 -8.3541395e-03  6.1603719e-03 -4.6651294e-03 -3.2402035e-03
  9.3045970e-03  8.8078494e-04  7.4190558e-03 -6.1002760e-03
  5.1966719e-03  9.8368702e-03 -8.4752934e-03 -5.2442933e-03
 -7.0639197e-03 -4.8781210e-03 -3.7897250e-03 -8.5581690e-03
  7.9859551e-03 -4.8291935e-03  8.3947172e-03  5.2166432e-03
 -6.6138227e-03  3.9728591e-03  5.4788487e-03 -7.3660547e-03
 -7.3885047e-03 -2.5590172e-03 -8.6483825e-03 -1.4446034e-03
 -3.9377570e-04  3.2494597e-03  1.4309619e-03 -9.8068221e-04
 -5.6739775e-03  1.6239820e-03 -9.3661557e-04  6.7910221e-03
  4.0455121e-03  4.5521399e-03  1.3829398e-03 -2.6977756e-03
 -4.3734605e-03 -9.9467742e-04  1.5070331e-03 -2.6580691e-03
 -7.0523089e-03 -7.8059910e-03 -9.1274185e-03 -5.9446665e-03
 -1.8378758e-03 -4.3540122e-03 -6.5469230e-03 -3.6824360e-03
  4.2632846e-03 -3.7314943e-03  8.3786864

In [33]:
# 2- Calculer la similarité entre deux mots
similarity_score = model.wv.similarity('morocco', 'marrakech')
print("Similarité entre 'morocco' et 'marrakech':\n", similarity_score)

Similarité entre 'morocco' et 'marrakech':
 -0.0076152626


In [35]:
# 3- Extraire les mots contextuels (les plus similaires) pour un mot central donné
similar_words = model.wv.most_similar('cultural', topn=5)
print("Mots contextuels les plus similaires à 'morocco':\n", similar_words)

Mots contextuels les plus similaires à 'morocco':
 [('djemaa', 0.3165621757507324), ('narrow', 0.2873810827732086), ('boutique', 0.2069958746433258), ('contemporary', 0.20449739694595337), ('berber', 0.18248510360717773)]
