## TP5: Text

In [None]:
import re
from nltk.corpus import stopwords
import nltk
from nltk.stem.snowball import FrenchStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from gensim.models import Word2Vec #fast text
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import plotly.graph_objects as go



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Fetch the NLTK stop-word corpus that stopwords.words('french') reads in the preprocessing step
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rayan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Création de la collection de documents de texte

In [4]:
# Toy corpus: six short French sentences, reused by every later step
# (preprocessing, Bag of Words / TF-IDF, Word2Vec and sentence embeddings).
documents = [
    "Le machine learning est un domaine de l'intelligence artificielle.",
    "Python est l'un des langages les plus utilisés en science des données.",
    "Les réseaux de neurones sont inspirés du fonctionnement du cerveau humain.",
    "Le traitement du langage naturel permet aux machines de comprendre le texte.",
    "La visualisation de données aide à interpréter les résultats des modèles.",
    "Le clustering regroupe des données similaires sans supervision."
]

## Préprocessing

In [5]:
# Shared resources looked up by preprocess() on every call.
stemmer = FrenchStemmer()  # Snowball stemmer for French
stop_words = set(stopwords.words('french'))  # a set, for fast membership tests


def preprocess(text: str) -> str:
    """Normalize a French sentence into a string of space-separated stems.

    Steps, in order: lowercase the text, replace every character that is not
    a lowercase Latin letter (plain or accented) or a space with a space,
    split on whitespace, drop tokens found in the module-level ``stop_words``
    set, then stem each remaining token with the module-level ``stemmer``.

    Args:
        text: Raw sentence to clean.

    Returns:
        The surviving stems joined by single spaces ("" if nothing survives).
    """
    text = text.lower()
    # Keep the whole set of French lowercase letters, not only "àçéèê":
    # à-ö and ø-ÿ cover â, î, ï, ô, ù, û, ë, ü, ... (the gap skips "÷"), and
    # "œ" sits outside that range. The narrower class cut words such as
    # "côté" or "sûr" into fragments before stop-word removal and stemming.
    text = re.sub(r'[^a-zà-öø-ÿœ ]', ' ', text)
    tokens = [word for word in text.split() if word not in stop_words]
    return " ".join(stemmer.stem(word) for word in tokens)

# Run every raw document through the cleaning pipeline, keeping corpus order.
preprocessed_docs = list(map(preprocess, documents))

# Show each cleaned document next to its 1-based position in the corpus.
for position, cleaned in enumerate(preprocessed_docs, start=1):
    print(f"Doc {position} -> {cleaned}")


Doc 1 -> machin learning domain intelligent artificiel
Doc 2 -> python langag plus utilis scienc don
Doc 3 -> réseau neuron inspir fonction cerveau humain
Doc 4 -> trait langag naturel permet machin comprendr text
Doc 5 -> visualis don aid interpret résultat model
Doc 6 -> clustering regroup don similair san supervis


## Construction de la matrice Bag of Words

In [6]:
# Bag of Words: learn the vocabulary and count term occurrences per document.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(preprocessed_docs)

# Densify the count matrix into a labelled table
# (one row per document, one column per vocabulary term).
vocabulaire = vectorizer.get_feature_names_out()
df_bow = pd.DataFrame(data=X.toarray(), columns=vocabulaire)
print(df_bow)

   aid  artificiel  cerveau  clustering  comprendr  domain  don  fonction  \
0    0           1        0           0          0       1    0         0   
1    0           0        0           0          0       0    1         0   
2    0           0        1           0          0       0    0         1   
3    0           0        0           0          1       0    0         0   
4    1           0        0           0          0       0    1         0   
5    0           0        0           1          0       0    1         0   

   humain  inspir  ...  réseau  résultat  san  scienc  similair  supervis  \
0       0       0  ...       0         0    0       0         0         0   
1       0       0  ...       0         0    0       1         0         0   
2       1       1  ...       1         0    0       0         0         0   
3       0       0  ...       0         0    0       0         0         0   
4       0       0  ...       0         1    0       0         0         0  

## Construction de la matrice TF-IDF

In [7]:

# TF-IDF: fit a fresh vectorizer on the cleaned corpus.
# Note: this rebinds `vectorizer`, which until now held the CountVectorizer.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(preprocessed_docs)

# Same table layout as the Bag of Words matrix: documents in rows, terms in columns.
vocabulaire = vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame(data=X_tfidf.toarray(), columns=vocabulaire)
print(df_tfidf)

        aid  artificiel   cerveau  clustering  comprendr    domain       don  \
0  0.000000    0.462625  0.000000    0.000000   0.000000  0.462625  0.000000   
1  0.000000    0.000000  0.000000    0.000000   0.000000  0.000000  0.305018   
2  0.000000    0.000000  0.408248    0.000000   0.000000  0.000000  0.000000   
3  0.000000    0.000000  0.000000    0.000000   0.396999  0.000000  0.000000   
4  0.427206    0.000000  0.000000    0.000000   0.000000  0.000000  0.295760   
5  0.000000    0.000000  0.000000    0.427206   0.000000  0.000000  0.295760   

   fonction    humain    inspir  ...    réseau  résultat       san    scienc  \
0  0.000000  0.000000  0.000000  ...  0.000000  0.000000  0.000000  0.000000   
1  0.000000  0.000000  0.000000  ...  0.000000  0.000000  0.000000  0.440579   
2  0.408248  0.408248  0.408248  ...  0.408248  0.000000  0.000000  0.000000   
3  0.000000  0.000000  0.000000  ...  0.000000  0.000000  0.000000  0.000000   
4  0.000000  0.000000  0.000000  ...  0

## Affichage du vocabulaire extrait

In [8]:
# Terms learned by the most recently fitted vectorizer, in column order.
vocabulaire = vectorizer.get_feature_names_out()

# Header line first, then the term array on its own line.
print("Vocabulaire extrait :", vocabulaire, sep="\n")

Vocabulaire extrait :
['aid' 'artificiel' 'cerveau' 'clustering' 'comprendr' 'domain' 'don'
 'fonction' 'humain' 'inspir' 'intelligent' 'interpret' 'langag'
 'learning' 'machin' 'model' 'naturel' 'neuron' 'permet' 'plus' 'python'
 'regroup' 'réseau' 'résultat' 'san' 'scienc' 'similair' 'supervis' 'text'
 'trait' 'utilis' 'visualis']


## Utiliser Word2Vec pour les mots et afficher dans un plot 3d

In [None]:

# Word2Vec expects a list of token lists, so split each cleaned document on whitespace.
tokenized_docs = list(map(str.split, preprocessed_docs))

# min_count=1 keeps every token of this tiny corpus; workers=1 and a fixed seed
# are set for repeatable training runs.
model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=20,
    window=3,
    min_count=1,
    workers=1,
    seed=42,
)

# Collect the learned vocabulary and the matching embedding rows.
words = list(model.wv.index_to_key)
word_vectors = model.wv[words]

# Project the 20-dimensional word vectors onto their first three principal components.
pca = PCA(n_components=3)
word_vecs_3d = pca.fit_transform(word_vectors)
pc1, pc2, pc3 = word_vecs_3d.T  # one coordinate array per component

# One labelled marker per word.
word_trace = go.Scatter3d(
    x=pc1,
    y=pc2,
    z=pc3,
    mode='markers+text',
    marker={'size': 6, 'color': 'blue'},
    text=words,
    textposition='top center',
)

fig = go.Figure(data=[word_trace])
fig.update_layout(
    title="Word2Vec - Projection 3D des mots (Plotly)",
    scene={
        'xaxis_title': 'PC1',
        'yaxis_title': 'PC2',
        'zaxis_title': 'PC3',
    },
)

fig.show()

## Utiliser MiniLM pour les phrases et afficher dans un plot 3d

In [12]:
# Sentence-level embeddings: encode the raw (un-preprocessed) documents.
# Note: this rebinds `model`, which previously held the Word2Vec model.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(documents)

# Reduce the sentence embeddings to three principal components for plotting.
pca = PCA(n_components=3)
embeddings_3d = pca.fit_transform(embeddings)
pc1, pc2, pc3 = embeddings_3d.T  # one coordinate array per component

# Label each point with its 1-based document number.
doc_labels = [f"Doc {number}" for number in range(1, len(documents) + 1)]

sentence_trace = go.Scatter3d(
    x=pc1,
    y=pc2,
    z=pc3,
    mode='markers+text',
    marker={'size': 8, 'color': 'red'},
    text=doc_labels,
    textposition='top center',
)

fig = go.Figure(data=[sentence_trace])
fig.update_layout(
    title="MiniLM - Projection 3D des phrases (Plotly)",
    scene={
        'xaxis_title': 'PC1',
        'yaxis_title': 'PC2',
        'zaxis_title': 'PC3',
    },
)
fig.show()