In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the NLTK data this notebook relies on (already-present packages are
# reported as up-to-date rather than re-downloaded).
# 'punkt' is kept for older NLTK releases; 'punkt_tab' is the tokenizer data
# that word_tokenize loads on NLTK >= 3.8.2, so without it a fresh environment
# fails with LookupError at the first word_tokenize call.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Data preprocessing step

# Lazily filled cache for the objects preprocess() needs on every call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the (stop_words, lemmatizer) pair, building it on first use only."""
    if not _PREPROCESS_RESOURCES:
        # Build both objects before storing either, so a failure here (e.g. the
        # stopwords corpus is missing) leaves the cache empty and is retried
        # on the next call.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _PREPROCESS_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess(text):
    """Normalize a raw English text into a space-separated string of lemmas.

    Steps, in order:
      1. drop every character that is neither a letter nor whitespace
         (punctuation and digits),
      2. lowercase,
      3. tokenize with ``word_tokenize``,
      4. remove English stopwords,
      5. lemmatize each remaining token with ``WordNetLemmatizer`` (called
         without a part-of-speech argument),
      6. join the tokens back with single spaces.

    Args:
        text: The raw text to clean (a string).

    Returns:
        The cleaned text as a single string; empty if no token survives.

    Note:
        Removed characters are deleted, not replaced by a space, so words
        separated only by punctuation are fused ("well-known" -> "wellknown").
    """
    # Remove punctuation and digits
    text = ''.join(c for c in text if c.isalpha() or c.isspace())

    # Lowercase before stopword filtering so capitalised stopwords ("The") match
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # The stopword set and lemmatizer are invariant across calls: fetch the
    # shared instances instead of rebuilding them for every document.
    stop_words, lemmatizer = _get_preprocess_resources()

    # Remove stopwords, then lemmatize what is left
    lemmatized_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]

    # Join the tokens back into a single string
    return ' '.join(lemmatized_tokens)

# Data representation step
def represent(texts):
    """Fit a ``CountVectorizer`` on ``texts`` and return the counts and vocabulary.

    Args:
        texts: Iterable of (already preprocessed) text strings, one per document.

    Returns:
        A ``(X, feature_names)`` tuple: ``X`` is whatever
        ``CountVectorizer.fit_transform`` returns for ``texts`` (one row per
        document), and ``feature_names`` is the result of
        ``get_feature_names_out()`` on the fitted vectorizer.
    """
    bag_of_words = CountVectorizer()
    counts = bag_of_words.fit_transform(texts)
    return counts, bag_of_words.get_feature_names_out()

In [5]:
# Demo: run the preprocess -> represent pipeline on three sample sentences.
# Example texts
texts = [
    "Text mining is the process of extracting useful information from unstructured text data.",
    "The preprocessing step involves cleaning and preparing the text for analysis.",
    "Representation of the data is important to make it suitable for machine learning algorithms.",
]

# Preprocess each text with preprocess() (one cleaned string per input text)
preprocessed_texts = [preprocess(text) for text in texts]

# Build the term-count matrix and the matching list of terms with represent()
X, feature_names = represent(preprocessed_texts)

# Display the results (the printed labels are user-facing French strings and
# are intentionally left untranslated)
print("Textes d'origine :")
for text in texts:
    print("- ", text)
print("\nTextes prétraités :")
for text in preprocessed_texts:
    print("- ", text)
print("\nMatrice de termes :")
# One row per text, one column per term; toarray() gives a plain array to print
print(X.toarray())
print("\nListe des termes :")
print(feature_names)

Textes d'origine :
-  Text mining is the process of extracting useful information from unstructured text data.
-  The preprocessing step involves cleaning and preparing the text for analysis.
-  Representation of the data is important to make it suitable for machine learning algorithms.

Textes prétraités :
-  text mining process extracting useful information unstructured text data
-  preprocessing step involves cleaning preparing text analysis
-  representation data important make suitable machine learning algorithm

Matrice de termes :
[[0 0 0 1 1 0 1 0 0 0 0 1 0 0 1 0 0 0 2 1 1]
 [0 1 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 0]
 [1 0 0 1 0 1 0 0 1 1 1 0 0 0 0 1 0 1 0 0 0]]

Liste des termes :
['algorithm' 'analysis' 'cleaning' 'data' 'extracting' 'important'
 'information' 'involves' 'learning' 'machine' 'make' 'mining' 'preparing'
 'preprocessing' 'process' 'representation' 'step' 'suitable' 'text'
 'unstructured' 'useful']
