<a href="https://colab.research.google.com/github/rouakhadhraoui/Text-Mining-Labs-/blob/main/1_Text_Pre_processing_and_Text_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch every NLTK data package this notebook relies on, in a fixed order.
# "punkt_tab" is requested in addition to "punkt" because it was reported
# as a missing resource when only "punkt" was downloaded.
for resource_name in ("punkt", "stopwords", "wordnet", "omw-1.4", "punkt_tab"):
    nltk.download(resource_name)

corpus = """Text mining seeks to extract useful information from data sources
through the identification and exploration of interesting patterns.
The data sources are document collections, and interesting patterns are found
not among formalized database records but in the unstructured textual data
in the documents in these collections."""

print("Original Corpus:\n", corpus)

# Step 1: Lowercase the whole corpus so that later comparisons
# (stopword lookup in particular) are case-insensitive.
corpus_lower = corpus.lower()
print("\nStep 1 - Lowercase:\n", corpus_lower)

# Step 2: Split into sentences, then split each sentence into word tokens.
# `words` is a list of token lists, one per sentence.
sentences = sent_tokenize(corpus_lower)
print("\nStep 2 - Tokenized sentences:\n", sentences)

words = [word_tokenize(sent) for sent in sentences]
print("\nStep 2 - Tokenized words:\n", words)

# Step 3: Remove punctuation tokens.
# `string.punctuation` is a single string, so `w in string.punctuation` is a
# substring test: it only matches tokens that happen to appear contiguously in
# that constant. Tokens made of several punctuation characters (for example
# "``", "''", "..." or "--") would slip through. Checking every character
# against a set drops any token that consists solely of punctuation.
punctuation_chars = set(string.punctuation)
words_no_punc = [
    [w for w in sent if not all(ch in punctuation_chars for ch in w)]
    for sent in words
]
print("\nStep 3 - Without punctuation:\n", words_no_punc)

# Step 4: Remove stopwords (a set gives constant-time membership tests).
stop_words = set(stopwords.words("english"))
words_no_stop = [[w for w in sent if w not in stop_words] for sent in words_no_punc]
print("\nStep 4 - Without stopwords:\n", words_no_stop)

# Step 5: Stemming vs lemmatization, applied to the same filtered tokens so
# the two outputs can be compared side by side.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

stemming = [[stemmer.stem(w) for w in sent] for sent in words_no_stop]
# NOTE(review): lemmatize() is called without a part-of-speech argument, so
# each token is looked up with the lemmatizer's default POS - confirm that
# this is the intended comparison.
lemmatization = [[lemmatizer.lemmatize(w) for w in sent] for sent in words_no_stop]

print("\nStep 5 - Stemming:\n", stemming)
print("\nStep 5 - Lemmatization:\n", lemmatization)

Original Corpus:
 Text mining seeks to extract useful information from data sources
through the identification and exploration of interesting patterns.
The data sources are document collections, and interesting patterns are found
not among formalized database records but in the unstructured textual data
in the documents in these collections.

Step 1 - Lowercase:
 text mining seeks to extract useful information from data sources
through the identification and exploration of interesting patterns.
the data sources are document collections, and interesting patterns are found
not among formalized database records but in the unstructured textual data
in the documents in these collections.

Step 2 - Tokenized sentences:
 ['text mining seeks to extract useful information from data sources\nthrough the identification and exploration of interesting patterns.', 'the data sources are document collections, and interesting patterns are found\nnot among formalized database records but in the unstruct

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import string

corpus = """Text mining seeks to extract useful information from data sources
through the identification and exploration of interesting patterns.
The data sources are document collections, and interesting patterns are found
not among formalized database records but in the unstructured textual data
in the documents in these collections."""

# The vectorizer works on a collection of documents, so the single corpus
# string is wrapped in a one-element list.
documents = [corpus]

# Learn the vocabulary; fit() hands back the fitted vectorizer itself.
vectorizer = CountVectorizer().fit(documents)

# Encode the document as a vector of per-term occurrence counts.
newvector = vectorizer.transform(documents)

# Show the encoded vector in dense form.
print(newvector.toarray())

[[1 2 2 1 2 3 1 1 1 1 1 1 1 1 1 3 1 2 1 1 1 2 1 1 2 1 1 4 1 1 1 1 1]]


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = """Text mining seeks to extract useful information from data sources
through the identification and exploration of interesting patterns.
The data sources are document collections, and interesting patterns are found
not among formalized database records but in the unstructured textual data
in the documents in these collections."""

# Build the vocabulary and encode the text in a single step.
vectorizer = CountVectorizer()
vector = vectorizer.fit_transform([corpus])

# Vocabulary terms (the unique words).
words = vectorizer.get_feature_names_out()

# Occurrence counts, collapsed from the single-row matrix to a 1-D array.
counts = vector.toarray().ravel()

# Pair every term with its number of occurrences.
word_counts = {term: occurrences for term, occurrences in zip(words, counts)}

# Report one "word: count" line per vocabulary term.
for word, count in word_counts.items():
    print(f"{word}: {count}")


among: 1
and: 2
are: 2
but: 1
collections: 2
data: 3
database: 1
document: 1
documents: 1
exploration: 1
extract: 1
formalized: 1
found: 1
from: 1
identification: 1
in: 3
information: 1
interesting: 2
mining: 1
not: 1
of: 1
patterns: 2
records: 1
seeks: 1
sources: 2
text: 1
textual: 1
the: 4
these: 1
through: 1
to: 1
unstructured: 1
useful: 1


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import string

corpus = """Text mining seeks to extract useful information from data sources
through the identification and exploration of interesting patterns.
The data sources are document collections, and interesting patterns are found
not among formalized database records but in the unstructured textual data
in the documents in these collections."""

# The vectorizer works on a collection of documents, so the single corpus
# string is wrapped in a one-element list.
documents = [corpus]

# Learn the vocabulary (and term weights) from the document collection.
vectorizer = TfidfVectorizer().fit(documents)

# Show the term -> column-index mapping that was learned.
print(vectorizer.vocabulary_)

# Encode the document with the fitted weights.
# NOTE(review): with only one document in the collection, the printed weights
# come out as a constant multiple of the raw term counts, so the IDF part
# does not discriminate between terms here - consider fitting on several
# documents (e.g. one per sentence) if that is the point of the exercise.
vector = vectorizer.transform(documents)

# Show the encoded vector in dense form.
print(vector.toarray())

{'text': 25, 'mining': 18, 'seeks': 23, 'to': 30, 'extract': 10, 'useful': 32, 'information': 16, 'from': 13, 'data': 5, 'sources': 24, 'through': 29, 'the': 27, 'identification': 14, 'and': 1, 'exploration': 9, 'of': 20, 'interesting': 17, 'patterns': 21, 'are': 2, 'document': 7, 'collections': 4, 'found': 12, 'not': 19, 'among': 0, 'formalized': 11, 'database': 6, 'records': 22, 'but': 3, 'in': 15, 'unstructured': 31, 'textual': 26, 'documents': 8, 'these': 28}
[[0.11043153 0.22086305 0.22086305 0.11043153 0.22086305 0.33129458
  0.11043153 0.11043153 0.11043153 0.11043153 0.11043153 0.11043153
  0.11043153 0.11043153 0.11043153 0.33129458 0.11043153 0.22086305
  0.11043153 0.11043153 0.11043153 0.22086305 0.11043153 0.11043153
  0.22086305 0.11043153 0.11043153 0.4417261  0.11043153 0.11043153
  0.11043153 0.11043153 0.11043153]]


In [None]:
!pip install gensim



In [None]:
from gensim.models import Word2Vec

import re

# Tokens are lowercased runs of two or more word characters - the same rule
# CountVectorizer applies by default - so the Word2Vec vocabulary lines up with
# the vocabulary of the vectorizer cells. Plain `corpus.split()` left case and
# punctuation attached, producing separate entries such as "collections.",
# "collections," and "The".
TOKEN_PATTERN = re.compile(r"\b\w\w+\b")

# Fixed seed plus a single worker thread keeps training repeatable within one
# interpreter session.
# NOTE(review): repeatability across interpreter launches may additionally
# depend on PYTHONHASHSEED - see the gensim documentation.
W2V_SEED = 42

# Define tokenized sentences as training data: one token list per sentence of
# `corpus` (split after sentence-ending punctuation), skipping empty results.
raw_sentences = re.split(r"(?<=[.!?])\s+", corpus.strip())
tokenized_sentences = [
    TOKEN_PATTERN.findall(sentence.lower()) for sentence in raw_sentences
]
tokenized_sentences = [tokens for tokens in tokenized_sentences if tokens]

# Train the word2vec model; min_count=1 keeps every token, which is needed on
# a corpus this small.
model = Word2Vec(tokenized_sentences, min_count=1, seed=W2V_SEED, workers=1)

# Summarize the trained model.
print(model)

# Summarize the vocabulary.
words = list(model.wv.index_to_key)
print(words)

# Access the word vector for the word "sources".
print(model.wv['sources'])

# Find the most similar words for the word "interesting".
print(model.wv.most_similar('interesting'))

Word2Vec<vocab=36, vector_size=100, alpha=0.025>
['in', 'the', 'data', 'sources', 'are', 'and', 'interesting', 'collections.', 'exploration', 'identification', 'through', 'from', 'information', 'useful', 'extract', 'to', 'seeks', 'mining', 'of', 'patterns.', 'these', 'The', 'document', 'collections,', 'patterns', 'found', 'not', 'among', 'formalized', 'database', 'records', 'but', 'unstructured', 'textual', 'documents', 'Text']
[-8.2059642e-03  9.2979902e-03 -1.9116134e-04 -1.9303083e-03
  4.5893337e-03 -4.1022827e-03  2.7695557e-03  6.9844709e-03
  6.0466342e-03 -7.5571863e-03  9.3764113e-03  4.6535148e-03
  3.9848424e-03 -6.2149838e-03  8.4709413e-03 -2.1671467e-03
  8.8483533e-03 -5.3912718e-03 -8.1389090e-03  6.7731421e-03
  1.6736966e-03 -2.2028650e-03  9.5166238e-03  9.4885398e-03
 -9.7884899e-03  2.5083374e-03  6.1256420e-03  3.8669235e-03
  2.0322327e-03  4.6207427e-04  7.0411951e-04 -3.8652227e-03
 -7.1346560e-03 -2.1369199e-03  3.8962394e-03  8.8493060e-03
  9.2667416e-03 -5.