<a href="https://colab.research.google.com/github/sujithrc/NLP_preprocessing/blob/main/nlp1_using_tok_sw_stem_limma.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download NLTK resources.
# nltk.word_tokenize loads 'punkt_tab' on NLTK >= 3.8.2 and 'punkt' on older
# releases, so both are requested to keep the cell working on either version.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Sample text data
text_data = [
    "Tokenization is the process of splitting text into words and punctuation.",
    "Stopwords are common words that are often filtered out in NLP tasks.",
    "Stemming reduces words to their base or root form, e.g., 'running' to 'run'.",
    "Lemmatization is similar to stemming but results in actual words, e.g., 'better' to 'good'."
]

# Tokenization: one list of word/punctuation tokens per sentence
tokenized_text = [nltk.word_tokenize(text) for text in text_data]

# Stopword removal (case-insensitive match; original token casing is kept).
# NLTK's English stop list already contains 'not', so no separate check is
# needed for it.
stop_words = set(stopwords.words('english'))
filtered_text = [
    [word for word in tokens if word.lower() not in stop_words]
    for tokens in tokenized_text
]

# Stemming (kept for comparison with lemmatization; not fed into TF-IDF)
stemmer = PorterStemmer()
stemmed_text = [[stemmer.stem(word) for word in tokens] for tokens in filtered_text]

# Lemmatization.
# WordNetLemmatizer looks words up case-sensitively, so tokens are lowercased
# first; otherwise capitalised words (e.g. at the start of a sentence) would
# pass through unreduced. TfidfVectorizer lowercases its input anyway, so this
# does not change the casing of the final features.
lemmatizer = WordNetLemmatizer()
lemmatized_text = [
    [lemmatizer.lemmatize(word.lower()) for word in tokens]
    for tokens in filtered_text
]

# Convert tokens back to strings for TF-IDF
preprocessed_text = [' '.join(tokens) for tokens in lemmatized_text]

# TF-IDF Vectorization (default settings: full vocabulary)
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_text)

# Print the TF-IDF features: vocabulary first, then one row per sentence
print(tfidf_vectorizer.get_feature_names_out())
print(tfidf_matrix.toarray())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


['actual' 'base' 'better' 'common' 'filtered' 'form' 'good'
 'lemmatization' 'nlp' 'often' 'process' 'punctuation' 'reduces' 'result'
 'root' 'run' 'running' 'similar' 'splitting' 'stemming' 'stopwords'
 'task' 'text' 'tokenization' 'word']
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.43551105 0.43551105
  0.         0.         0.         0.         0.         0.
  0.43551105 0.         0.         0.         0.43551105 0.43551105
  0.22726773]
 [0.         0.         0.         0.39928771 0.39928771 0.
  0.         0.         0.39928771 0.39928771 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.39928771 0.39928771 0.         0.
  0.20836489]
 [0.         0.38086157 0.         0.         0.         0.38086157
  0.         0.         0.         0.         0.         0.
  0.38086157 0.         0.38086157 0.38086157 0.38086157 0.
  0.         0.30027564 0.         0.         0

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download NLTK resources.
# nltk.word_tokenize loads 'punkt_tab' on NLTK >= 3.8.2 and 'punkt' on older
# releases, so both are requested to keep the cell working on either version.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Sample text data
text_data = [
    "Tokenization is the process of splitting text into words and punctuation.",
    "Stopwords are common words that are often filtered out in NLP tasks.",
    "Stemming reduces words to their base or root form, e.g., 'running' to 'run'.",
    "Lemmatization is similar to stemming but results in actual words, e.g., 'better' to 'good'."
]

# Tokenization: one list of word/punctuation tokens per sentence
tokenized_text = [nltk.word_tokenize(text) for text in text_data]

# Stopword removal (case-insensitive match; original token casing is kept).
# NLTK's English stop list already contains 'not', so no separate check is
# needed for it.
stop_words = set(stopwords.words('english'))
filtered_text = [
    [word for word in tokens if word.lower() not in stop_words]
    for tokens in tokenized_text
]

# Stemming (kept for comparison with lemmatization; not fed into TF-IDF)
stemmer = PorterStemmer()
stemmed_text = [[stemmer.stem(word) for word in tokens] for tokens in filtered_text]

# Lemmatization.
# WordNetLemmatizer looks words up case-sensitively, so tokens are lowercased
# first; otherwise capitalised words (e.g. at the start of a sentence) would
# pass through unreduced. TfidfVectorizer lowercases its input anyway, so this
# does not change the casing of the final features.
lemmatizer = WordNetLemmatizer()
lemmatized_text = [
    [lemmatizer.lemmatize(word.lower()) for word in tokens]
    for tokens in filtered_text
]

# Convert tokens back to strings for TF-IDF
preprocessed_text = [' '.join(tokens) for tokens in lemmatized_text]

# TF-IDF Vectorization with max_features=4 (vocabulary capped at four terms)
tfidf_vectorizer = TfidfVectorizer(max_features=4)
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_text)

# Print the TF-IDF features: vocabulary first, then one row per sentence
print(tfidf_vectorizer.get_feature_names_out())
print(tfidf_matrix.toarray())


['stemming' 'task' 'text' 'word']
[[0.         0.         0.88654763 0.46263733]
 [0.         0.88654763 0.         0.46263733]
 [0.83388421 0.         0.         0.55193942]
 [0.83388421 0.         0.         0.55193942]]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
