In [21]:
import nltk
# nltk.download('stopwords')
# nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/muhammadsalmanmalik/nltk_data...


True

1. Lowercasing
Purpose: Ensures uniformity in text by converting all characters to lowercase.

In [12]:
text = "This is an Example."
lower_text = text.lower()
print(lower_text)  # Output: "this is an example."


this is an example.


2. Removing Punctuation
Purpose: Eliminates punctuation marks, as they often don’t carry meaning in text analysis.

In [2]:
import string
text = "Hello, world! Let's preprocess text."
no_punct = text.translate(str.maketrans('', '', string.punctuation))
print(no_punct)  # Output: "Hello world Lets preprocess text"


Hello world Lets preprocess text


3. Removing Numbers
Purpose: Removes numbers when they aren’t relevant to the analysis.

In [3]:
import re
text = "There are 123 apples."
no_numbers = re.sub(r'\d+', '', text)
print(no_numbers)  # Output: "There are  apples."


There are  apples.


4. Tokenization
Purpose: Splits text into individual tokens (words or sentences).

In [16]:
from nltk.tokenize import word_tokenize
text = "Tokenize this sentence."
tokens = word_tokenize(text)
print(tokens)  # Output: ['Tokenize', 'this', 'sentence', '.']


['Tokenize', 'this', 'sentence', '.']


5. Stopword Removal
Purpose: Eliminates common words like "the," "and," etc.

In [18]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
text = "This is an example of stopword removal."
filtered_words = [word for word in text.split() if word.lower() not in stop_words]
print(filtered_words)  # Output: ['example', 'stopword', 'removal']


['example', 'stopword', 'removal.']


6. Stemming
Purpose: Reduces words to their stems by chopping off suffixes.

In [19]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
word = "running"
stemmed_word = stemmer.stem(word)
print(stemmed_word)  # Output: "run"


run


7. Lemmatization
Purpose: Converts words to their dictionary forms (more linguistically correct than stemming).

In [22]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
word = "running"
lemma = lemmatizer.lemmatize(word, pos='v')
print(lemma)  # Output: "run"


run


8. Removing Whitespace
Purpose: Strips excessive spaces or newline characters.

In [23]:
text = "  Excessive   whitespace. \n"
clean_text = " ".join(text.split())
print(clean_text)  # Output: "Excessive whitespace."


Excessive whitespace.


9. Removing URLs
Purpose: Deletes hyperlinks from text.

In [24]:
import re
text = "Visit https://example.com for details."
no_urls = re.sub(r'http\S+', '', text)
print(no_urls)  # Output: "Visit for details."


Visit  for details.


10. Removing Special Characters
Purpose: Cleans non-alphanumeric characters.

In [11]:
import re
text = "Text #processing is @important!"
clean_text = re.sub(r'[^A-Za-z0-9\s]', '', text)
print(clean_text)  # Output: "Text processing is important"


Text processing is important


11. Spelling Correction
Purpose: Corrects misspelled words.

In [29]:
from textblob import TextBlob
text = "Spelling errurs are common."
corrected_text = str(TextBlob(text).correct())
print(corrected_text)  # Output: "Spelling errors are common."


Spelling errors are common.


12. Word Normalization
Purpose: Maps variations of words to a standard form.

In [30]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
text = ["is", "are", "am"]
normalized = [lemmatizer.lemmatize(word, pos='v') for word in text]
print(normalized)  # Output: ['be', 'be', 'be']


['be', 'be', 'be']


13. Vectorization
Purpose: Converts text into numerical representations for machine learning models.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
texts = ["This is a sample.", "Text preprocessing is important."]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)
print(X.toarray())  # Output: Word count matrix


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
texts = ["This is a sample.", "Text preprocessing is important."]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)
print(X.toarray())  # Output: TF-IDF matrix
