NLP Data Pre-processing using techniques like **tokenization**,**lemmatization** and **other techniques**.


In [14]:
import nltk
import os
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer,PorterStemmer
import spacy

# Setting the NLTK_DATA environment variable
os.environ['NLTK_DATA'] = '/root/nltk_data'

# Downloading the 'punkt' and 'punkt_tab' datasets to the specified directory
nltk.download('punkt', download_dir='/root/nltk_data/')
nltk.download('punkt_tab', download_dir='/root/nltk_data/')
nltk.download('stopwords',download_dir='/root/nltk_data/')
nltk.download('wordnet',download_dir='/root/nltk_data/')
nltk.download('averaged_perceptron_tagger_eng',download_dir='/root/nltk_data/')

# loading spacy model
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to /root/nltk_data/...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data/...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data/...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data/...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data/...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [15]:
# sample text
text = """ Natural language processing (NLP) is a subfield of computer science and especially artificial intelligence. It is primarily concerned with providing computers with the ability to process data encoded in natural language and is thus closely related to information retrieval, knowledge representation and computational linguistics, a subfield of linguistics. Major tasks in natural language processing are speech recognition, text classification, natural-language understanding, and natural-language generation. Natural language processing has its roots in the 1950s.[1] Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, though at the time that was not articulated as a problem separate from artificial intelligence. The proposed test includes a task that involves the automated interpretation and generation of natural language."""

In [16]:
# pre-processing steps

text = text.lower()

# removing special characters
text = re.sub(r'[^a-zA-Z\s]','',text)

# tokenization
tokens = word_tokenize(text)

# removing stopwords
stop_words = set(stopwords.words('english'))
tokens = [token for token in tokens if token not in stop_words]
print("Filtered tokens:",tokens)

# stemming using PorterStemmer
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in tokens]
print("Stemmed tokens:",stemmed_tokens)

# lemmatization using WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
print("Lemmatized tokens:",lemmatized_tokens)

# POS Tagging
pos_tags = nltk.pos_tag(tokens)
print("POS tags:",pos_tags)

# alternative pipeline using spacy library
doc = nlp(text)
for token in doc:
    print(f"{token.text} - POS: {token.pos_} | Lemma: {token.lemma_}")

Filtered tokens: ['natural', 'language', 'processing', 'nlp', 'subfield', 'computer', 'science', 'especially', 'artificial', 'intelligence', 'primarily', 'concerned', 'providing', 'computers', 'ability', 'process', 'data', 'encoded', 'natural', 'language', 'thus', 'closely', 'related', 'information', 'retrieval', 'knowledge', 'representation', 'computational', 'linguistics', 'subfield', 'linguistics', 'major', 'tasks', 'natural', 'language', 'processing', 'speech', 'recognition', 'text', 'classification', 'naturallanguage', 'understanding', 'naturallanguage', 'generation', 'natural', 'language', 'processing', 'roots', 'already', 'alan', 'turing', 'published', 'article', 'titled', 'computing', 'machinery', 'intelligence', 'proposed', 'called', 'turing', 'test', 'criterion', 'intelligence', 'though', 'time', 'articulated', 'problem', 'separate', 'artificial', 'intelligence', 'proposed', 'test', 'includes', 'task', 'involves', 'automated', 'interpretation', 'generation', 'natural', 'langu