<a href="https://colab.research.google.com/github/shambhavithakur/nlp-text-preprocessing/blob/main/nlp-text-preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing Techniques
## Preprocessing text






In [32]:
!pip install autocorrect



In [33]:
from autocorrect import Speller
from nltk import word_tokenize, download

download(['punkt', 'averaged_perceptron_tagger'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [34]:
import re

In [35]:
# Getting text from file

with open('/content/file.txt') as f:
    sentence_list = [line.strip() for line in f.readlines()]
    sentences = ' '.join(sentence_list)
    sentences = re.sub(r'\s*[.]\s*', '. ', re.sub(r'\s*[,]\s*',\
                            ', ', sentences)).strip()    

In [36]:
# Converting text to tokens

tokens = word_tokenize(sentences)

In [37]:
# Printing first 20 tokens

tokens[:20]

['The',
 'reader',
 'of',
 'this',
 'course',
 'should',
 'have',
 'a',
 'basic',
 'knowledge',
 'of',
 'the',
 'Python',
 'programming',
 'lenguage',
 '.',
 'He/she',
 'must',
 'have',
 'knowldge']

In [38]:
# Preparing to autocorrect tokens
# Creating a spelling-corrector instance

spell = Speller(lang='en')

In [39]:
# Autocorrecting the tokens

tokens_corrected = [spell(word) for word in tokens]

In [40]:
# Printing the first 20 corrected tokens

tokens_corrected[:20]

['The',
 'reader',
 'of',
 'this',
 'course',
 'should',
 'have',
 'a',
 'basic',
 'knowledge',
 'of',
 'the',
 'Python',
 'programming',
 'language',
 '.',
 'He/she',
 'must',
 'have',
 'knowledge']

In [41]:
# Combining the tokens

sentences_corrected = ' '.join(tokens_corrected)

In [42]:
sentences_corrected = re.sub(r'\s*[.]\s*', '. ',re.sub(r'\s*[,]\s*',\
                            ', ', sentences_corrected)).strip()

In [43]:
sentences_corrected

'The reader of this course should have a basic knowledge of the Python programming language. He/she must have knowledge of data types in Python. He should be able to write functions, and also have the ability to import and use libraries and packages in Python. Familiarity with basic linguistics and probability is assumed although not required to fully complete this course.'

In [44]:
# Applying parts-of-speech (PoS) tags to the corrected tokens

from nltk import pos_tag

In [45]:
pos_tagged = pos_tag(tokens_corrected)

In [46]:
# Printing first 5 tokens tagges with PoS

pos_tagged[:5]

[('The', 'DT'),
 ('reader', 'NN'),
 ('of', 'IN'),
 ('this', 'DT'),
 ('course', 'NN')]

In [47]:
# Preparing to remove stopwords 
# Importing relevant module

download('stopwords')

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [48]:
# Saving stopwords in a variable

stop_words = stopwords.words('english')

In [49]:
# Tokens without stopwords

tokens_no_stopwords = [word for word in tokens_corrected if word not in stop_words]

In [50]:
# Printing the first 20 tokens with stopwords removed

tokens_no_stopwords[:20]

['The',
 'reader',
 'course',
 'basic',
 'knowledge',
 'Python',
 'programming',
 'language',
 '.',
 'He/she',
 'must',
 'knowledge',
 'data',
 'types',
 'Python',
 '.',
 'He',
 'able',
 'write',
 'functions']

In [51]:
# Stemming or converting words into their base versions

from nltk import stem

stemmer = stem.SnowballStemmer('english')

tokens_stemmed = [stemmer.stem(word) for word in tokens_no_stopwords]

In [52]:
tokens_stemmed[:20]

['the',
 'reader',
 'cours',
 'basic',
 'knowledg',
 'python',
 'program',
 'languag',
 '.',
 'he/sh',
 'must',
 'knowledg',
 'data',
 'type',
 'python',
 '.',
 'he',
 'abl',
 'write',
 'function']

In [53]:
# Preparing to lemmatize the tokens

download('wordnet')

from nltk.stem.wordnet import WordNetLemmatizer


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [54]:
# Creating a lemmatizer instance

lemmatizer = WordNetLemmatizer()

In [55]:
# Lemmatizing the tokens

tokens_lemmatized = [lemmatizer.lemmatize(word) for word in tokens_no_stopwords]

In [56]:
# Printing the first 20 lemmatized tokens

tokens_lemmatized[:20]

['The',
 'reader',
 'course',
 'basic',
 'knowledge',
 'Python',
 'programming',
 'language',
 '.',
 'He/she',
 'must',
 'knowledge',
 'data',
 'type',
 'Python',
 '.',
 'He',
 'able',
 'write',
 'function']

In [57]:
# Combining the lemmatized tokens into a text chunk

sentences_lemmatized = ' '.join(tokens_lemmatized)
sentences_lemmatized = re.sub(r'\s*[.]\s*', '. ',re.sub(r'\s*[,]\s*',\
                            ', ', sentences_lemmatized)).strip()

sentences_lemmatized

'The reader course basic knowledge Python programming language. He/she must knowledge data type Python. He able write function, also ability import use library package Python. Familiarity basic linguistics probability assumed although required fully complete course.'

In [58]:
# Detecting sentence boundaries

from nltk.tokenize import sent_tokenize

In [59]:
sentence_boundaries = sent_tokenize(sentences_corrected)

In [60]:
len(sentence_boundaries)

4

In [61]:
sentence_boundaries

['The reader of this course should have a basic knowledge of the Python programming language.',
 'He/she must have knowledge of data types in Python.',
 'He should be able to write functions, and also have the ability to import and use libraries and packages in Python.',
 'Familiarity with basic linguistics and probability is assumed although not required to fully complete this course.']