## Understand and perform basic text preprocessing techniques and extract meaningful information from text


## Exploring Tokenization

In [None]:
## Splitting sentences
import nltk
data = "Virat Kohli is great #cricketer"
data.split()

['Virat', 'Kohli', 'is', 'great', '#cricketer']

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
## Splitting sentences in the paragraph
from nltk.tokenize import sent_tokenize

text = "Hello everyone. I’m student of guni, I like to guni campus"
sent_tokenize(text)

['Hello everyone.', 'I’m student of guni, I like to guni campus']

In [None]:
## Splitting words in a sentence
from nltk.tokenize import word_tokenize

text = "I’m student of guni."
word_tokenize(text)

['I', '’', 'm', 'student', 'of', 'guni', '.']

## Regular-Expression Tokenizers

In [None]:
from nltk.tokenize import RegexpTokenizer

In [None]:
s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
tokenizer.tokenize(s)

['Good',
 'muffins',
 'cost',
 '$3.88',
 'in',
 'New',
 'York',
 '.',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them',
 '.',
 'Thanks',
 '.']

In [None]:
from nltk.tokenize import RegexpTokenizer
s = "We'll - A Rolex watch costs in the range of $3000.0 - $8000.0 in USA."
# Raw string so that \w, \$, \d and \S reach the regex engine untouched; in a plain
# string literal they are invalid escape sequences and trigger a warning on modern Python.
# Alternatives, tried in order: a run of word characters | a dollar amount | any other
# run of non-whitespace characters.
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
tokenizer.tokenize(s)

['We',
 "'ll",
 '-',
 'A',
 'Rolex',
 'watch',
 'costs',
 'in',
 'the',
 'range',
 'of',
 '$3000.0',
 '-',
 '$8000.0',
 'in',
 'USA',
 '.']

## Blank line tokenizer

In [None]:
from nltk.tokenize import BlanklineTokenizer
a = "My name is Rajan\n\nI am 6th sem student of U.V Patel college"
BlanklineTokenizer().tokenize(a)

['My name is Rajan', 'I am 6th sem student of U.V Patel college']

## Word punctuation tokenization

In [None]:
from nltk.tokenize import wordpunct_tokenize
a2 = "I have Iphone 14 pro max cost of ₹1,35,490."
wordpunct_tokenize(a2)

['I',
 'have',
 'Iphone',
 '14',
 'pro',
 'max',
 'cost',
 'of',
 '₹',
 '1',
 ',',
 '35',
 ',',
 '490',
 '.']

## TreebankWordTokenizer

In [None]:
from nltk.tokenize import TreebankWordTokenizer
a3 = "I does't want to go USA."
TreebankWordTokenizer().tokenize(a3)

['I', "does't", 'want', 'to', 'go', 'USA', '.']

## Tweet Tokenizer

In [None]:
from nltk.tokenize import TweetTokenizer
twt = "Hi:), {Good Afternoon}"
TweetTokenizer().tokenize(twt)

['Hi', ':)', ',', '{', 'Good', 'Afternoon', '}']

In [None]:
a4 = "I does't want to go USA:)."
print(tokenizer.tokenize(a4))
print(wordpunct_tokenize(a4))
print(TreebankWordTokenizer().tokenize(a4))
print(TweetTokenizer().tokenize(a4))

['I', 'does', "'t", 'want', 'to', 'go', 'USA', ':).']
['I', 'does', "'", 't', 'want', 'to', 'go', 'USA', ':).']
['I', "does't", 'want', 'to', 'go', 'USA', ':', ')', '.']
['I', "does't", 'want', 'to', 'go', 'USA', ':)', '.']


# Porter Stemmer


In [None]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
plurals = ['caresses', 'flies', 'dies', 'mules', 'died', 'agreed', 'owned', 'humbled', 'sized', 'meeting', 'stating',
           'siezing', 'itemization', 'traditional', 'reference', 'colonizer', 'plotted', 'having', 'generously']
singles = [stemmer.stem(plural) for plural in plurals]
print(' '.join(singles))

caress fli die mule die agre own humbl size meet state siez item tradit refer colon plot have gener


# Snowball Stemmer

In [None]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
plurals = ['caresses', 'flies', 'dies', 'mules', 'died', 'agreed', 'owned', 'humbled', 'sized', 'meeting', 'stating',
           'siezing', 'itemization', 'traditional', 'reference', 'colonizer', 'plotted', 'having', 'generously']
singles = [stemmer.stem(plural) for plural in plurals]
print(' '.join(singles))

caress fli die mule die agre own humbl size meet state siez item tradit refer colon plot have generous


In [None]:
from nltk.stem import SnowballStemmer
print(SnowballStemmer.languages)

('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')


In [None]:
from nltk.stem.snowball import SnowballStemmer
print(SnowballStemmer.languages)

('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')


In [None]:
stemmer2 = SnowballStemmer(language='english')
singles = [stemmer2.stem(plural) for plural in plurals]
print(' '.join(singles))

caress fli die mule die agre own humbl size meet state siez item tradit refer colon plot have generous


## WordNet


In [None]:
nltk.download("wordnet")
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Demonstrate WordNet lemmatization on a whitespace-tokenized sentence.
lemmatizer = WordNetLemmatizer()
s = "We are putting in efforts to enhance our understanding of Lemmatization"
# Plain whitespace split: any punctuation would stay attached to its word.
token_list = s.split()
print("The tokens are: ", token_list)
# lemmatize() is called without a POS tag, so every token is looked up as a noun
# (NLTK's default). That is why the plural noun 'efforts' is reduced to 'effort'
# while verb forms such as 'are' and 'putting' come back unchanged.
lemmatized_output = ' '.join([lemmatizer.lemmatize(token) for token in token_list])
print("The lemmatized output is: ", lemmatized_output)

The tokens are:  ['We', 'are', 'putting', 'in', 'efforts', 'to', 'enhance', 'our', 'understanding', 'of', 'Lemmatization']
The lemmatized output is:  We are putting in effort to enhance our understanding of Lemmatization


## stopwords

In [None]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
", ".join(stop)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


"an, nor, we, by, all, should've, his, didn, it, above, few, weren't, shouldn't, each, further, you, my, been, those, do, or, will, just, there, her, when, to, for, our, of, are, needn't, so, y, about, while, won't, more, yours, herself, and, don't, ve, in, doing, it's, at, once, not, d, mightn, from, t, you'd, before, most, weren, mightn't, ll, ma, how, am, very, any, itself, can, having, both, only, ain, with, couldn, but, between, should, which, needn, couldn't, they, yourselves, did, haven, same, does, who, were, here, where, during, why, into, don, no, through, she's, themselves, isn, hers, a, being, on, didn't, ourselves, you've, you're, doesn't, shouldn, be, has, theirs, i, hadn't, than, the, over, under, shan, up, shan't, she, hadn, myself, you'll, m, is, own, such, your, that'll, wouldn, if, s, their, out, then, now, below, some, wouldn't, hasn, he, wasn't, have, haven't, o, aren't, him, hasn't, them, what, these, after, as, aren, that, whom, against, other, wasn, won, me, was

In [None]:
nltk.download('stopwords')
from nltk.corpus import stopwords
a=nltk.corpus.stopwords.words('english')
print(set(a))

{'an', 'nor', 'we', 'by', 'all', "should've", 'his', 'didn', 'it', 'above', 'few', "weren't", "shouldn't", 'each', 'further', 'you', 'my', 'been', 'those', 'do', 'or', 'will', 'just', 'there', 'her', 'when', 'to', 'for', 'our', 'of', 'are', "needn't", 'so', 'y', 'about', 'while', "won't", 'more', 'yours', 'herself', 'and', "don't", 've', 'in', 'doing', "it's", 'at', 'once', 'not', 'd', 'mightn', 'from', 't', "you'd", 'before', 'most', 'weren', "mightn't", 'll', 'ma', 'how', 'am', 'very', 'any', 'itself', 'can', 'having', 'both', 'only', 'ain', 'with', 'couldn', 'but', 'between', 'should', 'which', 'needn', "couldn't", 'they', 'yourselves', 'did', 'haven', 'same', 'does', 'who', 'were', 'here', 'where', 'during', 'why', 'into', 'don', 'no', 'through', "she's", 'themselves', 'isn', 'hers', 'a', 'being', 'on', "didn't", 'ourselves', "you've", "you're", "doesn't", 'shouldn', 'be', 'has', 'theirs', 'i', "hadn't", 'than', 'the', 'over', 'under', 'shan', 'up', "shan't", 'she', 'hadn', 'myself

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom', 'not']
a1=[i for i in a if i not in wh_words]
print(a1)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should

In [None]:
sentence = "how are we putting in efforts to enhance our understanding of Lemmatization"
# List the stop words that do NOT occur as a word in the sentence.
# Membership is tested against the set of whole tokens: `i not in sentence` on a str is a
# substring test, which wrongly treated stop words such as 'a', 'an', 'and', 'i', 'on' and
# 'under' as present because they occur inside "are", "enhance", "understanding", "in", ...
sentence_tokens = set(sentence.split())
a2 = [i for i in a if i not in sentence_tokens]
print(a2)

['me', 'my', 'myself', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'the', 'but', 'if', 'because', 'as', 'until', 'while', 'by', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'from', 'up', 'down', 'out', 'off', 'over', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'll', 've', 'y', 'a

In [None]:
wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
stop = set(stopwords.words('english'))
sentence = "how are we putting in efforts to enhance our understanding of Lemmatization"
for word in wh_words:
    stop.remove(word)
sentence_after_stopword_removal = [token for token in sentence.split() if token not in stop]
" ".join(sentence_after_stopword_removal)

'how putting efforts enhance understanding Lemmatization'

## N-grams

In [None]:
from nltk.util import ngrams
text = "Natural languages are different from computer programming languages"
tokens = text.split()
bigrams = list(ngrams(tokens, 2))
[" ".join(token) for token in bigrams]

['Natural languages',
 'languages are',
 'are different',
 'different from',
 'from computer',
 'computer programming',
 'programming languages']

In [None]:
text = "Natural languages are different from computer programming languages"
tokens = text.split()
trigrams = list(ngrams(tokens, 3))
[" ".join(token) for token in trigrams]

['Natural languages are',
 'languages are different',
 'are different from',
 'different from computer',
 'from computer programming',
 'computer programming languages']

## **zomato_reviews**

In [None]:
import pandas as pd

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
df = pd.read_csv("/content/zomato_reviews.csv")
df.head(3)

Unnamed: 0,Review,sentiment
0,Virat Kohli did a great thing to open his rest...,positive
1,This place have some really heathy options to ...,positive
2,Aerocity is the most finest place in Delhi for...,positive


In [None]:
corpus = pd.Series(df.Review.tolist()).astype(str)

In [None]:
corpus

0       Virat Kohli did a great thing to open his rest...
1       This place have some really heathy options to ...
2       Aerocity is the most finest place in Delhi for...
3       Yesterday evening there was small team lunch ,...
4       I find aerocity to be the best place in delhi ...
                              ...                        
1591    || DESI LANE || So we were at alipore's most h...
1592    "Desi Lane" is one of the most trending place ...
1593    One of the cool and pocket pinch restaurant at...
1594    "DESI LANE" one of the best places in town and...
1595    Looking for good place for lunch but dont wann...
Length: 1596, dtype: object

### Text Cleaning (Removal of special characters/punctuations & case folding)

In [None]:
def text_clean(corpus, keep_list):
    '''
    Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. removed).
              Every whitespace-delimited word that is not in 'keep_list' has each non-alphanumeric
              character replaced by a single space and is lower-cased; words found in 'keep_list'
              are passed through untouched (case and punctuation preserved).

    Input : Takes a text corpus, 'corpus' (an iterable of strings, one document per item), to be cleaned
            along with a list of words, 'keep_list', which have to be retained even after the
            cleaning process

    Output : Returns the cleaned text corpus as a pandas Series of object dtype with a default
             0..n-1 index, one cleaned string per input document, in input order

    '''
    non_alnum = re.compile('[^a-zA-Z0-9]')  # compiled once instead of once per word
    cleaned_rows = []
    for row in corpus:
        qs = []
        for word in row.split():
            if word in keep_list:
                qs.append(word)
            else:
                # Replacement is a space (not ''), so "it's" becomes "it s"
                qs.append(non_alnum.sub(' ', word).lower())
        cleaned_rows.append(' '.join(qs))
    # Build the Series once from a list: Series.append was removed in pandas 2.0, and
    # appending row by row re-copied the whole Series on every iteration.
    return pd.Series(cleaned_rows, dtype=object)

In [None]:
def stopwords_removal(corpus):
    '''
    Purpose : Function to split every document on whitespace and drop English stop words,
              while keeping the interrogative words (who, what, when, ...)

    Input : Takes a text corpus, 'corpus', an iterable of strings

    Output : Returns a list holding, for each document, the list of tokens that are not stop words

    '''
    wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
    stop = set(stopwords.words('english'))
    # Take the interrogatives out of the stop list so they survive the filtering below
    for wh in wh_words:
        stop.remove(wh)
    filtered_corpus = []
    for document in corpus:
        kept_tokens = [token for token in document.split() if token not in stop]
        filtered_corpus.append(kept_tokens)
    return filtered_corpus

In [None]:
def lemmatize(corpus):
    '''
    Purpose : Function to lemmatize every token of every document with the WordNet lemmatizer
              (no part-of-speech tag is passed to the lemmatizer)

    Input : Takes 'corpus', an iterable whose items are themselves iterables of tokens

    Output : Returns a list of lists with the lemmatized tokens, in the same order as the input

    '''
    lemmatizer = WordNetLemmatizer()
    lemmatized_corpus = []
    for tokens in corpus:
        lemmatized_corpus.append(list(map(lemmatizer.lemmatize, tokens)))
    return lemmatized_corpus

In [None]:
df = pd.read_csv("/content/zomato_reviews.csv")
df.head(3)

Unnamed: 0,Review,sentiment
0,Virat Kohli did a great thing to open his rest...,positive
1,This place have some really heathy options to ...,positive
2,Aerocity is the most finest place in Delhi for...,positive


In [None]:
# NOTE(review): this cell used to raise NameError because neither `preprocess` nor
# `common_dot_words` is defined anywhere in the notebook, and it was fed `a1[0]` (a single
# stop word) instead of the reviews. The definitions below are a reconstruction built from
# the notebook's own helpers -- confirm they match the intended pipeline.

# Words whose dots/case must survive cleaning
common_dot_words = ['U.S.A', 'Mr.', 'Mrs.', 'D.C.']


def preprocess(corpus, keep_list, cleaning=True, stemming=False, stem_type=None,
               lemmatization=False, remove_stopwords=True):
    '''
    Purpose : Function to run the text-preprocessing pipeline (cleaning, stop-word removal,
              lemmatization, stemming) over a corpus; each stage is switched by a flag

    Input : 'corpus' - iterable of strings, one document per item
            'keep_list' - words handed to text_clean that must be retained as-is
            'cleaning' - run text_clean first when True
            'stemming' - stem every token when True; 'stem_type' == 'snowball' selects the
                         English Snowball stemmer, anything else the Porter stemmer
            'lemmatization' - run lemmatize on the tokens when True
            'remove_stopwords' - run stopwords_removal when True, otherwise only split on whitespace

    Output : Returns a list of strings, one preprocessed document per input document

    '''
    if cleaning:
        corpus = text_clean(corpus, keep_list)
    if remove_stopwords:
        tokenized = stopwords_removal(corpus)
    else:
        tokenized = [row.split() for row in corpus]
    if lemmatization:
        tokenized = lemmatize(tokenized)
    if stemming:
        stemmer = SnowballStemmer(language='english') if stem_type == 'snowball' else PorterStemmer()
        tokenized = [[stemmer.stem(token) for token in row] for row in tokenized]
    return [' '.join(row) for row in tokenized]


corpus_with_lemmatization = preprocess(corpus, keep_list = common_dot_words, stemming = False, stem_type = None, lemmatization = True, remove_stopwords = False)
# Show a small sample only; printing every review would flood the notebook output
print(corpus_with_lemmatization[:3])

NameError: ignored

In [None]:
print("Original string: ", corpus[2])

Original string:  Aerocity is the most finest place in Delhi for luxury dining as the place is too clean and it's a place with good view. I went to this restaurant Yesterday and it was all over a great Experience over there. We had Veg Pizza Mocktails Shakes Andhra paneer Mock meat platter Chocolate brownie Cheese cake The taste was very delicious and good the food presentation was also appreciable and the staff also very cooperative and helpful. And the outdoor sitting space is too good as it gives an amazing view.
