### Function for Text Preprocessing

In [39]:
import re
from bs4 import BeautifulSoup

def preprocess(q):
    q = str(q).lower().strip()

    # Replace certain special characters with their string equivalents
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')

    # The pattern '[math]' appears around 900 times in the whole dataset.
    # q = q.replace('[math]', '')

    # Replacing some numbers with string equivalents (not perfect, can be done better to account for more cases)
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)

    # Decontracting words
    # https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
    # https://stackoverflow.com/a/19794953
    contractions = {
        "ain't": "am not",
        "aren't": "are not",
        "can't": "can not",
        "can't've": "can not have",
        "'cause": "because",
        "could've": "could have",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hadn't've": "had not have",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'd've": "he would have",
        "he'll": "he will",
        "he'll've": "he will have",
        "he's": "he is",
        "how'd": "how did",
        "how'd'y": "how do you",
        "how'll": "how will",
        "how's": "how is",
        "i'd": "i would",
        "i'd've": "i would have",
        "i'll": "i will",
        "i'll've": "i will have",
        "i'm": "i am",
        "i've": "i have",
        "isn't": "is not",
        "it'd": "it would",
        "it'd've": "it would have",
        "it'll": "it will",
        "it'll've": "it will have",
        "it's": "it is",
        "let's": "let us",
        "ma'am": "madam",
        "mayn't": "may not",
        "might've": "might have",
        "mightn't": "might not",
        "mightn't've": "might not have",
        "must've": "must have",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "needn't": "need not",
        "needn't've": "need not have",
        "o'clock": "of the clock",
        "oughtn't": "ought not",
        "oughtn't've": "ought not have",
        "shan't": "shall not",
        "sha'n't": "shall not",
        "shan't've": "shall not have",
        "she'd": "she would",
        "she'd've": "she would have",
        "she'll": "she will",
        "she'll've": "she will have",
        "she's": "she is",
        "should've": "should have",
        "shouldn't": "should not",
        "shouldn't've": "should not have",
        "so've": "so have",
        "so's": "so as",
        "that'd": "that would",
        "that'd've": "that would have",
        "that's": "that is",
        "there'd": "there would",
        "there'd've": "there would have",
        "there's": "there is",
        "they'd": "they would",
        "they'd've": "they would have",
        "they'll": "they will",
        "they'll've": "they will have",
        "they're": "they are",
        "they've": "they have",
        "to've": "to have",
        "wasn't": "was not",
        "we'd": "we would",
        "we'd've": "we would have",
        "we'll": "we will",
        "we'll've": "we will have",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "what'll": "what will",
        "what'll've": "what will have",
        "what're": "what are",
        "what's": "what is",
        "what've": "what have",
        "when's": "when is",
        "when've": "when have",
        "where'd": "where did",
        "where's": "where is",
        "where've": "where have",
        "who'll": "who will",
        "who'll've": "who will have",
        "who's": "who is",
        "who've": "who have",
        "why's": "why is",
        "why've": "why have",
        "will've": "will have",
        "won't": "will not",
        "won't've": "will not have",
        "would've": "would have",
        "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'all": "you all",
        "y'all'd": "you all would",
        "y'all'd've": "you all would have",
        "y'all're": "you all are",
        "y'all've": "you all have",
        "you'd": "you would",
        "you'd've": "you would have",
        "you'll": "you will",
        "you'll've": "you will have",
        "you're": "you are",
        "you've": "you have"
    }

    q_decontracted = []

    for word in q.split():
        if word in contractions:
            word = contractions[word]

        q_decontracted.append(word)

    q = ' '.join(q_decontracted)
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")

    # Removing HTML tags
    q = BeautifulSoup(q)
    q = q.get_text()

    # Remove punctuations
    pattern = re.compile('\W')
    q = re.sub(pattern, ' ', q).strip()

    return q

In [41]:
import pandas as pd

df = pd.read_csv("C:\\Users\\shubh\\Desktop\\Important_IPYNB_files\\NLP\\Other_files\\IMDB_Dataset.csv")
print(df.shape)
df.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### 1. Lowercasing

In [42]:
# On a single row

df['review'][3].lower()

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [43]:
# On entire column

df['review'] = df['review'].str.lower()
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### 2. Remove HTML Tags

In [44]:
import re

def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

df['review'] = df['review'].apply(remove_html_tags)
df['review'][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

### 3. Remove URLs

In [45]:
import re

def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

df['review'] = df['review'].apply(remove_url)

### 4. Remove punctuations

In [46]:
import string

exclude = string.punctuation

def remove_punctuations(text):
    return " ".join(text.translate(str.maketrans('', '', exclude)).split())

df['review'] = df['review'].apply(remove_punctuations)

In [36]:
t = '# this $ is my ^ name %% and this !! & ) is my hobby'

print(remove_punctuations(t))

this is my name and this is my hobby


### 5. Chat word treatment

In [47]:
chat_words = {
'AFAIK': 'As Far As I Know',
'AFK': 'Away From Keyboard',
'ASAP': 'As Soon As Possible',
'ATK': 'At The Keyboard',
'ATM': 'At The Moment',
'A3': 'Anytime, Anywhere, Anyplace',
'BAK': 'Back At Keyboard',
'BBL': 'Be Back Later',
'BBS': 'Be Back Soon',
'BFN': 'Bye For Now',
'B4N': 'Bye For Now',
'BRB': 'Be Right Back',
'BRT': 'Be Right There',
'BTW': 'By The Way',
'B4': 'Before',
'B4N': 'Bye For Now',
'CU': 'See You',
'CUL8R': 'See You Later',
'CYA': 'See You',
'FAQ': 'Frequently Asked Questions',
'FC': 'Fingers Crossed',
'FWIW': 'For What It Is Worth',
'FYI': 'For Your Information',
'GAL': 'Get A Life',
'GG': 'Good Game',
'GN': 'Good Night',
'GMTA': 'Great Minds Think Alike',
'GR8': 'Great',
'G9': 'Genius',
'IC': 'I See',
'ICQ': 'I Seek you',
'ILU': 'I Love You',
'IMHO': 'In My Humble Opinion',
'IMO': 'In My Opinion',
'IOW': 'In Other Words',
'IRL': 'In Real Life',
'KISS': 'Keep It Simple, Stupid',
'LDR': 'Long Distance Relationship',
'LMAO': 'Laugh My Ass Off',
'LOL': 'Laughing Out Loud',
'LTNS': 'Long Time No See',
'L8R': 'Later',
'MTE': 'My Thoughts Exactly',
'M8': 'Mate',
'NRN': 'No Reply Necessary',
'OIC': 'Oh I See',
'PITA': 'Pain In The Ass',
'PRT': 'Party',
'PRW': 'Parents Are Watching',
'ROFL': 'Rolling On The Floor Laughing',
'ROFLOL': 'Rolling On The Floor Laughing Out Loud',
'ROTFLMAO': 'Rolling On The Floor Laughing My Ass Off',
'SK8': 'Skate',
'ASL': 'Age, Sex, Location',
'THX': 'Thank You',
'TTFN': 'Ta Ta For Now',
'TTYL': 'Talk To You Later',
'U': 'You',
'U2': 'You Too',
'U4E': 'Yours For Ever',
'WB': 'Welcome Back',
'WTF': 'What The Fuck',
'WTG': 'Way To Go',
'WUF': 'Where Are You From',
'W8': 'Wait',
'TFW': 'That feeling when',
'MFW': 'My face when',
'MRW': 'My reaction when',
'IFYP': 'I feel your pain',
'LOL': 'Laughing out loud',
'TNTL': 'Trying not to laugh',
'JK': 'Just kidding',
'IDC': 'I do not care',
'ILY': 'I love you',
'IMU': 'I miss you',
'ADIH': 'Another day in hell',
'IDC': 'I do not care',
'ZZZ': 'Sleeping, bored, tired',
'WYWH': 'Wish you were here',
'BAE': 'Before anyone else',
'FIMH': 'Forever in my heart',
'BSAAW': 'Big smile and a wink',
'BWL': 'Bursting with laughter',
'BFF': 'Best friends forever',
'CSL': 'Cannot stop laughing',
}

def chat_words_conversion(text):
    new_text = []
    
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
            
    return ' '.join(new_text)

# df['review'] = df['review'].apply(chat_words_conversion)

In [48]:
print(chat_words_conversion('ROTFLMAO'))

Rolling On The Floor Laughing My Ass Off


### 6. Spelling correction

In [49]:
from textblob import TextBlob

incorrect_text = 'certaaain condiitns durrnng seveal genertions are moodified in the saame maner.'

textBlb = TextBlob(incorrect_text)
print(str(textBlb.correct()))

certain conditions during several generations are modified in the same manner.


### 7. Removing stopwords

In [50]:
from nltk.corpus import stopwords

STOP_WORDS = stopwords.words('english')

def remove_stopwords(text):
    new_text = []
    
    for word in text.split():
        if word in STOP_WORDS:
            new_text.append('')
        else:
            new_text.append(word)
            
    x = new_text[:]
    new_text.clear()
    return " ".join(x)


# df['review'] = df['review'].apply(remove_stopwords)

sent = "Data science is the field of study that combines domain expertise, programming skills, and knowledge of mathematics and statistics to extract meaningful insights from data."
print(remove_stopwords(sent))

Data science   field  study  combines domain expertise, programming skills,  knowledge  mathematics  statistics  extract meaningful insights  data.


### 8. Remove and Replace emojis

In [51]:
# Removing Emojis

import re

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

print(remove_emoji("Loved the movie. It was 😘😘"))

Loved the movie. It was 


In [52]:
# Replacing Emojis

import emoji

print(emoji.demojize('Python is 🔥'))
print(emoji.demojize('Loved the movie. It was 😘'))

Python is :fire:
Loved the movie. It was :face_blowing_a_kiss:


### 9. Tokenization

In [53]:
sent1 = 'I am going to delhi!'
sent2 = 'I am going to delhi. I will stay there for 3 days. Let\'s hope the trip to be great'
sent3 = 'Where do think I should go? I have 3 day holiday'

text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry? 
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, 
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""

sent4 = 'I have a Ph.D in A.I'
sent5 = "We're here to help! mail us at nks@gmail.com"
sent6 = 'A 5km ride cost $10.50'

#### 9.a. NLTK

In [80]:
# Word Tokenize

from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

# nltk.download('punkt')

word_tokenize(sent1)

['I', 'am', 'going', 'to', 'delhi', '!']

In [81]:
# Sentence Tokenize

from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

# nltk.download('punkt')

sent_tokenize(text)

['probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie']

#### 9.b. Spacy (better than NLTK)

In [56]:
import spacy

nlp = spacy.load('en_core_web_sm')

doc1 = nlp(sent5)
doc2 = nlp(sent6)
doc3 = nlp(sent3)
doc4 = nlp(sent1)

for token in doc4:
    print(token)

I
am
going
to
delhi
!


### 10. Stemming

In [57]:
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [58]:
sample = "walk walks walking walked"
stem_words(sample)

'walk walk walk walk'

In [59]:
text = 'probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie'
print("Before Stemming:" + '\n\n' + text)
print()
print("After Stemming:" + '\n\n' + stem_words(text))

Before Stemming:

probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie

After Stemming:

probabl my alltim favorit movi a stori of selfless sacrific and dedic to a nobl caus but it not preachi or bore it just never get old despit my have seen it some 15 or more time in the last 25 year paul luka perform bring tear to my eye and bett davi in one of her veri few truli sympathet role is a delight the kid are as grandma say 

### 11. Lemmatization

In [60]:
import nltk
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations="?:!.,;"

sentence_words = nltk.word_tokenize(sentence)

for word in sentence_words:
    if word in punctuations:
        sentence_words.remove(word)

# sentence_words
print("{0:20}{1:20}".format("Word","Lemma"))
for word in sentence_words:
    print ("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word,pos='v')))  # pos='v' --> Parts Of Speech = Verb

Word                Lemma               
He                  He                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
at                  at                  
same                same                
time                time                
He                  He                  
has                 have                
bad                 bad                 
habit               habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
the                 the                 
Sun                 Sun                 
