In [118]:
# loading the libraries

import pandas as pd

## 1. Lowercasing

In [117]:
imdb = pd.read_csv('imdb_dataset.csv')

In [15]:
imdb.review[3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

In [16]:
imdb['review'] = imdb.review.str.lower()

In [17]:
imdb.review[3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

## 2. Removing HTML tags

In [18]:
import re

def rm_html_tags(txt, repl=''):
    """Strip HTML/XML-style tags (anything of the form ``<...>``) from ``txt``.

    Parameters
    ----------
    txt : str
        Text that may contain markup such as ``<br />`` or ``<p>``.
    repl : str, optional
        Text substituted for each removed tag. The default ``''`` keeps the
        original behaviour; pass ``' '`` so that words separated only by a tag
        (e.g. ``"time.<br /><br />this"``) are not glued together.

    Returns
    -------
    str
        ``txt`` with every tag replaced by ``repl``.
    """
    # Non-greedy, so each tag is matched on its own rather than everything
    # between the first "<" and the last ">" of a line.
    pattern = re.compile(r'<.*?>')
    # A callable replacement inserts ``repl`` literally (no backslash/group
    # interpretation of the replacement text).
    return pattern.sub(lambda _match: repl, txt)

In [19]:
# testing function
text = "<html><body><p> Movie 1</p><p> Actor - Aamir Khan</p><p> Click here to <a href='http://google.com'>download</a></p></body></html>"

rm_html_tags(text)

' Movie 1 Actor - Aamir Khan Click here to download'

In [20]:
imdb['review'] = imdb.review.apply(rm_html_tags)

In [21]:
imdb.review[3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

## 3. Removing URLs

In [30]:
def rm_url(txt):
    """Remove URLs from ``txt``.

    A URL is either ``http://...`` / ``https://...`` or a token starting with
    ``www.``; in both cases it extends up to the next whitespace character.

    Parameters
    ----------
    txt : str
        Input text.

    Returns
    -------
    str
        ``txt`` with every matched URL replaced by the empty string
        (surrounding whitespace is left untouched).
    """
    # The dot after "www" is escaped: an unescaped "." matches any character,
    # which deleted ordinary words containing "www" (e.g. "awwwsome" -> "a").
    pattern = re.compile(r"https?://\S+|www\.\S+")
    return pattern.sub('', txt)

In [22]:
text1 = 'Check out my notebook https://www.kaggle.com'
text2 = 'Check out my notebook http://www.kaggle.com'
text3 = 'Google search here www.google.com'
text4 = 'For notebook click https://www.kaggle.com to search check www.google.com'

In [31]:
for t in [text1, text2, text3, text4]:
    print(rm_url(t))

Check out my notebook 
Check out my notebook 
Google search here 
For notebook click  to search check 


## 4. Removing Punctuation

In [32]:
import string

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [33]:
def rm_punct(txt):
    """Return ``txt`` with every character listed in ``string.punctuation`` removed."""
    # Keep only the characters that are not ASCII punctuation, then stitch
    # them back together in their original order.
    kept_chars = [ch for ch in txt if ch not in string.punctuation]
    return "".join(kept_chars)

In [34]:
text = 'string. With. Punctuation?'
rm_punct(text)

'string With Punctuation'

In [36]:
def fast_rm_punct(txt):
    """Return ``txt`` without ``string.punctuation`` characters, using ``str.translate``."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return txt.translate(deletion_table)

fast_rm_punct(text)

'string With Punctuation'

## 5. Short Chat word treatment

Expand short chat words such as ikr, rofl and afaik into the full phrases they stand for.

In [38]:
# !pip install contractions

In [79]:
# Read the slang dictionary file; each useful line looks like ``KEY=Expansion``.
with open('slang.txt') as sf:
    slines = sf.readlines()

# One raw-string pattern captures both sides of a line: the leading word
# characters up to "=" (the abbreviation) and the rest of the line (its expansion).
_SLANG_LINE = re.compile(r"(\w+)=(.*)")

slangs = {}
for line in slines:
    # Match against the line without its trailing newline so that the last line
    # of the file (which may not end in "\n") is parsed like every other line.
    entry = _SLANG_LINE.match(line.rstrip("\n"))
    if entry is None:
        # Blank or malformed line: skip it instead of failing on ``None.group``.
        continue
    slangs[entry.group(1)] = entry.group(2)

slangs

{'AFAIK': 'As Far As I Know',
 'AFK': 'Away From Keyboard',
 'ASAP': 'As Soon As Possible',
 'ATK': 'At The Keyboard',
 'ATM': 'At The Moment',
 'A3': 'Anytime, Anywhere, Anyplace',
 'BAK': 'Back At Keyboard',
 'BBL': 'Be Back Later',
 'BBS': 'Be Back Soon',
 'BFN': 'Bye For Now',
 'B4N': 'Bye For Now',
 'BRB': 'Be Right Back',
 'BRT': 'Be Right There',
 'BTW': 'By The Way',
 'B4': 'Before',
 'CU': 'See You',
 'CUL8R': 'See You Later',
 'CYA': 'See You',
 'FAQ': 'Frequently Asked Questions',
 'FC': 'Fingers Crossed',
 'FWIW': "For What It's Worth",
 'FYI': 'For Your Information',
 'GAL': 'Get A Life',
 'GG': 'Good Game',
 'GN': 'Good Night',
 'GMTA': 'Great Minds Think Alike',
 'GR8': 'Great!',
 'G9': 'Genius',
 'IC': 'I See',
 'ICQ': 'I Seek you (also a chat program)',
 'ILU': 'ILU: I Love You',
 'IMHO': 'In My Honest/Humble Opinion',
 'IMO': 'In My Opinion',
 'IOW': 'In Other Words',
 'IRL': 'In Real Life',
 'KISS': 'Keep It Simple, Stupid',
 'LDR': 'Long Distance Relationship',
 'LM

In [80]:
def chat_slang_convertor(txt):
    """Replace chat abbreviations in ``txt`` with their expansions from ``slangs``.

    The text is split on whitespace; a token is replaced only when it is exactly
    equal to a key of ``slangs`` (same case, no attached punctuation). All other
    tokens are kept unchanged, and the result is re-joined with single spaces.
    """
    return " ".join(slangs.get(token, token) for token in txt.split())

In [81]:
chat_slang_convertor('IMHO he is the best')

'In My Honest/Humble Opinion he is the best'

In [82]:
chat_slang_convertor('FYI delhi is the capital of india')

'For Your Information delhi is the capital of india'

## 6. Spelling Correction

In [84]:
# !pip install textblob

In [86]:
from textblob import TextBlob

In [87]:
incorrect_text = 'ceertain conditionas duriing seveal ggenerations aree moodified in the saame maner.'

txtblb = TextBlob(incorrect_text)

In [88]:
txtblb

TextBlob("ceertain conditionas duriing seveal ggenerations aree moodified in the saame maner.")

In [89]:
txtblb.correct()

TextBlob("certain conditions during several generations are modified in the same manner.")

In [91]:
txtblb.correct().string

'certain conditions during several generations are modified in the same manner.'

## 7. Removing Stop words

In [109]:
import nltk
from nltk.corpus import stopwords

In [112]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [113]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [115]:
from collections import Counter

stop_words = stopwords.words('english')

# Membership lookup table built once from ``stop_words``. Re-run this cell after
# changing ``stop_words`` so the set is rebuilt.
_STOP_WORDS_SET = frozenset(stop_words)


def rm_stopwords(txt):
    """Remove stop words from ``txt``.

    The text is split on whitespace and every token that is exactly equal to an
    entry of ``stop_words`` is dropped; the remaining tokens are joined with
    single spaces. Matching is exact, so tokens that differ in case or carry
    attached punctuation (e.g. ``"I"``, ``"movie,"``) are kept.

    Parameters
    ----------
    txt : str
        Input text.

    Returns
    -------
    str
        The text without stop-word tokens.
    """
    # The set is created once at definition time rather than on every call,
    # which matters when the function is applied to each row of a DataFrame.
    return " ".join(word for word in txt.split() if word not in _STOP_WORDS_SET)


In [116]:
rm_stopwords('probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times')

'probably all-time favorite movie, story selflessness, sacrifice dedication noble cause, preachy boring. never gets old, despite seen 15 times'

In [119]:
imdb.review.apply(rm_stopwords)

0        One reviewers mentioned watching 1 Oz episode ...
1        A wonderful little production. <br /><br />The...
2        I thought wonderful way spend time hot summer ...
3        Basically there's family little boy (Jake) thi...
4        Petter Mattei's "Love Time Money" visually stu...
                               ...                        
49995    I thought movie right good job. It creative or...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I Catholic taught parochial elementary schools...
49998    I'm going disagree previous comment side Malti...
49999    No one expects Star Trek movies high art, fans...
Name: review, Length: 50000, dtype: object

## 8. Handling Emojis

In [120]:
import emoji

In [123]:
def rm_emoji(txt):
    """Return ``txt`` with emojis replaced by the empty string via ``emoji.replace_emoji``."""
    without_emoji = emoji.replace_emoji(txt, '')
    return without_emoji

In [124]:
rm_emoji("Loved the movie. It was 😘😘")

'Loved the movie. It was '

In [125]:
def convert_emoji(txt):
    """Return ``txt`` after passing it through ``emoji.demojize``.

    In the notebook's example each emoji becomes a ``:name:`` text code.
    """
    demojized = emoji.demojize(txt)
    return demojized

In [126]:
convert_emoji("Loved the movie. It was 😘😘")

'Loved the movie. It was :face_blowing_a_kiss::face_blowing_a_kiss:'

## 9. Tokenization

### Using NLTK

In [4]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [6]:
sent1 = 'I am going to visit delhi!'
word_tokenize(sent1)

['I', 'am', 'going', 'to', 'visit', 'delhi', '!']

In [7]:
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry? 
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, 
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""

sent_tokenize(text)

['Lorem Ipsum is simply dummy text of the printing and typesetting industry?',
 "Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, \nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book."]

In [8]:
sent5 = 'I have a Ph.D in A.I'
sent6 = "We're here to help! mail us at nks@gmail.com"
sent7 = 'A 5km ride cost $10.50'

### Using spaCy

In [10]:
import spacy

In [12]:
# !python -m spacy download en_core_web_sm

In [13]:
nlp = spacy.load("en_core_web_sm")

In [14]:
doc1 = nlp(sent5)
doc2 = nlp(sent6)
doc3 = nlp(sent7)
doc4 = nlp(sent1)

In [15]:
doc1

I have a Ph.D in A.I

In [16]:
for token in doc1:
    print(token)

I
have
a
Ph
.
D
in
A.I


In [17]:
for token in doc2:
    print(token)

We
're
here
to
help
!
mail
us
at
nks@gmail.com


## 10. Stemming

In [19]:
from nltk.stem.porter import PorterStemmer

In [20]:
ps = PorterStemmer()

def stem_words(text):
    """Stem every whitespace-separated token of ``text`` with the module-level stemmer ``ps``.

    Returns the stemmed tokens joined by single spaces.
    """
    stemmed_tokens = []
    for token in text.split():
        stemmed_tokens.append(ps.stem(token))
    return " ".join(stemmed_tokens)

In [21]:
sample = "walk walks walking walked"
stem_words(sample)

'walk walk walk walk'

In [23]:
text = 'probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie'
print(text)
stem_words(text)

probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie


'probabl my alltim favorit movi a stori of selfless sacrific and dedic to a nobl caus but it not preachi or bore it just never get old despit my have seen it some 15 or more time in the last 25 year paul luka perform bring tear to my eye and bett davi in one of her veri few truli sympathet role is a delight the kid are as grandma say more like dressedup midget than children but that onli make them more fun to watch and the mother slow awaken to what happen in the world and under her own roof is believ and startl if i had a dozen thumb theyd all be up for thi movi'

## 11. Lemmatization

In [31]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...


True

In [26]:
import string
sentence = 'He was running and eating at the same time. He has bad habit of swimming after playing long hours in the sun'
punct = string.punctuation

In [27]:
sent_word = word_tokenize(sentence)

In [28]:
# Drop tokens that consist solely of punctuation characters. ``punct`` is a
# string, so ``word not in punct`` would be a substring test and would keep
# multi-character punctuation tokens such as "..." or "''"; checking each
# character removes those as well.
words = [word for word in sent_word if not all(ch in punct for ch in word)]

In [37]:
lemmas = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]
# When lemmatizing, pass the part of speech (pos) each word should be treated as; here every token is lemmatized as a verb (pos='v'). If pos is omitted, NLTK treats the word as a noun.

In [38]:
lemmas

['He',
 'be',
 'run',
 'and',
 'eat',
 'at',
 'the',
 'same',
 'time',
 'He',
 'have',
 'bad',
 'habit',
 'of',
 'swim',
 'after',
 'play',
 'long',
 'hours',
 'in',
 'the',
 'sun']