### Lowercasing

In [27]:
import pandas as pd 
import numpy as np 
df = pd.read_csv('data/IMDB Dataset.csv')

In [28]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [29]:
# picking out a random review and making it lowercased
df['review'][2].lower()


'i thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. the plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). while some may be disappointed when they realize this is not match point 2: risk addiction, i thought it was proof that woody allen is still fully in control of the style many of us have grown to love.<br /><br />this was the most i\'d laughed at one of woody\'s comedies in years (dare i say a decade?). while i\'ve never been impressed with scarlet johanson, in this she managed to tone down her "sexy" image and jumped right into a average, but spirited young woman.<br /><br />this may not be the crown jewel of his career, but it was wittier than "devil wears prada" and more interesting than "superman" a great comedy to go see with friends.'

In [30]:
# making the entire review corpus lowercases
df['review']=df['review'].str.lower()

In [31]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### Remove HTML tags

Using Regular Expression, `https://regex101.com/` -> helps in creating tailored regex

In [32]:
import re
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'',text) 
remove_html_tags('https://abc<html><br>01.com/')

'https://abc01.com/'

In [33]:
# apply regex funtion too entire dataset

df['review']=df['review'].apply(remove_html_tags) 

### Remove URLs

In [34]:
text = 'Check out my notebook https:/ /www.kaggle.com/notebook8223abb'
text2 = 'Check qut my notebook http://www.kaggle.com/notebook822abb'
text = 'Google search here www.google.com'
text4 = 'For notebook click https://www.kaggle.com/notebookfclabb to search check www.google.com'

In [35]:
def remove_url(text):
        pattern = re.compile(r'https?://\S+|www\.\S+')
        return pattern.sub(r'',text)
    


In [36]:
remove_url(text4)

'For notebook click  to search check '

### Remove Punctuations

In [37]:
import string,time 
print(string.punctuation) # list of symbols that python considers to be a punctuation marks
exclude = string.punctuation 

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [38]:
def remove_punc(text):
    for char in exclude:
        text = text.replace(char,'')
    return text

text = 'string. With. Punctuation?'
remove_punc(text)

'string With Punctuation'

In [39]:
# How much time it takes to remove Punctuation
start = time.time()
print(remove_punc(text))
time1 = time.time() - start
print('Time taken : ',time1) # which is very slow, when thinking in terms of large datasets

string With Punctuation
Time taken :  0.00014781951904296875


In [40]:
# Alterantive method can be : 
def remove_punc2(text):
    return text.translate(str.maketrans('','',exclude))

start = time.time()
print(remove_punc2(text))
time2 = time.time() - start
print('Time taken : ',time2) 

print(time1/time2)

string With Punctuation
Time taken :  5.0067901611328125e-05
2.9523809523809526


### Chat Word treatment

1. just search `sms slang translator` in google and get a list of slangs and their full forms
2. Put all that in python dictionary



In [41]:
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don’t care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can’t stop laughing",
}


In [42]:
def chat_conversation(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else : 
            new_text.append(w)
    return " ".join(new_text)


In [43]:
chat_conversation('IMHO he is the best')

'In My Honest/Humble Opinion he is the best'

### Spelling correction 

In [44]:


incorrect_text = "Thiss sentencas hes lats off incorract speallings"

from textblob import TextBlob
textblb = TextBlob(incorrect_text)
textblb.correct()

TextBlob("Hiss sentences he last off incorrect swellings")

### Stopwords removal

In [45]:
import nltk
nltk.download('stopwords')
from  nltk.corpus import stopwords
stopwords.words('english') 

# more stopwords can be seen : [nltk_data]     /Users/singhabhishekkk/nltk_data... where the nltk package is downloaded
# cd /Users/singhabhishekkk/nltk_data/corpora/stopwords/


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/singhabhishekkk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [46]:
def remove_stopwords(text):
    new_text =[]
    
    for words in text.split():
        if words in stopwords.words('english'):
            new_text.append("")
        else : 
            new_text.append(words)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [47]:
remove_stopwords("This is a Apple, which is a red fruit")

'This   Apple,    red fruit'

In [48]:
# applying remove_stopwords in entire dataframe
df['review'].apply(remove_stopwords)

0        one    reviewers  mentioned   watching  1 oz e...
1         wonderful little production.  filming techniq...
2         thought    wonderful way  spend time    hot s...
3        basically there's  family   little boy (jake) ...
4        petter mattei's "love   time  money"   visuall...
                               ...                        
49995     thought  movie    right good job.    creative...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997       catholic taught  parochial elementary schoo...
49998    i'm going    disagree   previous comment  side...
49999     one expects  star trek movies   high art,   f...
Name: review, Length: 50000, dtype: object

### Handling emojis
1. Remove emojis
2. Replace them

In [49]:
# 1. To remove emojis
import re

def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                                "\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


remove_emoji('Loved the movie it was 😍 🎉')


'Loved the movie it was  '

In [50]:
# 2. To replace the emoji with a meaning
import emoji
print(emoji.demojize('Loved the movie it was 😍 🎉'))

Loved the movie it was :smiling_face_with_heart-eyes: :party_popper:


### Tokenization

In [51]:
# 1. using the split function in python
# word tokenizer
sentence1 = "I am going to Delhi"
sentence1.split()

['I', 'am', 'going', 'to', 'Delhi']

In [52]:
# sentence tokenization
sentence2 = "I am going to New Delhi. I will stay there for 3 days"
sentence2.split('.')

['I am going to New Delhi', ' I will stay there for 3 days']

In [53]:
# Problems with split function
sentence1 = "I am going to Delhi!"
print(sentence1.split())
sentence1.split()[4] == 'Delhi'

['I', 'am', 'going', 'to', 'Delhi!']


False

In [54]:
sentence2 = "I am going to New Delhi? I will stay there for 3 days"
sentence2.split('.')

['I am going to New Delhi? I will stay there for 3 days']

In [55]:
# 2. Using regular expressions
import re
print(sentence1)

tokens = re. findall("[\w']+", sentence1)
tokens

I am going to Delhi!


['I', 'am', 'going', 'to', 'Delhi']

In [56]:
import re
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry? Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled "it to make a type specimen book. """
sentences = re.compile('[.!?] '). split(text)
sentences


# regular expression are little bit better than .split() but they have their own problems

['Lorem Ipsum is simply dummy text of the printing and typesetting industry',
 'Lorem Ipsum has been the industry\'s standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled "it to make a type specimen book',
 '']

In [57]:
# 3. Using NLTK -> best solution therefore is librairies

from nltk.tokenize import word_tokenize, sent_tokenize
print(word_tokenize(sentence1))


['I', 'am', 'going', 'to', 'Delhi', '!']


In [58]:
sent_tokenize(text)

# theses funtions of libraries have an internal algorithm which handle all your edge cases 
# and take care of problems too which arises in tokenization

['Lorem Ipsum is simply dummy text of the printing and typesetting industry?',
 'Lorem Ipsum has been the industry\'s standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled "it to make a type specimen book.']

In [59]:
sent1 = 'I have a Ph.D in A.I'
sent2 = 'He re here to hespe mail us at nks@gmail.com' # in this there are some problems 
# because the entire gmail id is broken
print(word_tokenize(sent1),word_tokenize(sent2))

['I', 'have', 'a', 'Ph.D', 'in', 'A.I'] ['He', 're', 'here', 'to', 'hespe', 'mail', 'us', 'at', 'nks', '@', 'gmail.com']


In [60]:
# 4. Spacy - - > try spacy first for tokenizing and then check for others
# in some cases nltk works best and in some spacy works better
import spacy
# python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm') # small english model

In [61]:
# First need to import spacy
# load the model of your choice sm - small, md- medium, lg - large
#Then convert sentences to document
doc1 = nlp(sent2)

# then pickout the document and then print the tokens
for token in doc1:
    print(token)

He
re
here
to
hespe
mail
us
at
nks@gmail.com


###  Stemming

In [62]:
import nltk
from nltk.stem.porter import PorterStemmer

In [63]:
ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])


In [64]:
sample = "walk walks walking walked"
stem_words(sample)

'walk walk walk walk'

In [65]:
sample2 = "The government's implementation of the policy was met with resistance from various stakeholders."
stem_words(sample2)


"the government' implement of the polici wa met with resist from variou stakeholders."

### Lemmatization

In [78]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
wordnet_lem = WordNetLemmatizer()

sample2 = "The government's implementation of the policy was met with resistance from various stakeholders."

sample2 = remove_punc(sample2)

# Tokenize the sentence into words
words = word_tokenize(sample2)
for word in words:
    # Use index 0 in the format string since we have only one argument to format
    print("{0:20}{0:20}".format(wordnet_lem.lemmatize(word,pos ='v'))) # specifying pos to be verb


The                 The                 
governments         governments         
implementation      implementation      
of                  of                  
the                 the                 
policy              policy              
be                  be                  
meet                meet                
with                with                
resistance          resistance          
from                from                
various             various             
stakeholders        stakeholders        


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/singhabhishekkk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
