# Lowercasing

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('../document/csv/IMDB.csv')

In [5]:
df.head()

Unnamed: 0,review,sentiment
0,One one of the other reviewers has mentioned t...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
df['review'] = df['review'].str.lower()

In [7]:
df.head()

Unnamed: 0,review,sentiment
0,one one of the other reviewers has mentioned t...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


# Remove HTML Tags

In [8]:
import re

In [9]:
def remove_html_tags(text):
    pattern  = re.compile('<.*?>')
    return pattern.sub('',text)

In [10]:
df.head()

Unnamed: 0,review,sentiment
0,one one of the other reviewers has mentioned t...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [11]:
df['review'] = df['review'].apply(remove_html_tags)

In [12]:
df.head()

Unnamed: 0,review,sentiment
0,one one of the other reviewers has mentioned t...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


# Remove URLs

In [13]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'',text)


In [14]:
df.head()

Unnamed: 0,review,sentiment
0,one one of the other reviewers has mentioned t...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [15]:
df['review'] = df['review'].apply(remove_url)

In [16]:
df.head()

Unnamed: 0,review,sentiment
0,one one of the other reviewers has mentioned t...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


# Remove Punctuation

In [17]:
import string

In [18]:
exclude = string.punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [19]:
# def remove_punctuation(text):
#     for char in exclude:
#         text = text.replace(char,'')
#     return text
# Optimal approach
def remove_punctuation(text):
    return text.translate(str.maketrans('','',exclude))

In [20]:
df.head()

Unnamed: 0,review,sentiment
0,one one of the other reviewers has mentioned t...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [21]:
df['review'] = df['review'].apply(remove_punctuation)

In [22]:
df.head()

Unnamed: 0,review,sentiment
0,one one of the other reviewers has mentioned t...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


# Chat Word Treatment

In [23]:
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "B4N": "Bye For Now",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don't care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can't stop laughing"
}

In [24]:
def chat_conversion(text):
    new_text = []
    for word in text.split():
        if(word.upper() in chat_words):
            new_text.append(chat_words[word.upper()])
        else:
            new_text.append(word)
    return " ".join(new_text)

In [25]:
df.head()

Unnamed: 0,review,sentiment
0,one one of the other reviewers has mentioned t...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [26]:
df['review'] = df['review'].apply(chat_conversion)

In [27]:
df.head()

Unnamed: 0,review,sentiment
0,one one of the other reviewers has mentioned t...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend Te...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the Tears in my eyes of...,positive


# Spelling Correction

In [28]:
from autocorrect import Speller

In [29]:
spell = Speller(lang='en')

In [30]:
def spell_correction(text):
    return spell(text)

In [31]:
df.head()

Unnamed: 0,review,sentiment
0,one one of the other reviewers has mentioned t...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend Te...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the Tears in my eyes of...,positive


In [32]:
df['review'] = df['review'].apply(spell_correction)

In [33]:
df.head()

Unnamed: 0,review,sentiment
0,one one of the other reviewers has mentioned t...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend Te...,positive
3,basically theres a family where a little boy j...,negative
4,better matters love in the Tears in my eyes of...,positive


# Removing stop words

In [34]:
import nltk
from nltk.corpus import stopwords

In [35]:
# nltk.download('stopwords')
stopwords = stopwords.words('english')
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [36]:
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if(word in stopwords):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [37]:
df.head()

Unnamed: 0,review,sentiment
0,one one of the other reviewers has mentioned t...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend Te...,positive
3,basically theres a family where a little boy j...,negative
4,better matters love in the Tears in my eyes of...,positive


In [38]:
df['review'] = df['review'].apply(remove_stopwords)

In [39]:
df.head()

Unnamed: 0,review,sentiment
0,one one reviewers mentioned watching 1 ...,positive
1,wonderful little production filming techniqu...,positive
2,thought wonderful way spend Tears eyes ...,positive
3,basically theres family little boy jake thi...,negative
4,better matters love Tears eyes money vi...,positive


# Handleing Emojis

In [40]:
def remove_emojis(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [41]:
df.head()

Unnamed: 0,review,sentiment
0,one one reviewers mentioned watching 1 ...,positive
1,wonderful little production filming techniqu...,positive
2,thought wonderful way spend Tears eyes ...,positive
3,basically theres family little boy jake thi...,negative
4,better matters love Tears eyes money vi...,positive


In [42]:
df['review'] = df['review'].apply(remove_emojis)

In [43]:
df.head()

Unnamed: 0,review,sentiment
0,one one reviewers mentioned watching 1 ...,positive
1,wonderful little production filming techniqu...,positive
2,thought wonderful way spend Tears eyes ...,positive
3,basically theres family little boy jake thi...,negative
4,better matters love Tears eyes money vi...,positive


# Stemming

In [44]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [45]:
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [46]:
df.head()

Unnamed: 0,review,sentiment
0,one one reviewers mentioned watching 1 ...,positive
1,wonderful little production filming techniqu...,positive
2,thought wonderful way spend Tears eyes ...,positive
3,basically theres family little boy jake thi...,negative
4,better matters love Tears eyes money vi...,positive


In [47]:
df['review'] = df['review'].apply(stem_words)

In [48]:
df.head()

Unnamed: 0,review,sentiment
0,one one review mention watch 1 oz episod hook ...,positive
1,wonder littl product film techniqu assum oldti...,positive
2,thought wonder way spend tear eye hot summer w...,positive
3,basic there famili littl boy jake think there ...,negative
4,better matter love tear eye money visual stun ...,positive


# Tokenization

In [49]:
from nltk.tokenize import word_tokenize,sent_tokenize
# import spacy

In [50]:
# nltk.download('punkt')
# tokenizer = spacy.load('en_core_web_sm')

In [51]:
df.head()

Unnamed: 0,review,sentiment
0,one one review mention watch 1 oz episod hook ...,positive
1,wonder littl product film techniqu assum oldti...,positive
2,thought wonder way spend tear eye hot summer w...,positive
3,basic there famili littl boy jake think there ...,negative
4,better matter love tear eye money visual stun ...,positive


In [52]:
df['review'] = df['review'].apply(word_tokenize)

# def word_tokenize(text):
#     for word in text:
#         print(word)
    

In [53]:
df.head()

Unnamed: 0,review,sentiment
0,"[one, one, review, mention, watch, 1, oz, epis...",positive
1,"[wonder, littl, product, film, techniqu, assum...",positive
2,"[thought, wonder, way, spend, tear, eye, hot, ...",positive
3,"[basic, there, famili, littl, boy, jake, think...",negative
4,"[better, matter, love, tear, eye, money, visua...",positive
