In [None]:
import pandas as pd
import numpy as np
import re, string
import nltk
import emoji
from textblob import TextBlob
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources this notebook relies on.
for resource in (
  'punkt_tab',  # for tokenization
  'wordnet',    # for lemmatization
):
  nltk.download(resource)

In [32]:
# Load the IMDB review dataset straight from the GitHub-hosted CSV.
DATASET_URL = "https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/master/IMDB-Dataset.csv"
data = pd.read_csv(DATASET_URL)

In [33]:
# Peek at the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Text Cleaning involves:
1. tokenization
2. lowercase
3. uppercase
4. emojis
5. punctuation
6. html,url
7. stopwords
8. abbreviations or slang
9. stemming and lemmatization
10. spelling correction
11. whitespace

In [None]:
# Normalise letter case
## Lower-case every review so the same word always has a single spelling.
## The vectorised .str accessor replaces the per-row lambda; missing values
## pass through as NaN instead of raising inside the lambda.
data['review'] = data['review'].str.lower()

In [35]:
## remove HTML
def removeHTML(text):
  """Replace every HTML tag in ``text`` with a single space.

  Tags are substituted with a space rather than deleted so that words
  separated only by markup (e.g. ``"lost.<br /><br />i"``) are not glued
  together once the tag is gone. The character class ``[^>]`` also matches
  newlines, so tags that span several lines are removed as well.

  Args:
    text: the string to clean.

  Returns:
    ``text`` with each ``<...>`` span replaced by ``" "``. Runs of spaces
    are not collapsed here.
  """
  return re.sub(r"<[^>]*>", " ", text)
# Strip HTML markup from every review (the function is passed directly, no wrapper lambda needed).
data['review'] = data['review'].map(removeHTML)

In [36]:
# remove URL https://www.youtube.com/watch?v=V9tJCQoBakA&t=225s
def removeURL(text):
  """Delete URLs from ``text``.

  Two forms are recognised: anything starting with ``http://`` or
  ``https://``, and anything starting with ``www.``. Each match runs up to
  the next whitespace character.

  The ``www`` branch requires a literal dot and a word boundary in front of
  it. Without those, ordinary text such as ``"awww cute"`` matched (the
  unescaped ``.`` accepted the space) and was deleted.

  Args:
    text: the string to clean.

  Returns:
    ``text`` with every recognised URL replaced by the empty string.
  """
  return re.sub(r"https?://\S+|\bwww\.\S+", "", text)

# Strip URLs from every review.
data['review'] = data['review'].map(removeURL)

In [37]:
# Inspect the first 20 rows of the frame at this point in the cleaning sequence.
data.head(20)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
5,"probably my all-time favorite movie, a story o...",positive
6,i sure would like to see a resurrection of a u...,positive
7,"this show was an amazing, fresh & innovative i...",negative
8,encouraged by the positive comments about this...,negative
9,if you like original gut wrenching laughter yo...,positive


In [38]:
## Punctuation marks
def removePunctuation(text):
  """Return ``text`` without punctuation.

  Every character that is neither a word character (``\\w``: letters, digits,
  underscore) nor whitespace is deleted. Nothing is inserted in its place, so
  ``"all-time"`` becomes ``"alltime"`` and ``"it's"`` becomes ``"its"``.
  """
  return re.sub(r"[^\w\s]", "", text)

# Strip punctuation from every review.
data['review'] = data['review'].map(removePunctuation)

In [39]:
# Inspect the first 10 rows of the frame at this point in the cleaning sequence.
data.head(10)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive
5,probably my alltime favorite movie a story of ...,positive
6,i sure would like to see a resurrection of a u...,positive
7,this show was an amazing fresh innovative ide...,negative
8,encouraged by the positive comments about this...,negative
9,if you like original gut wrenching laughter yo...,positive


In [42]:
# Look at the first 20 cleaned reviews.
# NOTE(review): the saved output below already has stop words removed, although
# the stop-word removal cell appears further down in the notebook. The cells
# look to have been executed out of document order; confirm with
# Restart Kernel -> Run All.
data['review'].head(20)

0     one reviewers mentioned watching 1 oz episode ...
1     wonderful little production filming technique ...
2     thought wonderful way spend time hot summer we...
3     basically theres family little boy jake thinks...
4     petter matteis love time money visually stunni...
5     probably alltime favorite movie story selfless...
6     sure would like see resurrection dated seahunt...
7     show amazing fresh innovative idea 70s first a...
8     encouraged positive comments film looking forw...
9     like original gut wrenching laughter like movi...
10    phil alien one quirky films humour based aroun...
11    saw movie 12 came recall scariest scene big bi...
12    im big fan bolls work many enjoyed movie posta...
13    cast played shakespeareshakespeare losti appre...
14    fantastic movie three prisoners become famous ...
15    kind drawn erotic scenes realize one amateuris...
16    films simply remade one bad film fails capture...
17    movie made one top 10 awful movies horribl

In [43]:
# remove emojis
# Each emoji found by the `emoji` package is replaced with the empty string.
data['review'] = data['review'].apply(
  lambda review: emoji.replace_emoji(review, replace='')
)

# Alternative that was tried and found to work slowly:
#   import demoji; demoji.replace(x, '')

In [44]:
# Random spot-check of 10 rows. No random_state is passed, so the rows shown
# differ from run to run.
data.sample(10)

Unnamed: 0,review,sentiment
17218,presenting lily mars real pleasant little film...,positive
47837,disappointing horror film snotty young girl ni...,negative
23148,like festivals entries hamiltons makes interes...,negative
21027,julie andrews plays german spy falls love amer...,positive
2749,1st watched 7192003 1 10dirbrad sykes ridiculo...,negative
35758,movie deviated bible fell bar 1956 movie hate ...,negative
9549,oftentimes films nature come across mixed bag ...,positive
45595,parasomnia interesting premises story poorly d...,negative
14906,look carefully wonderful assortment talent put...,negative
42222,great book poorly done movie cheesy performanc...,negative


In [45]:
## Slang abbreviations
# Maps an upper-case chat abbreviation to its expansion.
# Each key appears exactly once: "IDC" used to be listed twice, and the later
# duplicate silently overrode the first with a typographic apostrophe.
# NOTE(review): fill_slang_chats looks keys up via w.upper(), so ordinary
# lower-case words that collide with a key (e.g. "sob", "yo", "atm") are
# expanded too. Confirm this is intended for review text.
slang_dict = {
    "AF": "As F***",
    "ATM": "At The Moment",
    "BRB": "Be Right Back",
    "BTW": "By The Way",
    "DM": "Direct Message",
    "FOMO": "Fear Of Missing Out",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "IDK": "I Don't Know",
    "IKR": "I Know, Right?",
    "IMO": "In My Opinion",
    "IRL": "In Real Life",
    "JK": "Just Kidding",
    "LMAO": "Laughing My A** Off",
    "LOL": "Laugh Out Loud",
    "NSFW": "Not Safe For Work",
    "OMG": "Oh My God",
    "OMW": "On My Way",
    "ROFL": "Rolling On The Floor Laughing",
    "SMH": "Shaking My Head",
    "TBH": "To Be Honest",
    "TBT": "Throwback Thursday",
    "TMI": "Too Much Information",
    "TTYL": "Talk To You Later",
    "YOLO": "You Only Live Once",
    "WTF": "What The F***",
    "ICYMI": "In Case You Missed It",
    "BFF": "Best Friends Forever",
    "GG": "Good Game",
    "NVM": "Never Mind",
    "SFW": "Safe For Work",
    "IDC": "I Don't Care",
    "RN": "Right Now",
    "TYT": "Take Your Time",
    "ILY": "I Love You",
    "IMHO": "In My Humble Opinion",
    "BRUH": "Brother (Slang for Friend)",
    "ASAP": "As Soon As Possible",
    "CU": "See You",
    "GR8": "Great",
    "HF": "Have Fun",
    "HBD": "Happy Birthday",
    "LMK": "Let Me Know",
    "NP": "No Problem",
    "PLZ": "Please",
    "PPL": "People",
    "SOB": "Son Of A B****",
    "TBD": "To Be Determined",
    "TY": "Thank You",
    "WYD": "What You Doing?",
    "YO": "Hey (Informal Greeting)"
}


In [49]:
def fill_slang_chats(text):
  """Expand chat abbreviations in ``text`` and lower-case the result.

  The text is split on whitespace. A word whose upper-cased form is a key of
  ``slang_dict`` is swapped for the mapped phrase; any other word is kept.
  All pieces are then lower-cased and joined with single spaces.
  """
  expanded = (slang_dict.get(word.upper(), word) for word in text.split())
  return " ".join(piece.lower() for piece in expanded)

# Expand chat abbreviations in every review.
data['review'] = data['review'].map(fill_slang_chats)

In [48]:
# Preview the current state of the frame.
data.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative
4,petter matteis love time money visually stunni...,positive


## A few more libraries for text preprocessing, such as spelling correction
- NLTK
- SpaCy
- TextBlob

In [40]:
## removal of stop words
# The stop-word lists are a separate NLTK corpus; download it before reading it.
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/praveensrivas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Build the lookup once; a set keeps the per-word membership test cheap.
stop_words = set(stopwords.words('english'))

def _without_stop_words(review):
  """Return ``review`` with every token that is in ``stop_words`` dropped."""
  # split() already yields tokens with no surrounding whitespace.
  return ' '.join(token for token in review.split() if token not in stop_words)

# NOTE(review): saved outputs elsewhere in this notebook show forms such as
# "theres" and "im" surviving this step. Presumably the NLTK list stores
# contractions with apostrophes, which no longer match once punctuation has
# been stripped. Confirm, and consider removing stop words before punctuation.
data['review'] = data['review'].apply(_without_stop_words)

In [56]:
# Spelling-correction demo on a single sample sentence ("yu" is the deliberate typo).
# This only displays the corrected string; nothing is written back to `data`.
text = 'Hello sir how are yu?'
textblob = TextBlob(text)
textblob.correct().string

'Hello sir how are you?'

## Text tokenization
Tokenization is the process of breaking a text string into smaller units called tokens. These tokens can be words, phrases, or even individual characters, depending on the specific application.
Tokenization can be done in the following ways:
- the `split()` function
- regular expressions (regex)
- the NLTK library

In [74]:
# Random spot-check of 5 rows (unseeded, so the rows differ between runs).
data.sample(5)

Unnamed: 0,review,sentiment
36430,one best parts sundance seeing movies would ot...,positive
18795,absolutely love standup comedy love hear raw t...,negative
30892,wes craven laugh expense red eye plot preposte...,negative
31565,never realized charles boyer luis denard appea...,positive
7566,disturbing expertly crafted scripted intellige...,negative


In [87]:
## Tokenization
# Run NLTK's word_tokenize on each review and store the result in a new 'token' column.
data['token'] = data['review'].map(word_tokenize)

## Stemming and Lemmatization

### **Lemmatization vs. Stemming**

| Feature            | **Lemmatization**                                              | **Stemming**                                         |
|---------------------|---------------------------------------------------------------|-----------------------------------------------------|
| **Definition**      | Reduces words to their base or root form (lemma), ensuring the word has meaning in the language. | Reduces words to their root form by removing suffixes, often resulting in non-meaningful stems. |
| **Result**          | Produces linguistically correct words (e.g., "better" → "good"). | Produces crude root forms that may not be valid words (e.g., "studies" → "studi"). |
| **Algorithm Type**  | Relies on a vocabulary and grammar rules to find the base form. | Uses simple heuristic rules for stripping suffixes. |
| **Output Quality**  | More accurate and meaningful.                                | Faster but less accurate.                           |
| **Use Case**        | Ideal for tasks where linguistic correctness is important, like text analysis. | Useful for search engines or quick text preprocessing. |
| **Example**         | "geese" → "goose", "better" → "good".                        | "geese" → "gees", "better" → "better".             |


In [None]:
from nltk.stem import PorterStemmer
def stemming(text):
  """Return ``text`` with each whitespace-separated word Porter-stemmed.

  The stemmer is not very intelligent: it only strips suffixes by rule, so
  the stems it produces are not guaranteed to be real words.
  """
  stemmer = PorterStemmer()
  return " ".join(map(stemmer.stem, text.split()))

In [84]:
# Quick check of the stemmer on a sample sentence. As the result below shows,
# the output is also lower-cased.
stemming("Hello are u running")

'hello are u run'

In [93]:
## Lemmetization
from nltk.stem import WordNetLemmatizer
def Lemmetization(token):
  """Lemmatize every word in ``token`` and return them as one space-joined string.

  ``token`` is an iterable of words (here: the list produced by tokenization).
  Each word goes through ``WordNetLemmatizer.lemmatize`` with its default
  arguments.
  """
  lemmatizer = WordNetLemmatizer()
  return " ".join(map(lemmatizer.lemmatize, token))

# Lemmatize each token list and store the joined string in a new 'lemma' column.
data['lemma'] = data['token'].map(Lemmetization)


In [94]:
# Show the review, token and lemma columns side by side for the first 5 rows.
data.head(5)

Unnamed: 0,review,sentiment,token,lemma
0,one reviewers mentioned watching 1 oz episode ...,positive,"[one, reviewers, mentioned, watching, 1, oz, e...",one reviewer mentioned watching 1 oz episode y...
1,wonderful little production filming technique ...,positive,"[wonderful, little, production, filming, techn...",wonderful little production filming technique ...
2,thought wonderful way spend time hot summer we...,positive,"[thought, wonderful, way, spend, time, hot, su...",thought wonderful way spend time hot summer we...
3,basically theres family little boy jake thinks...,negative,"[basically, theres, family, little, boy, jake,...",basically there family little boy jake think t...
4,petter matteis love time money visually stunni...,positive,"[petter, matteis, love, time, money, visually,...",petter matteis love time money visually stunni...
