In [1]:
import numpy as np 
import pandas as pd

In [2]:
# Load the IMDB reviews CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where the file exists at exactly this location.
imdb=pd.read_csv(r"D:\DATA SCIENCE\ML\IMDB Dataset.csv")

In [3]:
# Work on a 5,000-row random sample of the full dataset.
# random_state pins the draw so that Restart & Run All selects the same rows
# (and therefore reproduces the same outputs) every time.
df = imdb.sample(5000, random_state=42)

In [4]:
# Confirm the sample's dimensions as (rows, columns).
df.shape

(5000, 2)

In [5]:
# Peek at the first three sampled rows.
df.head(3)

Unnamed: 0,review,sentiment
44063,Jim Carrey shines in this beautiful movie. Thi...,positive
31441,When one watches the animated Superman shorts ...,positive
389,"""Crossfire"" is a justifiably famous 1947 noir ...",positive


# STEPS

1. Convert to lowercase
2. Remove HTML tags
3. Remove Punctuation
4. Spelling Correction
5. Remove Stopwords
6. Tokenization
7. Stemming/Lemmatization

## 1. Convert to lowercase

In [6]:
# Lowercase every review with pandas' vectorized string accessor.
# Unlike a per-row lambda calling x.lower(), .str.lower() passes missing
# values through as NaN instead of raising AttributeError.
df['review'] = df['review'].str.lower()

In [7]:
# Inspect the first rows to check the review text after lowercasing.
df.head()

Unnamed: 0,review,sentiment
44063,jim carrey shines in this beautiful movie. thi...,positive
31441,when one watches the animated superman shorts ...,positive
389,"""crossfire"" is a justifiably famous 1947 noir ...",positive
4290,i found this little gem as an extra feature on...,positive
25996,the wind and the lion is a marvelous sweeping ...,positive


## 2. Remove HTML tags

In [8]:
import re

# Compiled once at definition time instead of on every call.
_HTML_TAG_RE = re.compile('<.*?>')     # for URL text: 'https?://\S+|www\.\S+'


def remove_html_tags(text):
    """Strip HTML tags from ``text`` without gluing neighbouring words together.

    Each tag is replaced by a space rather than an empty string, so input such
    as ``'end.<br /><br />the'`` becomes ``'end. the'`` instead of
    ``'end.the'``. Runs of whitespace (including those left behind by removed
    tags) are then collapsed to single spaces and the ends are trimmed.

    Parameters
    ----------
    text : str
        Raw text that may contain HTML markup.

    Returns
    -------
    str
        The text with every ``<...>`` span removed and whitespace normalised.
    """
    without_tags = _HTML_TAG_RE.sub(' ', text)
    # split() with no argument drops leading/trailing/repeated whitespace.
    return ' '.join(without_tags.split())

# Sanity-check remove_html_tags on a small HTML snippet and print the result.
html_string = '<p>This is <b>some</b> text with <a href="#">HTML</a> tags.</p>'
clean_text = remove_html_tags(html_string)
print(clean_text)


This is some text with HTML tags.


In [9]:
# Strip HTML tags from every review, overwriting the column in place.
df['review']=df['review'].apply(remove_html_tags)

## 3. Remove Punctuation

In [10]:
import string
# Show the ASCII punctuation characters provided by the string module.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
# Module-level set of characters to strip; remove_punc reads this global.
exclude= string.punctuation

In [12]:
# Slow processing
def remove_punc(text):
    """Return ``text`` with every character found in ``exclude`` turned into a space.

    ``exclude`` is the module-level punctuation string; all other characters
    are kept unchanged and in their original order.
    """
    return ''.join(' ' if symbol in exclude else symbol for symbol in text)

In [13]:
# Fast processing
import string

def remove_punc1(text):
    """Return ``text`` with all ASCII punctuation characters deleted.

    The characters are removed outright (not replaced by spaces), so any
    whitespace that surrounded them is left as it was.
    """
    # The third argument of maketrans lists the characters to delete.
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)


In [14]:
# Sample string containing punctuation, used to try out remove_punc1.
a='@ansxh ?<> *suhxuss]sdssds'

In [15]:
 # Run remove_punc1 on the sample string and display the result.
 # NOTE(review): this cell is indented by one stray space; it ran in the
 # notebook, but presumably would not as plain Python — confirm before export.
 remove_punc1(a)

'ansxh  suhxusssdssds'

In [16]:
# Delete punctuation from every review using the translate-based remove_punc1.
df['review']=df['review'].apply(remove_punc1)

## 4. Spelling Correction

In [17]:
from textblob import TextBlob

In [18]:
# Demonstrate TextBlob's spelling correction on a deliberately misspelled sentence.
# NOTE(review): in the cells shown here the correction is only demonstrated,
# not applied to df['review'].
incorrect='certnly i am going to have a bath jst aftr dinnr'
txtblb=TextBlob(incorrect)
txtblb.correct().string

'certainly i am going to have a bath just after dinner'

## 5. Remove Stopwords

In [19]:
from nltk.corpus import stopwords

In [20]:
# Display NLTK's English stop-word list.
# NOTE(review): this prints the whole list; slicing it would keep the cell output short.
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [21]:
# Stop-word sets keyed by language, filled on first use so the NLTK corpus is
# read once rather than once per word.
_STOPWORD_CACHE = {}


def _load_stopwords(language='english'):
    """Return NLTK's stop-word list for ``language`` as a frozenset, loading it once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stop words removed and single spaces between the rest.

    Parameters
    ----------
    text : str
        Whitespace-separated text. Matching is exact and case-sensitive, so the
        text should already be lowercased when using the NLTK list.
    stop_words : collection of str, optional
        Words to drop. Defaults to NLTK's English stop-word list, cached as a
        set for constant-time membership tests.

    Returns
    -------
    str
        The remaining words joined by single spaces. Removed words leave no
        empty placeholder behind, so the result has no runs of spaces.

    Notes
    -----
    The NLTK list contains contractions such as ``"you're"``; if punctuation has
    been deleted beforehand these appear as ``youre`` and will not match.
    """
    if stop_words is None:
        stop_words = _load_stopwords()
    return ' '.join(word for word in text.split() if word not in stop_words)

In [22]:
# Drop stop words from every review, overwriting the column in place.
df['review']=df['review'].apply(remove_stopwords)

## 6. Tokenization (nltk and spaCy)

### nltk

In [24]:
from nltk import word_tokenize,sent_tokenize
# nltk.download('punkt')

In [25]:
# Tokenize a sentence containing stray symbols to see how word_tokenize splits them.
sent1='I @m going to vi$iT De!hi'
word_tokenize(sent1)

['I', '@', 'm', 'going', 'to', 'vi', '$', 'iT', 'De', '!', 'hi']

## 7.(i) Stemming
Stemming is the process of reducing inflected words to their root form (the stem), so that a group of related words maps to the same stem, even if the stem itself is not a valid word in the language.

(e.g. sing, sings, singing → sing; play, played, playing → play)

In [35]:
# Stemming is fast but has drawbacks: the stem it returns is not always a real word.
from nltk.stem.porter import PorterStemmer
ps= PorterStemmer()
# Stem a single word as a quick check.
ps.stem('singing')

'sing'

In [37]:
from nltk.stem.porter import PorterStemmer
ps= PorterStemmer()
def stem_words(text):
    """Stem each whitespace-separated word of ``text`` with the module-level Porter stemmer.

    Returns the stems joined by single spaces.
    """
    stems = map(ps.stem, text.split())
    return ' '.join(stems)

In [38]:
# Stem every word of an example sentence.
stem_words('I am singing a song while reading story and watching movie')

'i am sing a song while read stori and watch movi'

## 7.(ii) Lemmatization
Lemmatization, unlike stemming, reduces inflected words properly, ensuring that the root word belongs to the language. In lemmatization the root word is called the lemma. A lemma is the canonical form, dictionary form, or citation form of a set of words.

In [60]:
# slower than stemming, but the output is a valid word that belongs to the language
import nltk

In [48]:
# Download the NLTK 'wordnet' package (presumably required by WordNetLemmatizer — confirm).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [58]:
# Download the NLTK 'omw-1.4' package (presumably also needed for lemmatization — confirm).
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...


True

In [56]:
from nltk import WordNetLemmatizer
wnl=WordNetLemmatizer()
def lemma_words(text, pos='v'):
    """Lemmatize each whitespace-separated word of ``text`` with the module-level WordNet lemmatizer.

    Parameters
    ----------
    text : str
        Text to lemmatize; it is split on whitespace.
    pos : str, optional
        Part-of-speech tag passed to ``wnl.lemmatize`` for every word. Defaults
        to ``'v'`` (verb), which matches the previous hard-coded behaviour; see
        the NLTK documentation for the other accepted tags.

    Returns
    -------
    str
        The lemmas joined by single spaces. Casing is left as given.
    """
    return ' '.join(wnl.lemmatize(word, pos=pos) for word in text.split())

In [59]:
# Lemmatize an example sentence, treating each word as a verb.
lemma_words('I am singing a song while reading story and watching movie')

'I be sing a song while read story and watch movie'

In [61]:
# Lemmatize a single word as a verb.
wnl.lemmatize('singing',pos='v')

'sing'