In [2]:
import pandas as pd

In [3]:
# Load the raw IMDB reviews from the CSV file in the working directory.
RAW_REVIEWS_CSV = "IMDB Dataset.csv"
df = pd.read_csv(RAW_REVIEWS_CSV)

In [4]:
# Peek at the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(50000, 2)

In [6]:
# Names of the columns that contain at least one missing value
# (an empty Index means there are none).
has_missing = df.isna().any()
df.columns[has_missing]

Index([], dtype='object')

In [7]:
# Encode the labels as integers (positive -> 1, negative -> 0). The result is
# assigned back to the column rather than calling replace(..., inplace=True)
# on df['sentiment']: that chained in-place form operates on a temporary
# Series and leaves df unchanged once pandas' Copy-on-Write is active.
df['sentiment'] = df['sentiment'].replace({'positive': 1, 'negative': 0})

In [8]:
# First five rows again, after the sentiment encoding step.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


Text Cleaning

In [9]:
# Removing HTML Tags

import re
def clean_html(text):
    """Strip HTML tags from ``text``, leaving a space where each tag was.

    Parameters
    ----------
    text : str
        Raw review text that may contain markup such as ``<br />``.

    Returns
    -------
    str
        ``text`` with every ``<...>`` span replaced by a single space.
    """
    # Non-greedy: match from '<' to the nearest following '>' on the same line.
    tag_pattern = re.compile("<.*?>")
    # Substitute a space rather than '' so that words separated only by a tag
    # (e.g. "great<br />movie") are not fused into a single token; the extra
    # whitespace is harmless because later steps split on whitespace.
    return tag_pattern.sub(' ', text)

In [10]:
# Strip HTML markup from every review, element by element.
df['review'] = df['review'].map(clean_html)

In [11]:
# Converting text to lowercase
def lower_text(text):
    """Return ``text`` with all cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered

In [13]:
# Lowercase every review, element by element.
df['review'] = df['review'].map(lower_text)

In [14]:
# Removing special characters
def remove_special(text):
    """Replace every non-alphanumeric character in ``text`` with a space.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        A string of the same length as ``text`` in which each character that
        fails ``str.isalnum()`` (punctuation, whitespace, underscores, ...)
        has been replaced by a single space.
    """
    # Build the result in one join instead of growing a string one character
    # at a time inside a loop.
    return "".join(ch if ch.isalnum() else " " for ch in text)

In [15]:
# Blank out punctuation and other non-alphanumeric characters in every review.
df['review'] = df['review'].map(remove_special)

In [17]:
# Removing stopwords
import nltk

In [18]:
from nltk.corpus import stopwords

In [24]:
# Display the English stop-word list provided by nltk.corpus.stopwords.
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [25]:
# Stop-word sets keyed by language, filled on first use so the corpus is read
# once instead of once per token.
_STOPWORDS_CACHE = {}


def remove_stopWords(text, stop_words=None):
    """Split ``text`` on whitespace and drop the tokens that are stop words.

    Parameters
    ----------
    text : str
        Whitespace-separated text.
    stop_words : collection of str, optional
        Words to remove. When omitted, the English list from
        ``stopwords.words('english')`` is used; it is loaded on the first
        call and cached as a frozenset for fast membership tests.

    Returns
    -------
    list of str
        A new list with the remaining tokens in their original order.
    """
    if stop_words is None:
        stop_words = _STOPWORDS_CACHE.get('english')
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            _STOPWORDS_CACHE['english'] = stop_words
    return [token for token in text.split() if token not in stop_words]

In [33]:
# Tokenise every review and drop its stop words; each cell becomes a list of tokens.
df['review'] = df['review'].map(remove_stopWords)

In [34]:
# First five rows after stop-word removal (each review is now a list of tokens).
df.head()

Unnamed: 0,review,sentiment
0,"[one, reviewers, mentioned, watching, 1, oz, e...",1
1,"[wonderful, little, production, filming, techn...",1
2,"[thought, wonderful, way, spend, time, hot, su...",1
3,"[basically, family, little, boy, jake, thinks,...",0
4,"[petter, mattei, love, time, money, visually, ...",1


In [35]:
# Stemming
from nltk.stem.porter import PorterStemmer
# Single PorterStemmer instance, referenced as `ps` by the stemming function below.
ps = PorterStemmer()

In [36]:
def stem_text(text):
    """Return a new list holding ``ps.stem(item)`` for each item of ``text``.

    ``text`` is iterated as-is (the pipeline passes a list of tokens); the
    input is not modified and order is preserved.
    """
    return [ps.stem(token) for token in text]

In [39]:
# Stem every token of every review.
df['review'] = df['review'].map(stem_text)

In [40]:
# converting list to string again
def join_back(list_text):
    """Concatenate the strings in ``list_text`` into one string, separated by single spaces."""
    single_space = " "
    return single_space.join(list_text)

In [41]:
# Turn each token list back into a single space-separated string.
df['review'] = df['review'].map(join_back)

In [42]:
# First five rows after stemming and re-joining the tokens into strings.
df.head()

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod hook righ...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic famili littl boy jake think zombi closet...,0
4,petter mattei love time money visual stun film...,1


In [121]:
# Persist the preprocessed reviews. index=True is the pandas default, spelled
# out here: the row index is written as an unnamed first column of the CSV.
df.to_csv("Preprocessed_text.csv", index=True)