In [1]:
import pandas as pd

In [2]:
pwd

'/content'

In [3]:
# Location of the IMDB reviews CSV. This is an absolute path under /content,
# the working directory reported by `pwd` above.
data_path = '/content/IMDB Dataset.csv'

In [5]:
df = pd.read_csv(data_path)

In [7]:
df.head(100)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
95,Daniel Day-Lewis is the most versatile actor a...,positive
96,My guess would be this was originally going to...,negative
97,"Well, I like to watch bad horror B-Movies, cau...",negative
98,"This IS the worst movie I have ever seen, as w...",negative


In [8]:
# Lower-case every review so later word-level comparisons (e.g. against the
# lower-case stop-word list) do not depend on capitalisation.
# Note: this overwrites the 'review' column in place.
df['review'] = df['review'].str.lower()

# **Remove HTML Tags**

In [10]:
import re
def remove_html_tags(text):
  """Return ``text`` with every ``<...>`` span removed.

  The match is non-greedy, so each tag is stripped on its own instead of
  swallowing everything between the first ``<`` and the last ``>``.
  ``.`` does not match newlines here, so a tag split across lines is left
  untouched.
  """
  return re.sub('<.*?>', '', text)

In [11]:
remove_html_tags('A wonderful little production. <br /><br />The...	')

'A wonderful little production. The...\t'

In [12]:
df['review'] = df['review'].apply(remove_html_tags)

In [13]:
df['review'][1]

'a wonderful little production. the filming technique is very unassuming- very old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. the actors are extremely well chosen- michael sheen not only "has got all the polari" but he has all the voices down pat too! you can truly see the seamless editing guided by the references to williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. a masterful production about one of the great master\'s of comedy and his life. the realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. it plays on our knowledge and our senses, particularly with the scenes concerning orton and halliwell and the sets (particularly of their flat with halliwell\'s murals decorating every surface) are terribly well done.'

# **Remove URLs**

In [14]:
def remove_url(url):
  """Return the input string with URLs deleted.

  A URL is anything starting with ``http://``, ``https://`` or ``www.`` and
  running up to the next whitespace character. The surrounding whitespace is
  left as-is, so removing a URL from the middle of a sentence leaves two
  adjacent spaces.
  """
  url_regex = r'https?://\S+|www\.\S+'
  cleaned = re.sub(url_regex, '', url)
  return cleaned

In [15]:
remove_url('hi there https://google.com how are you? Hope you did no tsee any url!')

'hi there  how are you? Hope you did no tsee any url!'

In [17]:
df['review'] = df['review'].apply(remove_url)

In [18]:
df['review'][1]

'a wonderful little production. the filming technique is very unassuming- very old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. the actors are extremely well chosen- michael sheen not only "has got all the polari" but he has all the voices down pat too! you can truly see the seamless editing guided by the references to williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. a masterful production about one of the great master\'s of comedy and his life. the realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. it plays on our knowledge and our senses, particularly with the scenes concerning orton and halliwell and the sets (particularly of their flat with halliwell\'s murals decorating every surface) are terribly well done.'

# **Punctuation Handling**

In [19]:
import string,time
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [20]:
exclude = string.punctuation

In [24]:
def remove_punctuation(text):
  """Return ``text`` with every character listed in ``exclude`` deleted.

  ``exclude`` is the module-level string of punctuation characters defined
  in an earlier cell. Characters are removed, not replaced, so words joined
  by punctuation (e.g. a hyphen) become a single word.
  """
  # Third argument of maketrans = characters to delete.
  deletion_table = str.maketrans('', '', exclude)
  stripped = text.translate(deletion_table)
  return stripped

In [25]:
remove_punctuation("sdf? .,,1!")

'sdf 1'

In [29]:
df['review'] = df['review'].apply(remove_punctuation)

In [38]:
df['review'][5]

'probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie'

# **Chat Word Conversion**

In [34]:
# Lookup table mapping chat abbreviations to their expanded phrases.
# Keys are upper-case; chat_conversion() upper-cases each word before looking
# it up, so matching is case-insensitive.
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "FYI": "For Your Information",
    "BRB": "Be Right Back",
    "BTW": "By The Way",
    "OMG": "Oh My God",
    "IMO": "In My Opinion",
    "LOL": "Laugh Out Loud",
    "TTYL": "Talk To You Later",
    "GTG": "Got To Go",
    "TTYT": "Talk To You Tomorrow",
    "IDK": "I Don't Know",
    "TMI": "Too Much Information",
    "IMHO": "In My Humble Opinion",
    "ICYMI": "In Case You Missed It",
    "FAQ": "Frequently Asked Questions",
    "TGIF": "Thank God It's Friday",
    "FYA": "For Your Action"
}

In [35]:
def chat_conversion(text):
  """Expand chat abbreviations found in ``text``.

  The text is split on whitespace; each token whose upper-cased form is a key
  of ``chat_words`` is replaced by the mapped phrase, and every other token is
  kept unchanged. Tokens are re-joined with single spaces.
  """
  expanded = (chat_words.get(token.upper(), token) for token in text.split())
  return " ".join(expanded)


In [37]:
chat_conversion("I will brb if you have any faq tell me")

'I will Be Right Back if you have any Frequently Asked Questions tell me'

# **Handling possible typos in dataset**

In [43]:
from textblob import TextBlob

In [48]:
# Spell-correction demo on one deliberately misspelled sentence; the
# corrected string is displayed as this cell's output.
text = "hi pleaze list thr conditons that is xpectd"
text_blob = TextBlob(text)
text_blob.correct().string

'hi please list the conditions that is expected'

# **Stopwords**

In [49]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [55]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [64]:
# Cache for the stop-word set so the NLTK corpus is loaded at most once.
_STOPWORD_CACHE = {}


def _english_stopwords():
  """Return NLTK's English stop words as a frozenset, loading them once."""
  if 'english' not in _STOPWORD_CACHE:
    _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
  return _STOPWORD_CACHE['english']


def stop_words(text):
  """Blank out English stop words in ``text``.

  The text is split on whitespace and every token that is an NLTK English
  stop word is replaced by an empty string; all tokens are then re-joined
  with single spaces. Because removed words become empty strings rather than
  being dropped, the result keeps an extra space at each removed position.
  That is the function's established output and is preserved here.

  Args:
    text: The string to filter. Matching is case-sensitive against NLTK's
      lower-case list, so the text should already be lower-cased.

  Returns:
    The filtered string.
  """
  # Look the stop words up once per call (and load them once per kernel)
  # instead of re-reading the corpus list for every single word.
  english = _english_stopwords()
  return " ".join('' if word in english else word for word in text.split())


In [67]:
stop_words(df['review'][5])

'probably  alltime favorite movie  story  selflessness sacrifice  dedication   noble cause    preachy  boring   never gets old despite   seen   15   times   last 25 years paul lukas performance brings tears   eyes  bette davis  one     truly sympathetic roles   delight  kids   grandma says  like dressedup midgets  children    makes   fun  watch   mothers slow awakening  whats happening   world     roof  believable  startling     dozen thumbs theyd      movie'