In [55]:
# Import necessary libraries
import pandas as pd
import numpy as np

import re

In [56]:
# Load the IMDB dataset from a URL
df = pd.read_csv("https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/master/IMDB-Dataset.csv")

In [57]:
# Display the first few rows of the dataframe
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [58]:
# Check the shape of the dataframe (number of rows and columns)
df.shape

(50000, 2)

In [59]:
df.iloc[0,0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [60]:
df.iloc[1,0]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [61]:
df.iloc[0,0].lower()

"one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked. they are right, as this is exactly what happened with me.<br /><br />the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go. trust me, this is not a show for the faint hearted or timid. this show pulls no punches with regards to drugs, sex or violence. its is hardcore, in the classic use of the word.<br /><br />it is called oz as that is the nickname given to the oswald maximum security state penitentary. it focuses mainly on emerald city, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. em city is home to many..aryans, muslims, gangstas, latinos, christians, italians, irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />i would say the main appeal of the show is due to the fa

### Lowercasing

In [62]:
# Convert all reviews to lowercase
df["review"] = df["review"].str.lower()

In [63]:
# Display the first few rows to verify changes
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### Remove HTML Tags

In [64]:
# Define a function to remove HTML tags using regex
def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [65]:
# Apply the HTML tag removal function to all reviews
df["review"] = df["review"].apply(lambda x : remove_html_tags(x))

In [66]:
# Display the first few rows to verify changes
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### Remove URL

In [67]:
# Define a function to remove URLs using regex
def remove_urls(text):
    url_pattern = re.compile(r'http[s]?://\S+|www\.\S+')
    return re.sub(url_pattern, '', text)

In [68]:
# Test the URL removal function with an example string
text1 = "https://www.google.com Hello World"

In [69]:
remove_urls(text1)

' Hello World'

In [70]:
# Apply the URL removal function to all reviews
df["review"] = df["review"].apply(remove_urls)

In [71]:
# Display the first few rows to verify changes
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### Remove Punctuations

In [72]:
import string

# Define a function to remove punctuation
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

In [73]:
# Apply the punctuation removal function to all reviews
df["review"] = df["review"].apply(remove_punctuation)

In [74]:
# Display the first few rows to verify changes
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


### Chat Word Treatment

In [75]:
# Define a dictionary for chat words and their expansions
chat_words = {
    "FYI": "For Your Information"
}

In [76]:
# Define a function to replace chat words with their full forms
def chat_word_treatment(text):
    new_text=[]
    for word in text.split():
            if word.upper() in chat_words.keys():
                new_text.append(chat_words[word.upper()])
            else:
                new_text.append(word)
    return " ".join(new_text)


In [77]:
# Test the chat word treatment function with an example string
chat_word_treatment("FYI you need to do this again.")

'For Your Information you need to do this again.'

### Spell Correction

In [78]:
from textblob import TextBlob

In [79]:
# Example string with spelling errors
text = "Thiss is myy mistakees"

In [80]:
# Correct spelling using TextBlob
txtblob = TextBlob(text)
txtblob.correct().string

'Hiss is may mistakes'

### Removal Of Stop Words

In [81]:
import nltk
from nltk.corpus import stopwords

In [82]:
# Download the stopwords list
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [83]:
# Display the stopwords list
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [84]:
# Define a function to remove stopwords
def remove_stopwords(text):

  new_text = []
  for word in text.split():
    if word not in stopwords.words("english"):
      new_text.append(word)

  return " ".join(new_text)

In [85]:
# Test the stopword removal function with an example string
remove_stopwords("Hi my name is Saurav")

'Hi name Saurav'

### Remove Emoji

In [86]:
! pip install emoji



In [87]:
import emoji

In [88]:
# Define a function to remove emojis by converting them to text descriptions
def remove_emoji(text):
  return emoji.demojize(text)

In [89]:
# Test the emoji removal function with an example string
remove_emoji("😀Hello World😇")

':grinning_face:Hello World:smiling_face_with_halo:'

### Tokenization

In [90]:
# Example string for tokenization
text = "My name is saurav sabu"

In [91]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [92]:
# Download the punkt tokenizer models
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [93]:
# Tokenize the string into words
word_tokenize(text)

['My', 'name', 'is', 'saurav', 'sabu']

In [94]:
# Tokenize the string into sentences
sent_tokenize(text)

['My name is saurav sabu']

In [95]:
# Longer example string for tokenization
text = """Generative AI is a type of artificial intelligence technology that can produce various types of content, including text, imagery, audio and synthetic data. The recent buzz around generative AI has been driven by the simplicity of new user interfaces for creating high-quality text, graphics and videos in a matter of seconds.

The technology, it should be noted, is not brand-new. Generative AI was introduced in the 1960s in chatbots. But it was not until 2014, with the introduction of generative adversarial networks, or GANs -- a type of machine learning algorithm -- that generative AI could create convincingly authentic images, videos and audio of real people."""

In [96]:
# Tokenize the longer text into sentences
sent_tokenize(text)

['Generative AI is a type of artificial intelligence technology that can produce various types of content, including text, imagery, audio and synthetic data.',
 'The recent buzz around generative AI has been driven by the simplicity of new user interfaces for creating high-quality text, graphics and videos in a matter of seconds.',
 'The technology, it should be noted, is not brand-new.',
 'Generative AI was introduced in the 1960s in chatbots.',
 'But it was not until 2014, with the introduction of generative adversarial networks, or GANs -- a type of machine learning algorithm -- that generative AI could create convincingly authentic images, videos and audio of real people.']

In [97]:
# Tokenize the longer text into words
word_tokenize(text)

['Generative',
 'AI',
 'is',
 'a',
 'type',
 'of',
 'artificial',
 'intelligence',
 'technology',
 'that',
 'can',
 'produce',
 'various',
 'types',
 'of',
 'content',
 ',',
 'including',
 'text',
 ',',
 'imagery',
 ',',
 'audio',
 'and',
 'synthetic',
 'data',
 '.',
 'The',
 'recent',
 'buzz',
 'around',
 'generative',
 'AI',
 'has',
 'been',
 'driven',
 'by',
 'the',
 'simplicity',
 'of',
 'new',
 'user',
 'interfaces',
 'for',
 'creating',
 'high-quality',
 'text',
 ',',
 'graphics',
 'and',
 'videos',
 'in',
 'a',
 'matter',
 'of',
 'seconds',
 '.',
 'The',
 'technology',
 ',',
 'it',
 'should',
 'be',
 'noted',
 ',',
 'is',
 'not',
 'brand-new',
 '.',
 'Generative',
 'AI',
 'was',
 'introduced',
 'in',
 'the',
 '1960s',
 'in',
 'chatbots',
 '.',
 'But',
 'it',
 'was',
 'not',
 'until',
 '2014',
 ',',
 'with',
 'the',
 'introduction',
 'of',
 'generative',
 'adversarial',
 'networks',
 ',',
 'or',
 'GANs',
 '--',
 'a',
 'type',
 'of',
 'machine',
 'learning',
 'algorithm',
 '--',
 

### Stemming

In [98]:
from nltk.stem import PorterStemmer

In [99]:
# Define a function to apply stemming and remove stopwords
def stemming(text):
  pt = PorterStemmer()
  return " ".join([pt.stem(x) for x in text.split() if x not in stopwords.words("english")])

In [100]:
# Test the stemming function with an example string
stemming("My name is saurav. What are you doing? go gone going")

'my name saurav. what doing? go gone go'

### Lemmatization

In [101]:
from nltk.stem import WordNetLemmatizer

In [102]:
# Download the WordNet corpus
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [103]:
# Define a function to apply lemmatization and remove stopwords
def lemma(text):
  lem = WordNetLemmatizer()
  return " ".join([lem.lemmatize(x) for x in text.split() if x not in stopwords.words("english")])

In [104]:
# Test the lemmatization function with an example string
lemma("My name is saurav. What are you doing? go gone going")

'My name saurav. What doing? go gone going'