In [3]:
import pandas as pd

In [4]:

!pip install pandas



In [5]:
data=pd.read_csv("https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/master/IMDB-Dataset.csv")

In [6]:
data.head() #prints first 5 rows of the dataset

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
data.tail() #prints last 5 rows of the dataset

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [8]:
data.shape #prints the number of rows and columns in the dataset

(50000, 2)

In [10]:
data.sample(5) #prints 5 random rows from the dataset

Unnamed: 0,review,sentiment
30676,Bette Davis turns in a coldly amusing performa...,positive
12104,"From the beginning of this film,with it's ""The...",negative
44261,"At first glance a film like Northfork, a town ...",positive
22318,Carl Panzram lived an amazing life and scribbl...,negative
29394,The movie was much better than the other revie...,positive


In [11]:
data["review"][0] #prints the review of the first row

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

### Data Preprocessing Steps

1. **Tokenization**: Splitting the text into individual words or tokens. This helps in analyzing the text at a granular level.
2. **Lowercase**: Converting all text to lowercase to ensure uniformity and avoid case-sensitive mismatches.
3. **Uppercase**: Converting text to uppercase if required for specific analysis.
4. **Emojis**: Handling emojis by either removing them or converting them to text representations.
5. **Punctuation**: Removing punctuation marks to clean the text and focus on the words.
6. **HTML**: Stripping HTML tags from the text to avoid irrelevant content.
7. **URLs**: Removing URLs to clean the text from unnecessary links.
8. **Stopwords**: Removing common words (e.g., "and", "the") that do not contribute much to the meaning of the text.
9. **Abbreviations or Slang**: Expanding abbreviations and converting slang to their standard forms for better understanding.
10. **Stemming and Lemmatization**: Reducing words to their root forms to standardize different variations of the same word.
11. **Spelling Correction**: Correcting spelling errors to improve text quality.
12. **Whitespaces**: Removing extra whitespaces to clean the text and ensure consistency.

In [11]:
# Data Preprocessing (Text Cleaning)

In [12]:
# Lowercasing the text
data["review"][3].lower()

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [13]:
# Uppercasing the text
data["review"][3].upper()

"BASICALLY THERE'S A FAMILY WHERE A LITTLE BOY (JAKE) THINKS THERE'S A ZOMBIE IN HIS CLOSET & HIS PARENTS ARE FIGHTING ALL THE TIME.<BR /><BR />THIS MOVIE IS SLOWER THAN A SOAP OPERA... AND SUDDENLY, JAKE DECIDES TO BECOME RAMBO AND KILL THE ZOMBIE.<BR /><BR />OK, FIRST OF ALL WHEN YOU'RE GOING TO MAKE A FILM YOU MUST DECIDE IF ITS A THRILLER OR A DRAMA! AS A DRAMA THE MOVIE IS WATCHABLE. PARENTS ARE DIVORCING & ARGUING LIKE IN REAL LIFE. AND THEN WE HAVE JAKE WITH HIS CLOSET WHICH TOTALLY RUINS ALL THE FILM! I EXPECTED TO SEE A BOOGEYMAN SIMILAR MOVIE, AND INSTEAD I WATCHED A DRAMA WITH SOME MEANINGLESS THRILLER SPOTS.<BR /><BR />3 OUT OF 10 JUST FOR THE WELL PLAYING PARENTS & DESCENT DIALOGS. AS FOR THE SHOTS WITH JAKE: JUST IGNORE THEM."

In [None]:
# Why and what is text preprocessing? 
'''
Text preprocessing is the process of cleaning and preparing text data. It is important to clean the text data before feeding it to machine learning algorithms. Text preprocessing is done to remove unwanted symbols, characters, and other noise from the text data. It helps in improving the performance of machine learning models. 

Text preprocessing includes the following steps:
1. Lowercasing the text
2. Removing special characters
3. Removing stopwords
4. Tokenization
5. Lemmatization
6. Stemming
7. Removing HTML tags
8. Removing URLs
9. Removing emojis
10. Removing numbers
11. Removing extra whitespaces
12. Removing accented characters
13. Removing email addresses


These days LLM Models (GPT, Claudia, etc) - they have the capability to do text preprocessing on their own. So data preprocessing is not required for them but for traditional machine learning models, it is required. Sometimes we need to do text preprocessing for LLM models as well especially when we are working with a small dataset or hallucination is happening and the data is too messy.

'''


In [14]:
# Lowercasing the text in the review column
data["review"]=data["review"].str.lower() #Why use str.lower() instead of lower()? Because lower() is a string method and str.lower() is a pandas method.

In [15]:
data["review"]

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. <br /><br />the...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

#### Removing HTML and URLs from the text (Using Regular Expressions)

In [16]:
text = """<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1.0"><title>Welcome to My Website</title><style>body{font-family:'Arial',sans-serif;background-color:#f0f0f0;color:#333;margin:20px}h1{color:#007bff}p{line-height:1.5}</style></head><body><header><h1>Welcome to My Awesome Website!</h1></header><main><p>This is a sample HTML document created for demonstration purposes.</p><p>Feel free to explore and enjoy the content on this website.</p></main><footer><p>&copy; 2024 My Website. All rights reserved.</p></footer></body></html>"""


In [17]:
import re

# Function to remove HTML tags from the text
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub('', text) #sub() is a method that replaces the matched pattern with the specified text. In this case, we are replacing the HTML tags with an empty string.

In [18]:
# Driver Function 
remove_html_tags(text)

"Welcome to My Websitebody{font-family:'Arial',sans-serif;background-color:#f0f0f0;color:#333;margin:20px}h1{color:#007bff}p{line-height:1.5}Welcome to My Awesome Website!This is a sample HTML document created for demonstration purposes.Feel free to explore and enjoy the content on this website.&copy; 2024 My Website. All rights reserved."

In [19]:
# Applying the remove_html_tags function to the review column
data["review"]=data["review"].apply(remove_html_tags)

# Printing the first 5 rows of the dataset after removing HTML tags
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [20]:
# Removing the URLs from the text

# Sample text with URLs
text = "Check out this amazing website: https://www.example.com"

# Function to remove URLs from the text
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+') #Regular expression pattern to match URLs starting with http, https, or www.
    return url_pattern.sub("", text)


In [21]:
# Driver Code
remove_urls(text)

'Check out this amazing website: '

In [24]:
# Note: We can remove the URL using the regex

##To handle punctuation

In [1]:
import string

In [2]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [3]:
exclude=string.punctuation

In [4]:
for char in exclude:
    print(char)
    

!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
:
;
<
=
>
?
@
[
\
]
^
_
`
{
|
}
~


In [9]:
def remove_punc(text):
    for char in exclude:
        text = text.replace(char, "")
    return text

In [None]:

    for char in exclude:
        text = text.replace(char, "")
    return text

In [10]:
text="string @ *() with punctuaion.!"

In [11]:
remove_punc(text)

'string   with punctuaion'

In [12]:
text="string @ *() with punctuaion.!"

In [None]:
remove_punc(text2)

In [13]:
text2="my name is su@@@nn$$$&&%%y"

In [14]:
remove_punc(text2)

'my name is sunny'

In [None]:

data["review"]=data["review"].apply(remove_punc)

NameError: name 'data' is not defined

In [None]:
data["review"]

In [16]:
text1="FYI this is not true"
text2="LAMO the class was so funny"
text3="I want it ASAP"

In [17]:
chat_words={
"AFAIK":"As Far As I Know",
"AFK": "Away From Keyboard",
"ASAP":"As Soon As Possible",
"BTW":"By The Way",
"B4":"Before",
"LAMO":"Laugh My A.. Off",
"FYI":"For your information"    
}

In [18]:
chat_words["ASAP"]

'As Soon As Possible'

In [21]:
def chat_conversion(text):
    new_text=[]
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [22]:
text1

'FYI this is not true'

In [23]:
" ".join(["sunny","savita","data scientist","ML engineer"])

'sunny savita data scientist ML engineer'

In [24]:
chat_conversion(text1)

'For your information this is not true'

In [25]:
text2

'LAMO the class was so funny'

In [26]:
chat_conversion(text2)

'Laugh My A.. Off the class was so funny'

In [27]:
text3

'I want it ASAP'

In [28]:
chat_conversion(text3)

'I want it As Soon As Possible'

##SPACY

In [29]:
from textblob import TextBlob

In [34]:
text="I'm brav and srong person"

In [35]:
textblob=TextBlob(text)

In [36]:
textblob.correct().string

"I'm brave and strong person"

In [37]:
import nltk

In [38]:
from nltk.corpus import stopwords

In [39]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prade\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [46]:
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [57]:
def remove_stop_words(text):
    new_text=[]
    for words in text.split():
        if words in stopwords.words("english"):
            new_text.append("")
        else:
            new_text.append(words.strip())
    return " ".join(new_text).replace("  "," ")       
    

In [60]:
text="Hi i m pradeep gore and i am working as a data sciensitst and genai engineer. now tell me who is pradeep?"

In [61]:
remove_stop_words(text)

'Hi  pradeep gore  working  data sciensitst genai engineer. tell  pradeep?'

In [62]:
original_text = "Hello,😊 how are you today? 🌟"

In [63]:
import emoji

In [64]:
emoji.demojize(original_text)

'Hello,:smiling_face_with_smiling_eyes: how are you today? :glowing_star:'

In [65]:
def remove_emoji(text):
    clean_text=emoji.demojize(text)
    return clean_text

In [66]:
text="""Hello, 😃💁😃💁 People
•🐻🌻 Animals
•🍔🍹 Food
•🎷⚽ Activities
•🚘🌇 Travel
•💡🎉 Objects
•💖🔣 Symbols
•🎌🏳️‍🌈 Flags"""

In [67]:
print(remove_emoji(text))

Hello, :grinning_face_with_big_eyes::person_tipping_hand::grinning_face_with_big_eyes::person_tipping_hand: People
•:bear::sunflower: Animals
•:hamburger::tropical_drink: Food
•:saxophone::soccer_ball: Activities
•:oncoming_automobile::sunset: Travel
•:light_bulb::party_popper: Objects
•:sparkling_heart::input_symbols: Symbols
•:crossed_flags::rainbow_flag: Flags


In [68]:
emoji.is_emoji("thumbs up")

False

In [69]:
emoji.is_emoji("👍")

True

##Tokenization

In [70]:
text = "i'm pradeeep gore and working as data scientist"

In [71]:
text.split()

["i'm", 'pradeeep', 'gore', 'and', 'working', 'as', 'data', 'scientist']

In [73]:
text.split(".")

["i'm pradeeep gore and working as data scientist"]

In [74]:
text = "i'm pradeeep gore and working as data scientist.I live in banglore and workin into multiple domains"

In [75]:
text.split(".")

["i'm pradeeep gore and working as data scientist",
 'I live in banglore and workin into multiple domains']

### can i perform word tokenization using regex

In [76]:
text="I'm pradeep and live in banglore"

In [77]:
import re
re.findall("[\w]",text)


['I',
 'm',
 'p',
 'r',
 'a',
 'd',
 'e',
 'e',
 'p',
 'a',
 'n',
 'd',
 'l',
 'i',
 'v',
 'e',
 'i',
 'n',
 'b',
 'a',
 'n',
 'g',
 'l',
 'o',
 'r',
 'e']

In [78]:
text="I'm going to visit pune tommorrow morning"

In [79]:
from nltk.tokenize import word_tokenize,sent_tokenize

In [80]:
word_tokenize(text)

['I', "'m", 'going', 'to', 'visit', 'pune', 'tommorrow', 'morning']