In [1]:
import pandas as pd

In [2]:
data=pd.read_csv("https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/master/IMDB-Dataset.csv")

In [3]:
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


Understanding

1. tokenization
2. lowercase
3. upercase 
4. emojis 
5. pantualation
6. html - urls
7. Stopwatch 
8. Stemming and Lemmatization
9. Spelling correction
11. Whitespace

In [4]:
data['review'][3].upper()

"BASICALLY THERE'S A FAMILY WHERE A LITTLE BOY (JAKE) THINKS THERE'S A ZOMBIE IN HIS CLOSET & HIS PARENTS ARE FIGHTING ALL THE TIME.<BR /><BR />THIS MOVIE IS SLOWER THAN A SOAP OPERA... AND SUDDENLY, JAKE DECIDES TO BECOME RAMBO AND KILL THE ZOMBIE.<BR /><BR />OK, FIRST OF ALL WHEN YOU'RE GOING TO MAKE A FILM YOU MUST DECIDE IF ITS A THRILLER OR A DRAMA! AS A DRAMA THE MOVIE IS WATCHABLE. PARENTS ARE DIVORCING & ARGUING LIKE IN REAL LIFE. AND THEN WE HAVE JAKE WITH HIS CLOSET WHICH TOTALLY RUINS ALL THE FILM! I EXPECTED TO SEE A BOOGEYMAN SIMILAR MOVIE, AND INSTEAD I WATCHED A DRAMA WITH SOME MEANINGLESS THRILLER SPOTS.<BR /><BR />3 OUT OF 10 JUST FOR THE WELL PLAYING PARENTS & DESCENT DIALOGS. AS FOR THE SHOTS WITH JAKE: JUST IGNORE THEM."

In [5]:
data['review'] = data['review'].str.lower()

In [6]:
data

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


### 2. tackle html tags and url

In [7]:
hlmt_sentence = """<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1.0"><title>Welcome to My Website</title><style>body{font-family:'Arial',sans-serif;background-color:#f0f0f0;color:#333;margin:20px}h1{color:#007bff}p{line-height:1.5}</style></head><body><header><h1>Welcome to My Awesome Website!</h1></header><main><p>This is a sample HTML document created for demonstration purposes.</p><p>Feel free to explore and enjoy the content on this website.</p></main><footer><p>&copy; 2024 My Website. All rights reserved.</p></footer></body></html>"""

In [8]:
import re 

In [9]:
def remove_html_tags(text):
    pattern = re.compile('<.*?>') 
    return pattern.sub("", text)

In [10]:
remove_html_tags(hlmt_sentence)

"Welcome to My Websitebody{font-family:'Arial',sans-serif;background-color:#f0f0f0;color:#333;margin:20px}h1{color:#007bff}p{line-height:1.5}Welcome to My Awesome Website!This is a sample HTML document created for demonstration purposes.Feel free to explore and enjoy the content on this website.&copy; 2024 My Website. All rights reserved."

In [11]:
data['review']= data['review'].apply(remove_html_tags)

In [12]:
data['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

In [13]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub("", text)

In [14]:
data['review']= data['review'].apply(remove_url)

In [15]:
data['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

### Handling Punctuation

In [16]:
import string 
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
exclude = string.punctuation

def remove_punc(text):
    for char in exclude:
        text = text.replace(char, "")
    return text
    

In [18]:
text = remove_punc("string @ *() with punctuaion.!")
text

'string   with punctuaion'

In [19]:
data['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

In [20]:
data["review"]=data["review"].apply(remove_punc)
data["review"]

0        one of the other reviewers has mentioned that ...
1        a wonderful little production the filming tech...
2        i thought this was a wonderful way to spend ti...
3        basically theres a family where a little boy j...
4        petter matteis love in the time of money is a ...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    i am a catholic taught in parochial elementary...
49998    im going to have to disagree with the previous...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

In [21]:
text1="FYI this is not true"
text2="LAMO the class was so funny"
text3="I want it ASAP"

In [22]:
chat_words={
"AFAIK":"As Far As I Know",
"AFK": "Away From Keyboard",
"ASAP":"As Soon As Possible",
"BTW":"By The Way",
"B4":"Before",
"LAMO":"Laugh My A.. Off",
"FYI":"For your information"    
}

In [23]:
chat_words["ASAP"]


'As Soon As Possible'

In [24]:
def chat_conversion(text):
    new_text=[]
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [25]:
chat_conversion(text1)



'For your information this is not true'

### Stop words Removal

In [26]:
from nltk.corpus import stopwords

In [27]:
def remove_stop_words(text):
    new_text = []
    for words in text.split():
        if words in stopwords.words('english'):
            new_text.append("")
        else:
            new_text.append(words.strip()) ## Removing unneccessary white space 
    return " ".join(new_text)

In [28]:
text = "Hi I am Shubham Prajapti and I am learning text-preprocessing. Now tell me What Shubham is learning?"

In [29]:
remove_stop_words(text)

'Hi I  Shubham Prajapti  I  learning text-preprocessing. Now tell  What Shubham  learning?'

### Text Preprocessing with TextBlob

In [30]:
text="this is my processing notebook pleae download this ntebook"
from textblob import TextBlob
textblob=TextBlob(text)
textblob.correct().string


'this is my processing notebook please download this notebook'

In [None]:
text2="Here is my name that is Shubham and he is good leaner"
textblob=TextBlob(text2)
textblob.correct().string

'here is my me that is Shubham and he is good leader'

In [32]:
text3="I'm brave ad stong prson"
textblob=TextBlob(text3)
textblob.correct().string

"I'm brave ad strong person"

### Emojis removal

In [33]:
emojis_text = "Hey! 😊 How’s your day going? 🌞 I’ve been feeling a little 😔 lately, but I’m trying to stay positive 💪. Life can be tough sometimes, but we gotta keep pushing forward 💫. I’m looking forward to the weekend 🌈, hopefully some fun and relaxation 🛋️. What about you? Any exciting plans? 💭 Let me know, I'd love to hear about them! 🌟"

In [34]:
import emoji
emoji.demojize(emojis_text)

"Hey! :smiling_face_with_smiling_eyes: How’s your day going? :sun_with_face: I’ve been feeling a little :pensive_face: lately, but I’m trying to stay positive :flexed_biceps:. Life can be tough sometimes, but we gotta keep pushing forward :dizzy:. I’m looking forward to the weekend :rainbow:, hopefully some fun and relaxation :couch_and_lamp:. What about you? Any exciting plans? :thought_balloon: Let me know, I'd love to hear about them! :glowing_star:"

In [37]:
def remove_emoji(text):
    clean_text=emoji.demojize(text)
    return clean_text

In [38]:
text="""Hello, 😃💁😃💁 People
•🐻🌻 Animals
•🍔🍹 Food
•🎷⚽ Activities
•🚘🌇 Travel
•💡🎉 Objects
•💖🔣 Symbols
•🎌🏳️‍🌈 Flags"""

In [39]:
print(remove_emoji(text))

Hello, :grinning_face_with_big_eyes::person_tipping_hand::grinning_face_with_big_eyes::person_tipping_hand: People
•:bear::sunflower: Animals
•:hamburger::tropical_drink: Food
•:saxophone::soccer_ball: Activities
•:oncoming_automobile::sunset: Travel
•:light_bulb::party_popper: Objects
•:sparkling_heart::input_symbols: Symbols
•:crossed_flags::rainbow_flag: Flags


In [40]:
emoji.is_emoji("thumbs up")

False

In [None]:
emoji.is_emoji("👍")

True

### Tokenization

In [1]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
text="My name is Shubham and I am learning text-preprocessing"
word_tokenize(text)

['My',
 'name',
 'is',
 'Shubham',
 'and',
 'I',
 'am',
 'learning',
 'text-preprocessing']

In [None]:
sentence = "ChatGPT is an advanced AI language model developed by OpenAI that uses deep learning to generate human-like text based on input. It is designed to assist with a wide range of tasks, such as answering questions, providing explanations, and creating content in various formats. With its ability to understand and generate natural language, ChatGPT has been integrated into applications like customer service, education, and entertainment. The model is built on GPT (Generative Pre-trained Transformer) architecture, which enables it to learn from vast amounts of text data to improve its performance. While ChatGPT is powerful and versatile, it still has limitations, including occasional inaccuracies and biases in its responses."

sent_tokenize(sentence) 

['ChatGPT is an advanced AI language model developed by OpenAI that uses deep learning to generate human-like text based on input.',
 'It is designed to assist with a wide range of tasks, such as answering questions, providing explanations, and creating content in various formats.',
 'With its ability to understand and generate natural language, ChatGPT has been integrated into applications like customer service, education, and entertainment.',
 'The model is built on GPT (Generative Pre-trained Transformer) architecture, which enables it to learn from vast amounts of text data to improve its performance.',
 'While ChatGPT is powerful and versatile, it still has limitations, including occasional inaccuracies and biases in its responses.']