In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


In [51]:
df = pd.read_csv('IMDB_Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [53]:
df['review'][0]
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [54]:
# lower case
df['review'] = df['review'].str.lower()

In [55]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


# Remove HTML Tags

In [56]:
import re
def remove_html_tags(text):
    pattern = re.compile(r'<.*?>')
    return pattern.sub(r'', text)


In [57]:
text = "<html><body><h1>This is a heading</h1><p>This is a paragraph.</p></body></html>"

In [58]:
remove_html_tags(text)


'This is a headingThis is a paragraph.'

In [59]:
df['review'] = df['review'].apply(remove_html_tags)
df['review'][1]

'a wonderful little production. the filming technique is very unassuming- very old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. the actors are extremely well chosen- michael sheen not only "has got all the polari" but he has all the voices down pat too! you can truly see the seamless editing guided by the references to williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. a masterful production about one of the great master\'s of comedy and his life. the realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. it plays on our knowledge and our senses, particularly with the scenes concerning orton and halliwell and the sets (particularly of their flat with halliwell\'s murals decorating every surface) are terribly well done.'

# REMOVE URL

In [60]:
def remove_url(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

In [61]:
df['review'] = df['review'].apply(remove_url)

df['review'][1]


'a wonderful little production. the filming technique is very unassuming- very old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. the actors are extremely well chosen- michael sheen not only "has got all the polari" but he has all the voices down pat too! you can truly see the seamless editing guided by the references to williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. a masterful production about one of the great master\'s of comedy and his life. the realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. it plays on our knowledge and our senses, particularly with the scenes concerning orton and halliwell and the sets (particularly of their flat with halliwell\'s murals decorating every surface) are terribly well done.'

In [62]:
text1 = "Check out this link: https://www.example.com and also visit http://example.org for more info."
remove_url(text1) 



'Check out this link:  and also visit  for more info.'

# PUNCTUATION HANDLING

In [63]:
import string, time
string.punctuation


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [64]:
exclude = string.punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [65]:
def remove_punc(text):
    for char in exclude:
        text = text.replace(char, '')
    return text

df['review'] = df['review'].apply(remove_punc)
df['review'][1]


'a wonderful little production the filming technique is very unassuming very oldtimebbc fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece the actors are extremely well chosen michael sheen not only has got all the polari but he has all the voices down pat too you can truly see the seamless editing guided by the references to williams diary entries not only is it well worth the watching but it is a terrificly written and performed piece a masterful production about one of the great masters of comedy and his life the realism really comes home with the little things the fantasy of the guard which rather than use the traditional dream techniques remains solid then disappears it plays on our knowledge and our senses particularly with the scenes concerning orton and halliwell and the sets particularly of their flat with halliwells murals decorating every surface are terribly well done'

# Chat_conversion_handle

In [66]:
chat_words = {
    'AFAIK':'As Far As I Know',
    'AFK':'Away From Keyboard',
    'ASAP':'As Soon As Possible'
}


{
    "FYI": "For Your Information",
    "ASAP": "As Soon As Possible",
    "BRB": "Be Right Back",
    "BTW": "By The Way",
    "OMG": "Oh My God",
    "IMO": "In My Opinion",
    "LOL": "Laugh Out Loud",
    "TTYL": "Talk To You Later",
    "GTG": "Got To Go",
    "TTYT": "Talk To You Tomorrow",
    "IDK": "I Don't Know",
    "TMI": "Too Much Information",
    "IMHO": "In My Humble Opinion",
    "ICYMI": "In Case You Missed It",
    "AFAIK": "As Far As I Know",
    "BTW": "By The Way",
    "FAQ": "Frequently Asked Questions",
    "TGIF": "Thank God It's Friday",
    "FYA": "For Your Action",
    "ICYMI": "In Case You Missed It",
}

{'FYI': 'For Your Information',
 'ASAP': 'As Soon As Possible',
 'BRB': 'Be Right Back',
 'BTW': 'By The Way',
 'OMG': 'Oh My God',
 'IMO': 'In My Opinion',
 'LOL': 'Laugh Out Loud',
 'TTYL': 'Talk To You Later',
 'GTG': 'Got To Go',
 'TTYT': 'Talk To You Tomorrow',
 'IDK': "I Don't Know",
 'TMI': 'Too Much Information',
 'IMHO': 'In My Humble Opinion',
 'ICYMI': 'In Case You Missed It',
 'AFAIK': 'As Far As I Know',
 'FAQ': 'Frequently Asked Questions',
 'TGIF': "Thank God It's Friday",
 'FYA': 'For Your Action'}

In [67]:
def chat_conversion(text):
    new_text = []
    for word in text.split():
        if word.upper() in chat_words:
            new_text.append(chat_words[word.upper()])
        else:
            new_text.append(word)
    return ' '.join(new_text)

In [68]:
chat_conversion("AFAIK we should meet ASAP")

'As Far As I Know we should meet As Soon As Possible'

# Incorrect Text Handling

In [69]:
from textblob import TextBlob



In [70]:
textblb = TextBlob("This is a smaple text with som misspelled wrds.")
corrected_text = textblb.correct()
corrected_text

TextBlob("His is a sample text with so misspelled words.")

# Stopwords


In [71]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\satya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [72]:
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [73]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return ' '.join(word for word in text.split() if word not in stop_words)

# Apply efficiently:
df['review'] = df['review'].apply(remove_stopwords)

In [74]:
remove_stopwords("This is a sample text with some common words abracadabra")

'This sample text common words abracadabra'

In [75]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative
4,petter matteis love time money visually stunni...,positive


In [76]:
df['review'] = df['review'].apply(remove_stopwords)


In [77]:
df['review'][0]

'one reviewers mentioned watching 1 oz episode youll hooked right exactly happened methe first thing struck oz brutality unflinching scenes violence set right word go trust show faint hearted timid show pulls punches regards drugs sex violence hardcore classic use wordit called oz nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda em city home manyaryans muslims gangstas latinos christians italians irish moreso scuffles death stares dodgy dealings shady agreements never far awayi would say main appeal show due fact goes shows wouldnt dare forget pretty pictures painted mainstream audiences forget charm forget romanceoz doesnt mess around first episode ever saw struck nasty surreal couldnt say ready watched developed taste oz got accustomed high levels graphic violence violence injustice crooked guards wholl sold nickel inmates wholl kill order get away well mannered middle 

# Emojis Handling

In [78]:
import re 
def remove_emojis(text):
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub(r'', text)

In [79]:
df['review'] = df['review'].apply(remove_emojis)

In [80]:
text = "I love programming! 👀👀👀🔥🌊🗿🗣️😭😄🚀 #coding"
remove_emojis(text)

'I love programming!  #coding'

In [81]:
import emoji
emoji.demojize(text)
#OUTPUT:

'I love programming! :eyes::eyes::eyes::fire::water_wave::moai::speaking_head::loudly_crying_face::grinning_face_with_smiling_eyes::rocket: #coding'

# Tokenization

In [82]:
text1 = "i am going to delhi"
text1.split()

['i', 'am', 'going', 'to', 'delhi']

In [83]:
# sentence tokenization
sentence = "Hello there How are you doing today. I hope you're having a great day. AHAHAHAHA EAT SHIT"
sentence.split('.')

['Hello there How are you doing today',
 " I hope you're having a great day",
 ' AHAHAHAHA EAT SHIT']

## NLTK

In [84]:
import nltk

In [85]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\satya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [86]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [87]:
import os
print(os.listdir(r"C:\Users\satya\AppData\Roaming\nltk_data"))


['corpora', 'tokenizers']


In [88]:
import os
print(os.listdir(r"C:\Users\satya\AppData\Roaming\nltk_data\tokenizers"))


['punkt', 'punkt.zip']


In [89]:
print(os.listdir(r"C:\Users\satya\AppData\Roaming\nltk_data\tokenizers\punkt"))


['.DS_Store', 'czech.pickle', 'danish.pickle', 'dutch.pickle', 'english.pickle', 'estonian.pickle', 'finnish.pickle', 'french.pickle', 'german.pickle', 'greek.pickle', 'italian.pickle', 'malayalam.pickle', 'norwegian.pickle', 'polish.pickle', 'portuguese.pickle', 'PY3', 'README', 'russian.pickle', 'slovene.pickle', 'spanish.pickle', 'swedish.pickle', 'turkish.pickle']


In [90]:
from nltk.tokenize.punkt import PunktSentenceTokenizer
import pickle

# Directly load the tokenizer object
with open(r"C:\Users\satya\AppData\Roaming\nltk_data\tokenizers\punkt\english.pickle", "rb") as f:
    tokenizer = pickle.load(f)   # <-- this is already a PunktSentenceTokenizer

sent1 = "Hello there How are you doing today. I hope you're having a great day. AHAHAHAHA EAT SHIT"
print(tokenizer.tokenize(sent1))


['Hello there How are you doing today.', "I hope you're having a great day.", 'AHAHAHAHA EAT SHIT']


In [91]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text, token.pos_, token.dep_)   


Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN nsubj
startup VERB ccomp
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


In [92]:
import transformers
print(transformers.__version__)


4.44.2


In [93]:
from transformers import AutoTokenizer




In [94]:
from transformers import AutoTokenizer #ignore warning vsc on drugs 🗿

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print(tokenizer.tokenize("Elon Musk founded SpaceX in 2002."))


['el', '##on', 'mu', '##sk', 'founded', 'space', '##x', 'in', '2002', '.']


