In [1]:
import kagglehub
import pandas as pd
import os

# Download the dataset (kagglehub returns the local directory holding the files)
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")
print("Path to dataset files:", path)

# List all files in the downloaded directory
files = os.listdir(path)
print("Files in dataset directory:", files)

# Load the first CSV found. os.listdir() returns entries in arbitrary order,
# so sort to make "first" deterministic if the dataset ever ships several CSVs.
csv_files = sorted(name for name in files if name.endswith(".csv"))
if not csv_files:
    # Fail here with a clear message instead of a NameError on `df` further down.
    raise FileNotFoundError(f"No CSV file found in {path!r}; directory contains: {files}")
df = pd.read_csv(os.path.join(path, csv_files[0]))

print(df.head())  # Display the first few rows of the DataFrame


Downloading from https://www.kaggle.com/api/v1/datasets/download/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews?dataset_version_number=1...


100%|██████████| 25.7M/25.7M [00:00<00:00, 139MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1
Files in dataset directory: ['IMDB Dataset.csv']
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [2]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df.shape

(50000, 2)

## lowercase

In [4]:
df['review'][3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

In [5]:
df['review'] = df['review'].str.lower()

In [6]:
df['review'][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

### removing html tags

In [7]:
import re
def remove_html_tags(text):
    """Return `text` with every `<...>` span deleted.

    The match is non-greedy, so each tag is removed on its own and the
    text between two tags is kept. Nothing is inserted in place of a tag.
    """
    tag_regex = r'<.*?>'
    return re.sub(tag_regex, '', text)

In [8]:
df['review'] = df['review'].apply(remove_html_tags)

In [9]:
df['review'][10]

'phil the alien is one of those quirky films where the humour is based around the oddness of everything rather than actual punchlines.at first it was very odd and pretty funny but as the movie progressed i didn\'t find the jokes or oddness funny anymore.its a low budget film (thats never a problem in itself), there were some pretty interesting characters, but eventually i just lost interest.i imagine this film would appeal to a stoner who is currently partaking.for something similar but better try "brother from another planet"'

### removing urls

In [11]:
import re
def remove_url(text):
    """Return `text` with URLs deleted.

    A URL is anything starting with `http://`, `https://` or `www.` and
    running up to the next whitespace character.
    """
    url_regex = r'https?://\S+|www\.\S+'
    return re.sub(url_regex, '', text)

### punctuation handling


In [13]:
import string, time



In [14]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
exclude = string.punctuation
exclude


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [24]:
# Slow variant: one full str.replace pass over the text per punctuation
# character, i.e. O(len(chars) * len(text)).
def remove_pun(text, chars=string.punctuation):
    """Return `text` with every character listed in `chars` removed.

    Args:
        text: The string to clean.
        chars: Characters to delete. Defaults to `string.punctuation`, so the
            function no longer depends on a global defined in another cell.

    Returns:
        A new string; nothing is inserted where a character was removed.
    """
    for char in chars:
        text = text.replace(char, '')
    return text


In [25]:
text = 'string. with, . puncuation?'

In [26]:
remove_pun(text)

'string with  puncuation'

In [29]:
# Fast variant: a single pass over the text using a translation table.
# Still linear in len(text) (not constant), but without one pass per character.
def remove_punc1(text, chars=string.punctuation):
    """Return `text` with every character listed in `chars` removed.

    Args:
        text: The string to clean.
        chars: Characters to delete. Defaults to `string.punctuation`, so the
            function no longer depends on a global defined in another cell.

    Returns:
        A new string; nothing is inserted where a character was removed.
    """
    # The third argument of str.maketrans lists characters to map to None.
    return text.translate(str.maketrans('', '', chars))

In [27]:
df['review'][5]

'probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times in the last 25 years. paul lukas\' performance brings tears to my eyes, and bette davis, in one of her very few truly sympathetic roles, is a delight. the kids are, as grandma says, more like "dressed-up midgets" than children, but that only makes them more fun to watch. and the mother\'s slow awakening to what\'s happening in the world and under her own roof is believable and startling. if i had a dozen thumbs, they\'d all be "up" for this movie.'

In [30]:
remove_punc1(df['review'][5])

'probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie'

In [31]:
# Chat abbreviation -> expansion. Keys are upper-case.
chat_shortcuts = dict(
    ASAP="As Soon As Possible",
    BRB="Be Right Back",
    BTW="By The Way",
    DM="Direct Message",
    FYI="For Your Information",
    GTG="Got To Go",
    ICYMI="In Case You Missed It",
    IDK="I Don't Know",
    IKR="I Know, Right?",
    IMO="In My Opinion",
    IRL="In Real Life",
    JK="Just Kidding",
    LMK="Let Me Know",
    LOL="Laugh Out Loud",
    NP="No Problem",
    NSFW="Not Safe For Work",
    OMW="On My Way",
    OMG="Oh My God",
    ROFL="Rolling On the Floor Laughing",
)

# Show each abbreviation next to its meaning, one per line.
for abbreviation in chat_shortcuts:
    print(f"{abbreviation}: {chat_shortcuts[abbreviation]}")


ASAP: As Soon As Possible
BRB: Be Right Back
BTW: By The Way
DM: Direct Message
FYI: For Your Information
GTG: Got To Go
ICYMI: In Case You Missed It
IDK: I Don't Know
IKR: I Know, Right?
IMO: In My Opinion
IRL: In Real Life
JK: Just Kidding
LMK: Let Me Know
LOL: Laugh Out Loud
NP: No Problem
NSFW: Not Safe For Work
OMW: On My Way
OMG: Oh My God
ROFL: Rolling On the Floor Laughing


In [33]:
def chat_conversation(text):
    """Expand chat abbreviations found in `text`.

    The text is split on whitespace; each token whose upper-cased form is a
    key of `chat_shortcuts` is replaced by the mapped phrase, and all other
    tokens are kept unchanged. Tokens are re-joined with single spaces.
    """
    expanded = [chat_shortcuts.get(token.upper(), token) for token in text.split()]
    return " ".join(expanded)

In [34]:
chat_conversation('I know I will be delivering ASAP')

'I know I will be delivering As Soon As Possible'

### Incorrect Text Handling

In [39]:
from textblob import TextBlob


In [40]:
# Deliberately misspelled sample sentence used to demonstrate spelling correction.
incorrect_text = 'ceertain conditionas duriing seveal ggenerations aree moodified in the saame maner.'
textBlb = TextBlob(incorrect_text)
# The last expression is the cell output: the corrected text as a plain string.
textBlb.correct().string

'certain conditions during several generations are modified in the same manner.'

### Stopwords

In [41]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [42]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [43]:
len(stopwords.words('english'))

179

* function to remove stopwords



In [44]:
# Stopword sets keyed by language, filled lazily on first use.
_STOPWORDS_BY_LANGUAGE = {}


def _get_stopwords(language='english'):
    """Return the NLTK stopword list for `language` as a frozenset, loading it once."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def remove_stopwords(text, stop_words=None):
    """Return `text` with stopwords removed.

    Args:
        text: Input string. It is split on whitespace and matching is
            case-sensitive, so lower-case the text first when using the
            (lower-case) NLTK list.
        stop_words: Optional collection of words to drop. Defaults to NLTK's
            English stopword list.

    Returns:
        The remaining words joined by single spaces. Removed words leave no
        empty placeholder behind, so the result has no runs of spaces.
    """
    if stop_words is None:
        # The original re-read the list via stopwords.words('english') and
        # scanned it for every single word, which made applying this to
        # 50k reviews impractically slow.
        stop_words = _get_stopwords('english')
    return " ".join(word for word in text.split() if word not in stop_words)

In [46]:
df['review'].apply(remove_stopwords)


In [None]:
import re
# Compiled once when the cell runs rather than on every call.
# The original class contained the range U+24C2-U+1F251, which spans CJK,
# Hangul and most of the BMP and therefore deleted ordinary non-Latin text.
# It is replaced by the symbol blocks below. This is a curated list, not the
# full Unicode emoji set.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001F000-\U0001F251"  # mahjong/cards, enclosed alphanumerics (incl. flags), enclosed ideographs
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U00002B05-\U00002B55"  # arrows, squares, star, circle
    "\U000024C2"             # circled M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return `text` with emoji and pictographic symbols removed.

    Only characters in the symbol blocks listed in `_EMOJI_PATTERN` are
    deleted; letters of any script (Latin, CJK, Hangul, ...) are kept.
    Nothing is inserted in place of a removed character.
    """
    return _EMOJI_PATTERN.sub('', text)

## Tokenization

## 1. Using the split function

In [47]:
# word tokenization
sent1 = 'I am going to delhi'
sent1.split()

['I', 'am', 'going', 'to', 'delhi']

In [49]:
# sentence tokenization
sent2 = 'I am going to delhi. I will stay there for 3 days. Let\'s hope the trip to be great'
sent2.split('.')

['I am going to delhi',
 ' I will stay there for 3 days',
 " Let's hope the trip to be great"]

### Tokenization with the help of Regular Expression

In [50]:
import re
sent3 = 'I am going to delhi!'
# Raw string: "\w" inside a normal string literal is an invalid escape
# sequence, which newer Python versions warn about.
# The pattern keeps runs of word characters and apostrophes and drops the rest.
tokens = re.findall(r"[\w']+", sent3)
tokens

['I', 'am', 'going', 'to', 'delhi']

In [51]:
# NOTE: the opening delimiter is four quote characters, so the string itself
# begins with a literal '"' (visible in the first sentence of the output).
text = """" Lorem Ipsum is simply dummy text of the printing and typesetting industry? Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book
Lorem Ipsum is simply dummy text of the printing and typesetting industry? Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book
"""
# Split wherever '.', '!' or '?' is followed by a space. A newline is not a
# space, so a sentence that ends at a line break is not separated.
sentences = re.split('[.!?] ', text)
sentences

['" Lorem Ipsum is simply dummy text of the printing and typesetting industry',
 "Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book\nLorem Ipsum is simply dummy text of the printing and typesetting industry",
 "Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book\n"]

## Tokenization with NLTK

In [52]:
from  nltk.tokenize import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Tokenization with spaCy

In [53]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [54]:
doc1 = nlp(text)

In [55]:
print(doc1)

" Lorem Ipsum is simply dummy text of the printing and typesetting industry? Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book
Lorem Ipsum is simply dummy text of the printing and typesetting industry? Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book



## Stemmer

In [56]:
from nltk.stem.porter import PorterStemmer

In [57]:
# Shared stemmer instance used by stem_words.
ps = PorterStemmer()


def stem_words(text):
    """Stem every whitespace-separated token of `text` with the Porter stemmer.

    Tokens are produced by `str.split()`, so punctuation stays attached to
    its word. The stems are re-joined with single spaces.
    """
    stems = map(ps.stem, text.split())
    return " ".join(stems)

In [58]:
sample  = "walk walks walking walked"
stem_words(sample)

'walk walk walk walk'

In [59]:
text = 'I hope that, when I have built up my savings, I will be able to travel to Hawai.'
print(text)

I hope that, when I have built up my savings, I will be able to travel to Hawai.


In [60]:
stem_words(text)

'i hope that, when i have built up my savings, i will be abl to travel to hawai.'

# Lemmatization

In [61]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running and eating at same time. He has abad habit of swimming after playing long hours in the park "
punctuations="?:!.,;"

# Build a filtered list instead of calling list.remove() while iterating:
# removing from a list during iteration skips the element after each removal,
# and remove() deletes the first equal item rather than the current one.
# A set gives exact single-character matching; `word in punctuations` on the
# string was a substring test.
punctuation_marks = set(punctuations)
sentence_words = [
    word for word in nltk.word_tokenize(sentence) if word not in punctuation_marks
]

print("{0:20}{1:20}".format("Word","Lemma"))
for word in sentence_words:
    # pos="v" lemmatizes every token as a verb (e.g. "was" -> "be").
    print("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word, pos="v")))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Word                Lemma               
He                  He                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
at                  at                  
same                same                
time                time                
He                  He                  
has                 have                
abad                abad                
habit               habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
the                 the                 
park                park                
