Text Preprocessing

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import pandas as pd
import os


In [3]:
df = pd.read_csv("/workspaces/Data-Scientist/NLP TP/.kaggle/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [10]:
df['review'] = df['review'].str.lower()
df


Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


**Remove HTML Tags**

In [11]:
import re 
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

In [12]:
df['review'].apply(remove_html_tags)

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

**Remove URLs**

In [14]:
def remove_url(text):
    pattern = re.compile(r'http[s]?://\S+|www\.\S+')
    return pattern.sub(r'', text)

**Remove Punctuation**

In [15]:
import string, time
string.punctuation


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [16]:
exclude = string.punctuation

In [18]:
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', exclude))

In [19]:
text = "Hello! How are you doing?"
remove_punctuation(text)

'Hello How are you doing'

**Spelling correction**

In [24]:
from textblob import TextBlob
incorrect_text = "I havv goood speling!"
corrected_text = TextBlob(incorrect_text).correct()
print(corrected_text)


I have good spelling!


**Remove stop words**

In [27]:
import nltk

In [28]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [30]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [35]:
def remove_stopwords(text):
    text_list = []
    for word in text.split():
        if word  in  stopwords.words('english'):
            text_list.append('')
        else:
            text_list.append(word)
    x = text_list[:]
    text_list.clear()
    return ' '.join(x)



In [40]:
text = " the  and  a if are stopwords, computer is not"
remove_stopwords(text)

'     stopwords, computer  '

**Handling Emojis**

In [41]:
def remove_emojis(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [42]:
text = "I am having a 🎉  I love you ❤️"
remove_emojis(text)

'I am having a   I love you '

In [48]:
#convert emoji to text
import emoji
print (emoji.demojize('Python is ❤️'))

Python is :red_heart:


**Tokenization**

In [49]:
#split function
sentence_1 = "I am ahmed nazar"
sentence_1.split()

['I', 'am', 'ahmed', 'nazar']

In [50]:
#Regular expression

import re
text = "The rain in Spain!"
token = re.findall(r'\b\w+\b', text)
print(token)


['The', 'rain', 'in', 'Spain']


In [55]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
#NLTK

from nltk.tokenize import word_tokenize, sent_tokenize
text1 = 'I am ahmed nazar'
word_tokenize(text1)

In [None]:
#spacy

    

**Stemming**

In [68]:
#stemming is process of reducing a word to its root forms 
# such as mapping a group of words to the same stem even if the stem itself is not a valid word in the Language.
#porterstemmer is a stemming algorithm used for English language
from nltk.stem import PorterStemmer
ps = PorterStemmer()



In [69]:
def stem_words(text):
    return ' '.join([ps.stem(word) for word in text.split()])

In [73]:
sample = "running runs run "
stem_words(sample)

'run run run'

**Lemmatization**

In [78]:
#lemmatization is the process of reducing a word to its base or root form.
#root word is called lemma
#lemmatization is more sophisticated than stemming
#lemmatization considers the context and converts the word to its meaningful base form
#wordnet is a lexical database for the English language

import nltk
from nltk.stem import WordNetLemmatizer


lemmatizer = WordNetLemmatizer()

sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations = "?:!.,;"

sentence_words = nltk.word_tokenize(sentence)
sentence_words = [word for word in sentence_words if word not in punctuations]

print("{0:20}{1:20}".format("Word", "Lemma"))
for word in sentence_words:
    print("{0:20}{1:20}".format(word, lemmatizer.lemmatize(word, pos='v')))



[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/home/codespace/nltk_data'
    - '/home/codespace/.python/current/nltk_data'
    - '/home/codespace/.python/current/share/nltk_data'
    - '/home/codespace/.python/current/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
