# Lowercasing

In [3]:
import pandas as pd
df=pd.read_csv('IMDB review Dataset.csv')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
df['review'][3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

In [5]:
df['review'][3].lower()

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [10]:
df['review']=df['review'].str.lower() #applying .lower() to entire corpus
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. <br /><br />the...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

# Removing HTML Tags

In [11]:
import re #re is a library in python called regular expression
def remove_html_tags(text):
    pattern=re.compile('<.*?>')
    return pattern.sub(r'',text)


In [17]:
df['review']=df['review'].apply(remove_html_tags)
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

# Tokenization

In [18]:
import string
string.punctuation #This gives the punctuations which are considered in python.
# We can eliminate any punctuation as per our needs.

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [19]:
exclude=string.punctuation

In [20]:
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [22]:
def remove_punc(text):
    for x in exclude:
        text=text.replace(x,'')
    return text

In [23]:
text='string. With , Punctuation'
text.apply(remove_punc)

AttributeError: 'str' object has no attribute 'apply'

In [24]:
remove_punc(text)

'string With  Punctuation'

In [27]:
import time
start=time.time()
remove_punc(text)
time1=time.time()-start
print(time1)

0.0


In [31]:
import string
exclude=string.punctuation
def remove_punc1(text):                                     #More efficient punctuation removing code.
    return text.translate(str.maketrans('','',exclude))

In [32]:
start=time.time()
remove_punc1(text)
time1=time.time()-start
print(time1)

0.0


In [33]:
remove_punc1(df['review'])

AttributeError: 'Series' object has no attribute 'translate'

In [36]:
#To apply the same code on entire column
df['review']=df['review'].apply(remove_punc1)
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production the filming tech...
2        i thought this was a wonderful way to spend ti...
3        basically theres a family where a little boy j...
4        petter matteis love in the time of money is a ...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    i am a catholic taught in parochial elementary...
49998    im going to have to disagree with the previous...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

# Chat word treatment

In [30]:
shorthand=pd.read_csv('shorthand_chat.csv')
shorthand

Unnamed: 0,Abbreviations,Text
0,lol,laughing
1,rofl,laughing
2,brb,be right back
3,ily,i love you
4,ty,thank you
...,...,...
109,XD,laugh
110,xoxo,hugs and kisses
111,xo,hugs and kisses
112,y,why


In [31]:
mydict = dict((rows[0],rows[1]) for rows in shorthand)

In [32]:
mydict

{'A': 'b', 'T': 'e'}

In [11]:
shorthand.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Abbreviations  114 non-null    object
 1   Text           114 non-null    object
dtypes: object(2)
memory usage: 1.9+ KB


In [42]:
def chat_conversion(text):
    new_text=[]
    for w in text.split():
        if w.upper() in shorthand['Abbreviations'].str.upper():
            new_text.append(shorthand["Text"])
        else:
            new_text.append(w)
    return ' '.join(new_text)

In [43]:
chat_conversion('lol he is the best')

'lol he is the best'

# Spelling correction

In [72]:
#pip install -U textblob

Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
Installing collected packages: textblob
Successfully installed textblob-0.17.1
Note: you may need to restart the kernel to use updated packages.


In [73]:
from textblob import TextBlob

In [76]:
incorrect_text='ceertain coonditions durring ggenreations aree moodified in the same manner'
textBlb=TextBlob(incorrect_text) #object creation
textBlb.correct().string

'certain conditions during generations are modified in the same manner'

# Removing Stop words

In [126]:
import nltk
from nltk.corpus import stopwords

In [127]:
stopwords.words('english')

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - 'C:\\Users\\sayan/nltk_data'
    - 'C:\\Users\\sayan\\anaconda3\\nltk_data'
    - 'C:\\Users\\sayan\\anaconda3\\share\\nltk_data'
    - 'C:\\Users\\sayan\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\sayan\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [83]:
def remove_stopwords(text):
    new_text=[]
    
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x=new_text[:]
    new_text.clear()
    return ' '.join(x)

In [84]:
remove_stopwords('probalby my all time favourite movie, a story of selflessness')

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - 'C:\\Users\\sayan/nltk_data'
    - 'C:\\Users\\sayan\\anaconda3\\nltk_data'
    - 'C:\\Users\\sayan\\anaconda3\\share\\nltk_data'
    - 'C:\\Users\\sayan\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\sayan\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [85]:
#To apply the same to on entire column
df['review']=df['review'].apply(remove_stopwords)

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - 'C:\\Users\\sayan/nltk_data'
    - 'C:\\Users\\sayan\\anaconda3\\nltk_data'
    - 'C:\\Users\\sayan\\anaconda3\\share\\nltk_data'
    - 'C:\\Users\\sayan\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\sayan\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


# Dealing with emojis

In [88]:
#This is the code for removing emojis
#Simple copy-paste this without any shange

import re
def remove_emoji(text):
    emoji_pattern=re.compile("["
                             u"\U0001F600-\U0001F64F"
                             u"\U0001F300-\U0001F5FF"
                             u"\U0001F680-\U0001F6FF"
                             u"\U0001F1E0-\U0001F1FF"
                             u"\U00002702-\U000027B0"
                             u"\U000024C2-\U0001F251"
                             "]+",flags=re.UNICODE)
    return emoji_pattern.sub(r'',text)

In [90]:
#Replacing emoji with its meaning
import emoji

ModuleNotFoundError: No module named 'emoji'

In [91]:
#pip install emoji

Collecting emojiNote: you may need to restart the kernel to use updated packages.

  Downloading emoji-1.7.0.tar.gz (175 kB)
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py): started
  Building wheel for emoji (setup.py): finished with status 'done'
  Created wheel for emoji: filename=emoji-1.7.0-py3-none-any.whl size=171030 sha256=99dd8839c028a4b3ff8d0d8b408433d84b93a0c21ea327f6154d15dd83c66a72
  Stored in directory: c:\users\sayan\appdata\local\pip\cache\wheels\5e\8c\80\c3646df8201ba6f5070297fe3779a4b70265d0bfd961c15302
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-1.7.0


In [93]:
import emoji
print(emoji.demojize('Python is '))

Python is 


# Tokenization

1. Using the split function

In [94]:
#Word Tokenization
sent1='I am woing to Delhi'
sent1.split()

['I', 'am', 'woing', 'to', 'Delhi']

In [96]:
#Sentence tokenization
sent2='I am going to Delhi. I will stay there for 3 days. Lets hope the trip to be great'
sent2.split('.')

['I am going to Delhi',
 ' I will stay there for 3 days',
 ' Lets hope the trip to be great']

In [98]:
#Problems associated with split function
sent3="I am going to Delhi!"
sent3.split()

#The '!' is coming with 'Delhi'. Hnece if we encounter 'Delhi' next time, 'Delhi' and 'Delhi!' will be considered as separate

['I', 'am', 'going', 'to', 'Delhi!']

In [102]:
sent4='Where do you think we should go? I have a 3 day holiday'
sent4.split('.')

['Where do you think we should go? I have a 3 day holiday']

2. Regular Expression

In [105]:
import re
sent4='I am going to Delhi!'
tokens=re.findall("[\w']+",sent4)
tokens

#Note that the exclamation mark has been removed
#But unfortunately we are not getting the '!'

['I', 'am', 'going', 'to', 'Delhi']

In [106]:
text="""Lorem Ipsum is  simply dummy text od the printing and typesetting industry?
Lorem Ispum has been the industry's summy text ever since the 1500s,
when an unknown printer took a galley of the type and scrambled it to make a type precision book"""

sentences=re.compile('[.!?]').split(text)
sentences

['Lorem Ipsum is  simply dummy text od the printing and typesetting industry',
 "\nLorem Ispum has been the industry's summy text ever since the 1500s,\nwhen an unknown printer took a galley of the type and scrambled it to make a type precision book"]

In [113]:
from nltk.tokenize import word_tokenize,sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sayan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [114]:
sent4='I am going to Delhi!'
word_tokenize(sent4)

['I', 'am', 'going', 'to', 'Delhi', '!']

In [119]:
sent5='I have a Ph.D in A.I'
sent6="We're here to help! mail us at nks@gmail.com"
sent7="A 5km ride cost $10.50"

In [116]:
word_tokenize(sent5)

['I', 'have', 'a', 'Ph.D', 'in', 'A.I']

In [117]:
word_tokenize(sent6)

#It has separated the Email id

['We',
 "'re",
 'here',
 'to',
 'help',
 '!',
 'mail',
 'us',
 'at',
 'nks',
 '@',
 'gmail.com']

In [120]:
word_tokenize(sent7)

['A', '5km', 'ride', 'cost', '$', '10.50']

4.Spacy

In [129]:
!pip install spacy



In [130]:
import spacy
nlp=spacy.load('en_core_web_sn')

OSError: [E050] Can't find model 'en_core_web_sn'. It doesn't seem to be a Python package or a valid path to a data directory.

# Stemming And Lemmetization

In [131]:
#PorterStemmer
#SnowBallStemmer

In [133]:
from nltk.stem.porter import PorterStemmer 

ps=PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [140]:
sample='walk walks walking walked'
stem_words(sample)

'walk walk walk walk'

In [144]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sayan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [147]:
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_= WordNetLemmatizer()

sentence=' He was running and eating at the same time. He has a bad habit of swimming after playing long hours in the sun'
punctuations="?:!.,;"
sentence_words=nltk.word_tokenize(sentence)
for word in sentence_words:
    if word in punctuations:
        sentence_words.remove(word)

print("{0:20}{1:20}".format('Word','Lemma'))
for word in sentence_words:
    print("{0:20}{1:20}".format(word,wordnet_.lemmatize(word,pos='v'))) #pos='v' indicates parts of speech is verb
    
#pos must be specified

Word                Lemma               
He                  He                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
at                  at                  
the                 the                 
same                same                
time                time                
He                  He                  
has                 have                
a                   a                   
bad                 bad                 
habit               habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
the                 the                 
sun                 sun                 
