In [1]:
import numpy as np
import pandas as pd 
import regex as re

In [2]:
doc0='''Peter Piper picked a peck of pickled peppers.
A peck of pickled peppers Peter Piper picked.
If Peter Piper picked a peck of pickled peppers,
where’s the peck of pickled peppers Peter Piper picked?'''

In [3]:
doc1='''If you must cross a course cross cow across a crowded cow crossing,
cross the cross coarse cow across the crowded cow crossing carefully.'''

In [4]:
doc2=''' If one doctor doctors another doctor, 
then which doctor is doctoring the doctored doctor?
Does the doctor who doctors the doctor, 
doctor the doctor the way the doctor is doctoring doctors?'''

In [5]:
doc3='''To sit in solemn silence in a dull, dark dock
in a pestilential prison with a life-long lock,
awaiting the sensation of a short, sharp shock
from a cheap and chippy chopper with a big, black block.'''

In [6]:
doc4='''Betty Botter bought a bit of butter.
The butter Betty Botter bought was a bit bitter
And made her batter bitter.
But a bit of better butter makes better batter.
So Betty Botter bought a bit of better butter
Making Betty Botter’s bitter batter better'''

In [7]:
doc5='''Through three cheese trees three free fleas flew.
While these fleas flew, freezy breeze blew.
Freezy breeze made these three trees freeze.
Freezy trees made these trees’ cheese freeze.
That’s what made these three free fleas sneeze.'''

In [8]:
dfori=pd.DataFrame({'docs':[doc0,doc1,doc2,doc3,doc4,doc5]})
dfori

Unnamed: 0,docs
0,Peter Piper picked a peck of pickled peppers.\...
1,If you must cross a course cross cow across a ...
2,"If one doctor doctors another doctor, \nthen ..."
3,"To sit in solemn silence in a dull, dark dock\..."
4,Betty Botter bought a bit of butter.\nThe butt...
5,Through three cheese trees three free fleas fl...


In [9]:
df=dfori.copy()


## converting into uniform case

In [10]:
## df['docs'].str.lower()

In [11]:
def lowerCaseConversion(x):
    return x.str.lower()

In [12]:
df['docs']=lowerCaseConversion(df['docs'])
df

Unnamed: 0,docs
0,peter piper picked a peck of pickled peppers.\...
1,if you must cross a course cross cow across a ...
2,"if one doctor doctors another doctor, \nthen ..."
3,"to sit in solemn silence in a dull, dark dock\..."
4,betty botter bought a bit of butter.\nthe butt...
5,through three cheese trees three free fleas fl...


## handling html tags

In [13]:
x='<b>Hello</b>'
re.sub(r'<.*?>','',x)

def removeHtmlTags(x):
    return re.sub(r'<.*?>','',x)

In [14]:
df['docs']=df['docs'].apply(removeHtmlTags)
df

Unnamed: 0,docs
0,peter piper picked a peck of pickled peppers.\...
1,if you must cross a course cross cow across a ...
2,"if one doctor doctors another doctor, \nthen ..."
3,"to sit in solemn silence in a dull, dark dock\..."
4,betty botter bought a bit of butter.\nthe butt...
5,through three cheese trees three free fleas fl...


## removing urls

In [15]:
re.sub(r'https?://\s+|www\.s\+','',x)


def removeurls(x):
    return re.sub(r'https?://\s+|www\.s\+','',x)

df['docs']=df['docs'].apply(removeurls)
df['docs']

0    peter piper picked a peck of pickled peppers.\...
1    if you must cross a course cross cow across a ...
2     if one doctor doctors another doctor, \nthen ...
3    to sit in solemn silence in a dull, dark dock\...
4    betty botter bought a bit of butter.\nthe butt...
5    through three cheese trees three free fleas fl...
Name: docs, dtype: object

In [16]:
## handling new  line characters

In [17]:
def handlingNewlineChar(x):
    return re.sub(r'\n','',x)

df['docs']=df['docs'].apply(handlingNewlineChar)
df['docs'][0]

'peter piper picked a peck of pickled peppers.a peck of pickled peppers peter piper picked.if peter piper picked a peck of pickled peppers,where’s the peck of pickled peppers peter piper picked?'

In [18]:
## removing spechar and numbers

In [19]:
def removingspecharAndNumbers(x):
    return re.sub(r'[^a-zA-Z]',' ',x)

In [20]:
df['docs']=df['docs'].apply(removingspecharAndNumbers)
df['docs']

0    peter piper picked a peck of pickled peppers a...
1    if you must cross a course cross cow across a ...
2     if one doctor doctors another doctor  then wh...
3    to sit in solemn silence in a dull  dark docki...
4    betty botter bought a bit of butter the butter...
5    through three cheese trees three free fleas fl...
Name: docs, dtype: object

In [21]:
## removing stopwords

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alasy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
def removeStopwords(x):
    z=[]
    for ele in x.split():
        if ele not in stopwords.words('english'):
            z.append(ele)
    return ' '.join(z)

In [23]:
df['docs']=df['docs'].apply(removeStopwords)
df['docs']

0    peter piper picked peck pickled peppers peck p...
1    must cross course cross cow across crowded cow...
2    one doctor doctors another doctor doctor docto...
3    sit solemn silence dull dark dockin pestilenti...
4    betty botter bought bit butter butter betty bo...
5    three cheese trees three free fleas flew fleas...
Name: docs, dtype: object

In [24]:
### stemming

In [25]:
from nltk.stem import SnowballStemmer
stemmer=SnowballStemmer('english')

In [26]:
stemmer.stem('walking')

'walk'

In [27]:
### lemitization

In [28]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alasy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\alasy\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [29]:
lem=WordNetLemmatizer()
lem.lemmatize('jogging')

'jogging'

In [30]:
def stem(x):
    s=[]
    for ele in x.split():
        s.append(stemmer.stem(ele))
    return ' '.join(s)

In [31]:
def lemmatizer(x):
    s=[]
    for ele in x.split():
        s.append(lem.lemmatize(ele))
    return ' '.join(s)

In [32]:
df['stemdocs']=df['docs'].apply(stem)
df

Unnamed: 0,docs,stemdocs
0,peter piper picked peck pickled peppers peck p...,peter piper pick peck pickl pepper peck pickl ...
1,must cross course cross cow across crowded cow...,must cross cours cross cow across crowd cow cr...
2,one doctor doctors another doctor doctor docto...,one doctor doctor anoth doctor doctor doctor d...
3,sit solemn silence dull dark dockin pestilenti...,sit solemn silenc dull dark dockin pestilenti ...
4,betty botter bought bit butter butter betty bo...,betti botter bought bit butter butter betti bo...
5,three cheese trees three free fleas flew fleas...,three chees tree three free flea flew flea fle...


In [33]:
df['lemdocs']=df['docs'].apply(lemmatizer)
df

Unnamed: 0,docs,stemdocs,lemdocs
0,peter piper picked peck pickled peppers peck p...,peter piper pick peck pickl pepper peck pickl ...,peter piper picked peck pickled pepper peck pi...
1,must cross course cross cow across crowded cow...,must cross cours cross cow across crowd cow cr...,must cross course cross cow across crowded cow...
2,one doctor doctors another doctor doctor docto...,one doctor doctor anoth doctor doctor doctor d...,one doctor doctor another doctor doctor doctor...
3,sit solemn silence dull dark dockin pestilenti...,sit solemn silenc dull dark dockin pestilenti ...,sit solemn silence dull dark dockin pestilenti...
4,betty botter bought bit butter butter betty bo...,betti botter bought bit butter butter betti bo...,betty botter bought bit butter butter betty bo...
5,three cheese trees three free fleas flew fleas...,three chees tree three free flea flew flea fle...,three cheese tree three free flea flew flea fl...
