<a href="https://colab.research.google.com/github/sahug/ds-nlp/blob/main/NLP%20-%20Session%2023%20-%20Data%20Preprocessing%20and%20Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NLP - Session 23 - Data Preprocessing and Cleaning**

## **Punctuation Removal**

In [1]:
import pandas as pd
# Example sentences, each chosen to exercise a different punctuation mark
# (full stop, comma, semicolon, colon, hyphen, dash, apostrophe, slash,
# backslash, quotes, brackets, ellipsis, ...).
sample_data = [
    "I like English.",
    "I speak English, French and Thai.",
    "I don't often go swimming; I prefer to play tennis.",
    "You have two choices: finish the work today or lose the contract.",
    "This is a rather out-of-date book.",
    "In each town—London, Paris and Rome—we stayed in youth hostels.",
    "Where is Shangri-La?",
    "Help! she cried. I can't swim!",
    "Please press your browser's Refresh/Reload button.",
    # Raw string: "\F" and "\j" are not valid escape sequences, so a plain
    # literal triggers an invalid-escape warning on current Python versions.
    # The resulting value is unchanged.
    r"C:\Files\jo.doc",
    "'I love you,' she said.",
    "This is John's car.",
    "Have you read Harry Potter?",
    "b_l@cia.gov",
    "I went to Bangkok (my favourite city) and stayed there for two weeks.",
    "The newspaper reported that the hostages [most of them French] had been released.",
    'One happy customer wrote: "This is the best program...that I have ever seen."',
]
# Single-column frame holding the untouched sentences; later cells add their
# results next to this "original" column.
sample_df = pd.DataFrame(sample_data, columns=["original"])

In [2]:
import string
def preprocess_punctuation(text: str) -> str:
  """Return ``text`` with every character listed in ``string.punctuation`` removed.

  Only the ASCII marks in ``string.punctuation`` are dropped, so other
  symbols such as the em dash stay in place. Nothing is inserted where a
  mark used to be: "out-of-date" becomes "outofdate".
  """
  # A deletion-only translation table removes all the marks in one pass.
  strip_table = str.maketrans("", "", string.punctuation)
  return text.translate(strip_table)
# Strip punctuation from each original sentence and store the result in the
# "processed" column; the bare name on the last line displays the frame.
sample_df["processed"] = sample_df["original"].apply(preprocess_punctuation)
sample_df

Unnamed: 0,original,processed
0,I like English.,I like English
1,"I speak English, French and Thai.",I speak English French and Thai
2,I don't often go swimming; I prefer to play te...,I dont often go swimming I prefer to play tennis
3,You have two choices: finish the work today or...,You have two choices finish the work today or ...
4,This is a rather out-of-date book.,This is a rather outofdate book
5,"In each town—London, Paris and Rome—we stayed ...",In each town—London Paris and Rome—we stayed i...
6,Where is Shangri-La?,Where is ShangriLa
7,Help! she cried. I can't swim!,Help she cried I cant swim
8,Please press your browser's Refresh/Reload but...,Please press your browsers RefreshReload button
9,C:\Files\jo.doc,CFilesjodoc


## **Tokenization**

In [3]:
from nltk.tokenize import RegexpTokenizer
def tokenization(text):
  """Split ``text`` into a list of word tokens.

  Each run of word characters (``\\w+``) becomes one token; everything in
  between is treated as a separator and discarded. As a consequence
  contractions and hyphenated words are split: "don't" yields
  ``["don", "t"]`` and "out-of-date" yields ``["out", "of", "date"]``.

  Args:
    text: The sentence to tokenize.

  Returns:
    The tokens in their original order and casing.
  """
  # Raw string: "\w" inside a normal literal is an invalid escape sequence
  # and raises a warning on current Python versions.
  return RegexpTokenizer(r"\w+").tokenize(text)
# Overwrite "processed" with the token list of each original sentence, then
# display the frame.
sample_df["processed"] = sample_df["original"].apply(tokenization)
sample_df

Unnamed: 0,original,processed
0,I like English.,"[I, like, English]"
1,"I speak English, French and Thai.","[I, speak, English, French, and, Thai]"
2,I don't often go swimming; I prefer to play te...,"[I, don, t, often, go, swimming, I, prefer, to..."
3,You have two choices: finish the work today or...,"[You, have, two, choices, finish, the, work, t..."
4,This is a rather out-of-date book.,"[This, is, a, rather, out, of, date, book]"
5,"In each town—London, Paris and Rome—we stayed ...","[In, each, town, London, Paris, and, Rome, we,..."
6,Where is Shangri-La?,"[Where, is, Shangri, La]"
7,Help! she cried. I can't swim!,"[Help, she, cried, I, can, t, swim]"
8,Please press your browser's Refresh/Reload but...,"[Please, press, your, browser, s, Refresh, Rel..."
9,C:\Files\jo.doc,"[C, Files, jo, doc]"


## **Stop Words**

In [4]:
import nltk
# Make sure the NLTK "stopwords" package is available locally before it is read.
nltk.download('stopwords')
# English stop-word list used by remove_stop_words() for membership tests.
stopwords = nltk.corpus.stopwords.words("english")

def remove_stop_words(text):
  """Return the tokens of ``text`` that are not in the module-level ``stopwords``.

  Args:
    text: A list of tokens.

  Returns:
    A new list with the surviving tokens in their original order.
  """
  # NOTE(review): the comparison is exact, so capitalised tokens such as
  # "I" or "This" are kept even though their lowercase forms are removed.
  # Lowercase the tokens first if that is not intended.
  return [token for token in text if token not in stopwords]
# Filter the token lists in "processed" and keep the result in a separate
# "stopwords" column so both versions can be compared side by side.
sample_df["stopwords"] = sample_df["processed"].apply(remove_stop_words)
sample_df

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,original,processed,stopwords
0,I like English.,"[I, like, English]","[I, like, English]"
1,"I speak English, French and Thai.","[I, speak, English, French, and, Thai]","[I, speak, English, French, Thai]"
2,I don't often go swimming; I prefer to play te...,"[I, don, t, often, go, swimming, I, prefer, to...","[I, often, go, swimming, I, prefer, play, tennis]"
3,You have two choices: finish the work today or...,"[You, have, two, choices, finish, the, work, t...","[You, two, choices, finish, work, today, lose,..."
4,This is a rather out-of-date book.,"[This, is, a, rather, out, of, date, book]","[This, rather, date, book]"
5,"In each town—London, Paris and Rome—we stayed ...","[In, each, town, London, Paris, and, Rome, we,...","[In, town, London, Paris, Rome, stayed, youth,..."
6,Where is Shangri-La?,"[Where, is, Shangri, La]","[Where, Shangri, La]"
7,Help! she cried. I can't swim!,"[Help, she, cried, I, can, t, swim]","[Help, cried, I, swim]"
8,Please press your browser's Refresh/Reload but...,"[Please, press, your, browser, s, Refresh, Rel...","[Please, press, browser, Refresh, Reload, button]"
9,C:\Files\jo.doc,"[C, Files, jo, doc]","[C, Files, jo, doc]"


In [5]:
# Discard the demonstration column so the next section starts from
# "original" and "processed" only.
sample_df = sample_df.drop(columns="stopwords")

## **Stemming**

In [6]:
from nltk.stem import PorterStemmer
def stemming(text):
  """Reduce every token in ``text`` to its Porter stem.

  The stems are lowercased and need not be dictionary words, e.g.
  "choices" becomes "choic" and "This" becomes "thi".

  Args:
    text: A list of tokens.

  Returns:
    A list with one stem per input token, in the same order.
  """
  # Build the stemmer once per call instead of once per token.
  stemmer = PorterStemmer()
  return [stemmer.stem(token) for token in text]

# Stem the token lists in "processed" into a new "stemmed" column, then
# display the frame.
sample_df["stemmed"] = sample_df["processed"].apply(stemming)
sample_df

Unnamed: 0,original,processed,stemmed
0,I like English.,"[I, like, English]","[i, like, english]"
1,"I speak English, French and Thai.","[I, speak, English, French, and, Thai]","[i, speak, english, french, and, thai]"
2,I don't often go swimming; I prefer to play te...,"[I, don, t, often, go, swimming, I, prefer, to...","[i, don, t, often, go, swim, i, prefer, to, pl..."
3,You have two choices: finish the work today or...,"[You, have, two, choices, finish, the, work, t...","[you, have, two, choic, finish, the, work, tod..."
4,This is a rather out-of-date book.,"[This, is, a, rather, out, of, date, book]","[thi, is, a, rather, out, of, date, book]"
5,"In each town—London, Paris and Rome—we stayed ...","[In, each, town, London, Paris, and, Rome, we,...","[in, each, town, london, pari, and, rome, we, ..."
6,Where is Shangri-La?,"[Where, is, Shangri, La]","[where, is, shangri, la]"
7,Help! she cried. I can't swim!,"[Help, she, cried, I, can, t, swim]","[help, she, cri, i, can, t, swim]"
8,Please press your browser's Refresh/Reload but...,"[Please, press, your, browser, s, Refresh, Rel...","[pleas, press, your, browser, s, refresh, relo..."
9,C:\Files\jo.doc,"[C, Files, jo, doc]","[c, file, jo, doc]"


In [7]:
# Discard the demonstration column so the next section starts from
# "original" and "processed" only.
sample_df = sample_df.drop(columns="stemmed")

## **Lemmatization**

In [8]:
from nltk.stem import WordNetLemmatizer
# Fetch the two NLTK data packages this section downloads before lemmatizing.
for resource in ("wordnet", "omw-1.4"):
  nltk.download(resource)

def lemmatize(text):
  """Lemmatize every token in ``text`` with NLTK's WordNet lemmatizer.

  ``lemmatize`` is called without a part-of-speech argument, so the
  library default applies. Plural nouns are reduced ("choices" becomes
  "choice") while verb forms such as "cried" are returned unchanged.
  Casing is preserved.

  Args:
    text: A list of tokens.

  Returns:
    A list with one lemma per input token, in the same order.
  """
  # Build the lemmatizer once per call instead of once per token.
  lemmatizer = WordNetLemmatizer()
  return [lemmatizer.lemmatize(token) for token in text]

# Lemmatize the token lists in "processed" into a new "lemma" column, then
# display the frame.
sample_df["lemma"] = sample_df["processed"].apply(lemmatize)
sample_df

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,original,processed,lemma
0,I like English.,"[I, like, English]","[I, like, English]"
1,"I speak English, French and Thai.","[I, speak, English, French, and, Thai]","[I, speak, English, French, and, Thai]"
2,I don't often go swimming; I prefer to play te...,"[I, don, t, often, go, swimming, I, prefer, to...","[I, don, t, often, go, swimming, I, prefer, to..."
3,You have two choices: finish the work today or...,"[You, have, two, choices, finish, the, work, t...","[You, have, two, choice, finish, the, work, to..."
4,This is a rather out-of-date book.,"[This, is, a, rather, out, of, date, book]","[This, is, a, rather, out, of, date, book]"
5,"In each town—London, Paris and Rome—we stayed ...","[In, each, town, London, Paris, and, Rome, we,...","[In, each, town, London, Paris, and, Rome, we,..."
6,Where is Shangri-La?,"[Where, is, Shangri, La]","[Where, is, Shangri, La]"
7,Help! she cried. I can't swim!,"[Help, she, cried, I, can, t, swim]","[Help, she, cried, I, can, t, swim]"
8,Please press your browser's Refresh/Reload but...,"[Please, press, your, browser, s, Refresh, Rel...","[Please, press, your, browser, s, Refresh, Rel..."
9,C:\Files\jo.doc,"[C, Files, jo, doc]","[C, Files, jo, doc]"
