In [7]:
# Workaround for a macOS-only issue: NLTK downloads failing SSL certificate verification
import ssl
import nltk

# Look the private factory up defensively: when this Python build does not
# expose it, the default HTTPS context factory is left untouched.
_create_unverified_https_context = getattr(ssl, "_create_unverified_context", None)
if _create_unverified_https_context is not None:
    # NOTE: this replaces the module-wide default HTTPS context factory with
    # the non-verifying one, so it is not limited to the NLTK download below.
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch the NLTK "stopwords" corpus into the local nltk_data directory.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /Users/seb/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
import nltk
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string


##### CSV READING with PANDAS

# "latin-1" maps every byte to exactly one character, so single-byte symbols
# such as "£" are read as before. Unlike the "unicode_escape" codec it does
# not interpret backslashes inside a message as escape sequences, which could
# silently rewrite the text or abort the read with a decode error.
df = pd.read_csv("SMS_test.csv", encoding="latin-1")
column = "Message_body"  # name of the column holding the raw SMS text
content = df[column]


##### NLTK POINTS

# Resources used further down: tokenizer models, stop-word lists and WordNet.
# "punkt_tab" is the tokenizer data that word_tokenize loads on recent NLTK
# releases (the pickled "punkt" models are no longer used there); requesting
# both keeps the notebook runnable on older and newer installs.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(resource)

# Shared helpers used by the stop-word, stemming and lemmatization functions.
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


##### A. CASE NORMALIZATION

## Lowercasing
def text_lowercase(text):
    """Return ``text`` with all cased characters converted to lower case."""
    lowered = text.lower()
    return lowered


##### B. NOISE REMOVAL

## Removing numbers/digits
def remove_numbers(text):
    """Delete every run of digits from ``text``.

    All other characters, including the whitespace around the removed
    digits, are left exactly where they were.
    """
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

## Removing punctuation & special characters
punct_list = list(string.punctuation)
def remove_punctuation(text):
    """Strip punctuation from ``text`` and trim whitespace at both ends.

    Only the characters listed in ``punct_list`` (``string.punctuation``)
    are deleted; any other symbol is kept as is.
    """
    # Map each punctuation code point to None so translate() deletes it.
    unwanted = dict.fromkeys(map(ord, punct_list))
    return text.translate(unwanted).strip()  # remove leading and trailing whitespace

## Removing double whitespaces
def remove_doubleWhitespaces(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is dropped as well.
    """
    chunks = text.split()  # split() without arguments discards all whitespace runs
    return " ".join(chunks)

## Removing URLS
# Compiled patterns for the URL shapes that get stripped from the text.
url_pattern = [
    # http:// and https:// links; the colon stays optional, as before, so
    # sloppy "http//..." forms are still caught.
    re.compile(r'https?:?//\S+'),
    # Bare "www." links. The dot is escaped and a word boundary is required so
    # that ordinary words containing "www" (e.g. "awwwww") are left alone.
    re.compile(r'\bwww\.\S+'),
]

def remove_urls(text):
    """Remove URLs from ``text``.

    Every match of the patterns in ``url_pattern`` is replaced by an empty
    string; the surrounding whitespace is kept, so gaps may remain where a
    URL used to be.

    The patterns rely on "://" and "." being present, so this has to run
    before any step that strips punctuation.
    """
    for pattern in url_pattern:
        text = pattern.sub("", text)
    return text


##### C. TOKENIZATION
def tokenize(text):
    """Split ``text`` into a list of tokens with NLTK's ``word_tokenize``."""
    tokens = nltk.word_tokenize(text)
    return tokens


##### D. STOPWORDS REMOVAL
def remove_stopwords(list_words):
    """Return the words of ``list_words`` that are not stop words.

    The comparison against ``stop_words`` is done on the lower-cased word,
    while the surviving words keep their original casing and order.
    """
    return [token for token in list_words if token.lower() not in stop_words]


##### E-1 STEMMING
def stemming(list_words):
    """Return a new list holding the stem of every word in ``list_words``.

    Stems are produced by the module-level ``stemmer``; order is preserved.
    """
    return [stemmer.stem(token) for token in list_words]

##### E-2 LEMMATIZATION
def func_lemmatize(list_words):
    """Return a new list holding the lemma of every word in ``list_words``.

    Lemmas are produced by the module-level ``lemmatizer`` using its default
    settings; order is preserved.
    """
    return [lemmatizer.lemmatize(token) for token in list_words]

##### ALL FUNCTIONS in one list

# Noise-removal steps, applied in list order (each one receives the output of
# the previous one), so the order matters:
#   1. URLs first, while "://" and "." are still in the text for the URL
#      patterns to match; stripping punctuation beforehand would leave
#      nothing recognisable as a link.
#   2. Digits, then punctuation.
#   3. Whitespace collapsing last, so the gaps left behind by the earlier
#      removals are cleaned up too.
NR_functions = [
    remove_urls,
    remove_numbers,
    remove_punctuation,
    remove_doubleWhitespaces,
]

##### DATAFRAME

# Column 1: lower-cased copy of the raw messages
df["Case Normalization"] = df[column].apply(text_lowercase)

# Column 2: chain the noise-removal functions, each one working on the
# result of the previous one, then store the final series
noise_free = df["Case Normalization"]
for funct in NR_functions:
    noise_free = noise_free.apply(funct)
df["Noise Removal"] = noise_free

# Column 3: token lists built from the cleaned text
df["Tokenization"] = df["Noise Removal"].apply(tokenize)

# Columns 4, 5 and 6: each one is derived from the "Tokenization" column, so
# the stemmed and lemmatized lists still contain the stop words
token_transforms = (
    ("Stopwords", remove_stopwords),
    ("Stemming", stemming),
    ("Lemmatization", func_lemmatize),
)
for new_column, transform in token_transforms:
    df[new_column] = df["Tokenization"].apply(transform)


##### DISPLAYING
display(df.head())

[nltk_data] Downloading package punkt to /Users/seb/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /Users/seb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/seb/nltk_data...


Unnamed: 0,S. No.,Message_body,Label,Case Normalization,Noise Removal,Tokenization,Stopwords,Stemming,Lemmatization
0,1,"UpgrdCentre Orange customer, you may now claim...",Spam,"upgrdcentre orange customer, you may now claim...",upgrdcentre orange customer you may now claim ...,"[upgrdcentre, orange, customer, you, may, now,...","[upgrdcentre, orange, customer, may, claim, fr...","[upgrdcentr, orang, custom, you, may, now, cla...","[upgrdcentre, orange, customer, you, may, now,..."
1,2,"Loan for any purpose £500 - £75,000. Homeowner...",Spam,"loan for any purpose £500 - £75,000. homeowner...",loan for any purpose £ £ homeowners tenants we...,"[loan, for, any, purpose, £, £, homeowners, te...","[loan, purpose, £, £, homeowners, tenants, wel...","[loan, for, ani, purpos, £, £, homeown, tenant...","[loan, for, any, purpose, £, £, homeowner, ten..."
2,3,Congrats! Nokia 3650 video camera phone is you...,Spam,congrats! nokia 3650 video camera phone is you...,congrats nokia video camera phone is your call...,"[congrats, nokia, video, camera, phone, is, yo...","[congrats, nokia, video, camera, phone, call, ...","[congrat, nokia, video, camera, phone, is, you...","[congrats, nokia, video, camera, phone, is, yo..."
3,4,URGENT! Your Mobile number has been awarded wi...,Spam,urgent! your mobile number has been awarded wi...,urgent your mobile number has been awarded wit...,"[urgent, your, mobile, number, has, been, awar...","[urgent, mobile, number, awarded, £, prize, gu...","[urgent, your, mobil, number, ha, been, award,...","[urgent, your, mobile, number, ha, been, award..."
4,5,Someone has contacted our dating service and e...,Spam,someone has contacted our dating service and e...,someone has contacted our dating service and e...,"[someone, has, contacted, our, dating, service...","[someone, contacted, dating, service, entered,...","[someon, ha, contact, our, date, servic, and, ...","[someone, ha, contacted, our, dating, service,..."
