In [1]:
import nltk
import re
import string
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the labelled SMS corpus and preview its first rows.
df = pd.read_csv(filepath_or_buffer='a01_spam.csv')
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Summary of the dataset. The output below reports count / unique / top / freq
# for each column; comparing `count` with `unique` for Message shows that some
# messages occur more than once.
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [4]:
# Flag every row that repeats an earlier row exactly, then preview a few of them.
is_repeat = df.duplicated()
dup = df.loc[is_repeat]
dup.head()

Unnamed: 0,Category,Message
103,ham,As per your request 'Melle Melle (Oru Minnamin...
154,ham,As per your request 'Melle Melle (Oru Minnamin...
207,ham,"As I entered my cabin my PA said, '' Happy B'd..."
223,ham,"Sorry, I'll call later"
326,ham,No calls..messages..missed calls


In [5]:
# Remove exact duplicate rows, keeping the first occurrence of each (the
# pandas default for drop_duplicates).
# NOTE: this mutates `df` in place, so the previews displayed by earlier cells
# no longer reflect the current contents of `df`.
df.drop_duplicates(inplace=True)
# Re-summarise: `count` and `unique` for Message should now be equal.
df.describe()

Unnamed: 0,Category,Message
count,5157,5157
unique,2,5157
top,ham,"Go until jurong point, crazy.. Available only ..."
freq,4516,1


In [6]:
# Count missing values per column; all zeros means no rows need to be dropped
# or imputed before the tasks below.
df.isnull().sum()

Category    0
Message     0
dtype: int64

### Preprocessing complete; proceed with the tasks

# Task 1: Counting words

#### Tokenization of words using Regex tokenizer

#### Punctuation and numbers are removed for better identification of actual words.

In [7]:
# Tokenise each message into purely alphabetic runs; punctuation and digits
# never match the pattern, so they are dropped from the token lists.
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer('[a-zA-Z]+')

df['t1'] = df['Message'].map(tokenizer.tokenize)
df.head()

Unnamed: 0,Category,Message,t1
0,ham,"Go until jurong point, crazy.. Available only ...","[Go, until, jurong, point, crazy, Available, o..."
1,ham,Ok lar... Joking wif u oni...,"[Ok, lar, Joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, in, a, wkly, comp, to, win, FA, ..."
3,ham,U dun say so early hor... U c already then say...,"[U, dun, say, so, early, hor, U, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[Nah, I, don, t, think, he, goes, to, usf, he,..."


In [8]:
# Count tokens by their first letter: vowel (either case) versus anything else.
vowels = ('a', 'e', 'i', 'o', 'u', 'A', 'E', 'I', 'O', 'U')

# Flatten the per-message token lists into one sequence of tokens.
all_tokens = [token for tokens in df['t1'] for token in tokens]

vcount = sum(1 for token in all_tokens if token.startswith(vowels))
# Every token that does not start with a vowel is counted as a consonant word.
ccount = len(all_tokens) - vcount

print('Total number of words starting with vowels:', vcount)
print('Total number of words starting with consonants:', ccount)

Total number of words starting with vowels: 19923
Total number of words starting with consonants: 60086


# Task 2: Capitalised words
#### We separate the data for ham and spam messages

In [9]:
# Partition the messages by label so each class can be analysed separately.
df_ham = df.loc[df['Category'] == 'ham']
df_spam = df.loc[df['Category'] == 'spam']
print('number of ham messages:', df_ham.shape[0])
print('number of spam messages:', df_spam.shape[0])

number of ham messages: 4516
number of spam messages: 641


In [10]:
# Share of ham tokens written entirely in upper case (str.isupper), e.g. 'FA'
# or the single letter 'I'.
ham_tokens = [token for tokens in df_ham['t1'] for token in tokens]

count = len(ham_tokens)
capcount = sum(1 for token in ham_tokens if token.isupper())

percent = capcount / count * 100
print('Percentage of capitalised word in ham messages: %f' % percent, '%')

Percentage of capitalised word in ham messages: 6.656884 %


In [11]:
# Share of spam tokens written entirely in upper case (str.isupper), e.g. 'FA'
# or the single letter 'I'.
spam_tokens = [token for tokens in df_spam['t1'] for token in tokens]

count = len(spam_tokens)
capcount = sum(1 for token in spam_tokens if token.isupper())

percent = capcount / count * 100
print('Percentage of capitalised word in spam messages: %f' % percent, '%')

Percentage of capitalised word in spam messages: 13.675624 %


# Task 3: Email IDs and Phone Numbers
#### Check original messages for email IDs and phone numbers

In [12]:
# Tallies of every e-mail address / phone number found, keyed by the matched
# text. Both the ham scan and the spam scan add to these dictionaries.
emails = {}
phone_nums = {}

# Raw strings keep the regex backslashes literal; the non-raw form relied on
# invalid string escapes (`\.`, `\+`, `\s`), which newer Python versions warn
# about. The pattern text itself is unchanged.
# E-mail: local part, '@', domain label, '.', then the rest of the domain.
mailregex = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
# Phone: optional '+', 3 digits (optionally parenthesised), 3 digits, then
# 4-6 digits, with an optional '-', whitespace or '.' between the groups.
phoneregex = r'[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}'

In [13]:
# Scan every ham message for e-mail addresses and phone numbers.
# The two counters are message-level: a message adds one regardless of how
# many matches it contains.
ham_mailcount = 0
ham_phonecount = 0

for message in df_ham['Message']:
    elist = re.findall(mailregex, message)
    plist = re.findall(phoneregex, message)
    if elist:
        ham_mailcount += 1
    if plist:
        ham_phonecount += 1

    # Tally each individual match. These dictionaries are also filled by the
    # spam scan, so re-running this cell on its own adds to the existing tallies.
    for mail in elist:
        emails[mail] = emails.get(mail, 0) + 1
    for pnum in plist:
        phone_nums[pnum] = phone_nums.get(pnum, 0) + 1

epercent = ham_mailcount / len(df_ham) * 100
ppercent = ham_phonecount / len(df_ham) * 100
print('Percentage of ham messages containing emails: %f' % epercent, '%')
print('Percentage of ham messages containing phone numbers: %f' % ppercent, '%')

Percentage of ham messages containing emails: 0.022143 %
Percentage of ham messages containing phone numbers: 0.022143 %


In [14]:
# Scan every spam message for e-mail addresses and phone numbers.
# The two counters are message-level: a message adds one regardless of how
# many matches it contains.
spam_mailcount = 0
spam_phonecount = 0

for message in df_spam['Message']:
    elist = re.findall(mailregex, message)
    plist = re.findall(phoneregex, message)
    if elist:
        spam_mailcount += 1
    if plist:
        spam_phonecount += 1

    # Tally each individual match. These dictionaries are also filled by the
    # ham scan, so re-running this cell on its own adds to the existing tallies.
    for mail in elist:
        emails[mail] = emails.get(mail, 0) + 1
    for pnum in plist:
        phone_nums[pnum] = phone_nums.get(pnum, 0) + 1

epercent = spam_mailcount / len(df_spam) * 100
ppercent = spam_phonecount / len(df_spam) * 100
print('Percentage of spam messages containing emails: %f' % epercent, '%')
print('Percentage of spam messages containing phone numbers: %f' % ppercent, '%')

Percentage of spam messages containing emails: 0.936037 %
Percentage of spam messages containing phone numbers: 56.474259 %


In [15]:
# Summary of the e-mail scan. The ham/spam counters were incremented once per
# message containing at least one address, so the "total" is a message count;
# len(emails) is the number of distinct addresses seen.
mail_message_total = ham_mailcount + spam_mailcount
distinct_emails = len(emails)

print('Total Number of emails found:', mail_message_total)
print('In ham messages:', ham_mailcount)
print('In spam messages:', spam_mailcount)
print('Number of unique terms:', distinct_emails)
emails

Total Number of emails found: 7
In ham messages: 1
In spam messages: 6
Number of unique terms: 7


{'yijue@hotmail.com': 1,
 'info@ringtoneking.co.uk': 1,
 'tddnewsletter@emc1.co.uk': 1,
 'info@txt82228.co.uk': 1,
 'Dorothy@kiefer.com': 1,
 'msg+ticket@kiosk.Valid': 1,
 'customersqueries@netvision.uk.com': 1}

In [16]:
# Summary of the phone-number scan. The ham/spam counters were incremented once
# per message containing at least one number, so the "total" is a message
# count; len(phone_nums) is the number of distinct matched strings.
phone_message_total = ham_phonecount + spam_phonecount
distinct_phone_nums = len(phone_nums)

print('Total Number of phone numbers found:', phone_message_total)
print('In ham messages:', ham_phonecount)
print('In spam messages:', spam_phonecount)
print('Number of unique terms:', distinct_phone_nums)
phone_nums

Total Number of phone numbers found: 363
In ham messages: 1
In spam messages: 362
Number of unique terms: 282


{'0125698789': 1,
 '08452810075': 1,
 '09061701461': 1,
 '08002986030': 1,
 '07732584351': 1,
 '08000930705': 16,
 '09061209465': 1,
 '09066364589': 1,
 '800 169 6031': 1,
 '09064012160': 1,
 '087127781091': 1,
 '07742676969': 1,
 '08719180248': 1,
 '09064019788': 1,
 '08712300220': 6,
 '087006211701': 3,
 '07046744435': 1,
 '087127781081': 1,
 '09061701939': 1,
 '808 145 4742': 1,
 '087187262701': 3,
 '447801259231': 1,
 '09058094597': 1,
 '845 021 3680': 2,
 '08002986906': 2,
 '08718720201': 7,
 '08708800282': 1,
 '08000839402': 12,
 '08717205546': 1,
 '09057039994': 1,
 '08000938767': 2,
 '845 2814032': 1,
 '09064012103': 1,
 '07123456789': 2,
 '09111032124': 1,
 '09058094455': 1,
 '09066382422': 1,
 '800 1956669': 1,
 '207 153 9153': 1,
 '09061743806': 2,
 '07781482378': 1,
 '871-872-9755': 1,
 '08717898035': 2,
 '07815296484': 1,
 '08718738001': 2,
 '09050000327': 2,
 '09050005321': 1,
 '08002988890': 1,
 '08715705022': 3,
 '08712402050': 2,
 '07753741225': 1,
 '08715203677': 1,
 

In [17]:
# Number of messages (ham + spam) that contain at least one e-mail address /
# phone number. These totals are reused by the ham-share cell that follows.
totalmailmessage = spam_mailcount + ham_mailcount
totalphonemessage = spam_phonecount + ham_phonecount

# Of the messages that contain an e-mail address / phone number, the share
# that is spam. NOTE(review): raises ZeroDivisionError if a total is zero.
sepercent = spam_mailcount / totalmailmessage * 100
sppercent = spam_phonecount / totalphonemessage * 100

print('Percentage of total messages with emails that are spam: %f' %(sepercent), '%')
print('Percentage of total messages with phone numbers that are spam: %f' %(sppercent), '%')

Percentage of total messages with emails that are spam: 85.714286 %
Percentage of total messages with phone numbers that are spam: 99.724518 %


In [18]:
# Of the messages that contain an e-mail address / phone number, the share
# that is ham. Relies on totalmailmessage / totalphonemessage computed in the
# preceding cell. NOTE(review): raises ZeroDivisionError if a total is zero.
hepercent = ham_mailcount / totalmailmessage * 100
hppercent = ham_phonecount / totalphonemessage * 100

print('Percentage of total messages with emails that are ham: %f' %(hepercent), '%')
print('Percentage of total messages with phone numbers that are ham: %f' %(hppercent), '%')

Percentage of total messages with emails that are ham: 14.285714 %
Percentage of total messages with phone numbers that are ham: 0.275482 %


# Task 4: Counting Currencies
#### Check original messages for any currency symbols

In [303]:
# Monetary amounts: either a currency symbol (£ $ € ¥) followed by digits with
# optional commas / decimal part, or a number followed by "pound" / "pounds".
# The whole pattern is one capturing group, so re.findall returns the full
# matched text. Written as a raw string so the regex backslashes are not
# treated as (invalid) string escapes; an earlier symbol-only draft of the
# pattern was immediately overwritten and has been removed.
moneyregex = r'([\£\$\€\¥]{1}[ ]*[,\d]+\.?\d*|[,\d]+\.?\d*[ ]*pounds?)'

# Message-level counters (one per message containing an amount) and the set of
# distinct matched amounts across both classes.
spam_moneycount = 0
ham_moneycount = 0
mvalues = set()

In [304]:
# Count ham messages that mention a monetary amount.
# The counter is reset here so that re-running this cell on its own does not
# add to a previous run's count.
ham_moneycount = 0

for message in df_ham['Message']:
    mlist = re.findall(moneyregex, message)
    if mlist:
        ham_moneycount += 1

    # Record the distinct amounts seen; mvalues is a set, so repeats collapse.
    mvalues.update(mlist)

mpercent = ham_moneycount / len(df_ham) * 100
print('Percentage of ham messages containing monetary values: %f' % mpercent, '%')

Percentage of ham messages containing monetary values: 0.287865 %


In [305]:
# Count spam messages that mention a monetary amount.
# The counter is reset here so that re-running this cell on its own does not
# add to a previous run's count.
spam_moneycount = 0

for message in df_spam['Message']:
    mlist = re.findall(moneyregex, message)
    if mlist:
        spam_moneycount += 1

    # Record the distinct amounts seen; mvalues is a set, so repeats collapse.
    mvalues.update(mlist)

mpercent = spam_moneycount / len(df_spam) * 100
print('Percentage of spam messages containing monetary values: %f' % mpercent, '%')

Percentage of spam messages containing monetary values: 34.789392 %


In [306]:
# Summary of the money scan. The ham/spam counters count messages containing at
# least one amount, so the "total" is a message count; len(mvalues) is the
# number of distinct matched strings.
money_message_total = ham_moneycount + spam_moneycount
distinct_amounts = len(mvalues)

print('Total Number of monetary terms found:', money_message_total)
print('In ham messages:', ham_moneycount)
print('In spam messages:', spam_moneycount)
print('Number of unique terms:', distinct_amounts)

mvalues

Total Number of monetary terms found: 236
In ham messages: 13
In spam messages: 223
Number of unique terms: 71


{'$1',
 '$140',
 '$180',
 '$2',
 '$350',
 '$5.00',
 '$50',
 '$50.',
 '$700',
 '$900',
 '$900.',
 '$95',
 '2.50 pounds',
 '20,000 pounds',
 '2000 pound',
 '3 pound',
 '3750 pounds',
 '4 pounds',
 '500 pounds',
 '5000 pounds',
 '£1',
 '£1,50',
 '£1,500',
 '£1.',
 '£1.50',
 '£10',
 '£10,000',
 '£100',
 '£100,000',
 '£1000',
 '£1000.',
 '£12',
 '£125',
 '£1250',
 '£1450',
 '£150',
 '£1500',
 '£2,000',
 '£2.50',
 '£200',
 '£2000',
 '£250',
 '£3',
 '£3.00',
 '£3.75',
 '£3.99',
 '£33',
 '£33.65',
 '£350',
 '£38',
 '£4.50',
 '£400',
 '£450',
 '£48,',
 '£5',
 '£50',
 '£500',
 '£500.',
 '£5000',
 '£5000,',
 '£5000.00',
 '£54.',
 '£6',
 '£600.',
 '£7.',
 '£71.',
 '£75,000.',
 '£750',
 '£79',
 '£800',
 '£900'}

# Task 5: Counting Emojis
#### Check original messages for emoticons and print them

# Reference list of common "happy" emoticon spellings, kept for lookup.
smileys = [
    ':-)', ':)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}', ':^)',
    ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D', '=-3', '=3', 'B^D',
]

In [345]:
# Emoticon pattern: ':' or ';' followed by exactly one '(' or ')'. The negative
# lookahead rejects a match when another parenthesis follows immediately.
emoticons = '[:;][)(](?![)(])'

In [338]:
# Second tokenisation with NLTK's TweetTokenizer. As the preview shows, it keeps
# punctuation as separate tokens and leaves contractions such as "don't" whole.
from nltk.tokenize import TweetTokenizer
tk = TweetTokenizer()
df['t5'] = df['Message'].map(tk.tokenize)
df.head()

Unnamed: 0,Category,Message,t1,t5
0,ham,"Go until jurong point, crazy.. Available only ...","[Go, until, jurong, point, crazy, Available, o...","[Go, until, jurong, point, ,, crazy, .., Avail..."
1,ham,Ok lar... Joking wif u oni...,"[Ok, lar, Joking, wif, u, oni]","[Ok, lar, ..., Joking, wif, u, oni, ...]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, in, a, wkly, comp, to, win, FA, ...","[Free, entry, in, 2, a, wkly, comp, to, win, F..."
3,ham,U dun say so early hor... U c already then say...,"[U, dun, say, so, early, hor, U, c, already, t...","[U, dun, say, so, early, hor, ..., U, c, alrea..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[Nah, I, don, t, think, he, goes, to, usf, he,...","[Nah, I, don't, think, he, goes, to, usf, ,, h..."


In [346]:
# Print the emoticon found in each tweet-tokenised token (first match per token).
emoticon_pattern = re.compile(emoticons)
for tokens in df['t5']:
    for token in tokens:
        emoticon = emoticon_pattern.search(token)
        if emoticon is not None:
            print(emoticon.group())

:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:(
:)
:)
:)
:)
:)
:)
:(
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
;)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:(
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:(
:)
:)
:)
:)
:(
;)
:)
:)
:)
:)
:)
:)
:)
:)
;)
:)
:)
;)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
;)
:)
:)
:(
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
;)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:(
:)
:)
:)
:(
:)
:)
:)
:)
:)
:)
:(
:)
:)
;)
:(
:)
:)
:)
:)
:)
:)
:)
:)
:)
;)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:)
:(
:(
:(


# Task 6: Counting Clitics
#### Check original messages for words with clitics and print them

In [402]:
# Scratch check that `\s` matches the space following a word. The pattern is a
# raw string so the backslash reaches the regex engine instead of being parsed
# as an (invalid) string escape.
s = 'sd ds dad pois'
x = re.search(r'dad\s', s)
x.group()

'dad '

In [464]:
# Clitic pattern: letters, an apostrophe, then one or two letters at the end of
# the token (e.g. "don't", "she'll").
cliticregex = "[a-zA-Z]+'[a-zA-Z]{1,2}$"
# Distinct clitic-bearing words collected by the scan that follows.
clitic_words = set()

In [465]:
# Collect every distinct token (from the tweet tokenisation) that ends in a clitic.
clitic_pattern = re.compile(cliticregex)
clitic_words.update(
    hit.group()
    for tokens in df['t5']
    for hit in map(clitic_pattern.search, tokens)
    if hit
)

In [466]:
# Report how many distinct clitic-bearing words were collected (clitic_words is
# a set, so each spelling is counted once), then display them.
print('number of words with clitics:', len(clitic_words))
clitic_words

number of words with clitics: 219


{"C's",
 "Can't",
 "Carlos'll",
 "DIDN'T",
 "Didn't",
 "Dip's",
 "Doesn't",
 "Don't",
 "George's",
 "God's",
 "Gumby's",
 "Harish's",
 "Hasn't",
 "Haven't",
 "He's",
 "How's",
 "I'd",
 "I'll",
 "I'm",
 "I'ma",
 "I've",
 "Isn't",
 "It'll",
 "It's",
 "Jay's",
 "Joy's",
 "Ken's",
 "Let's",
 "Moon's",
 "Mumtaz's",
 "NY's",
 "PARTNER'S",
 "Party's",
 "Prashanthettan's",
 "Shahjahan's",
 "She'll",
 "She's",
 "T's",
 "THERE'S",
 "Ta's",
 "That'll",
 "That's",
 "There'll",
 "There're",
 "There's",
 "They're",
 "Today's",
 "U'll",
 "U're",
 "U've",
 "UK's",
 "UNICEF's",
 "VALENTINE'S",
 "WOULDN'T",
 "Wasn't",
 "Wat's",
 "We'd",
 "We'll",
 "We're",
 "We've",
 "What's",
 "When're",
 "When's",
 "Where's",
 "Wherre's",
 "Who's",
 "YOU'RE",
 "YOU'VE",
 "You'd",
 "You'll",
 "You're",
 "You've",
 "Zaher's",
 "account's",
 "ain't",
 "alex's",
 "all's",
 "anjola's",
 "anybody's",
 "anything's",
 "aren't",
 "armand's",
 "ashley's",
 "audrey's",
 "aunty's",
 "basket's",
 "bb's",
 "biola's",
 "blake's",
 "

# Task 7: Starting with
#### Check original messages and print those that start with a given word

In [495]:
# Ask for the word to look for at the start of each message.
word = input('Enter input word: ')
# Escape the typed text so characters such as '.', '?' or '(' are matched
# literally instead of being interpreted as regex syntax. The word must sit at
# the very start of the message and be followed by whitespace.
regex = '^' + re.escape(word) + r'\s'

Enter input word: Go


In [498]:
# Print every message that matches the starts-with pattern and count them.
start_pattern = re.compile(regex)
matching_messages = [message for message in df['Message'] if start_pattern.search(message)]
for message in matching_messages:
    print(message)
count = len(matching_messages)

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
Go where n buy? Juz buy when we get there lar.
Go fool dont cheat others ok
Go chase after her and run her over while she's crossing the street


In [502]:
# Report how many messages matched the starts-with pattern for the entered word.
print('Number of messages starting with the word \'%s\' are - %d' %(word, count))

Number of messages starting with the word 'Go' are - 4


# Task 8: Ending with
#### Check original messages and print those that end with a given word

In [503]:
# Ask for the word to look for at the end of each message.
word = input('Enter input word: ')
# Escape the typed text so characters such as '.', '?' or '(' are matched
# literally instead of being interpreted as regex syntax. The word must be
# preceded by whitespace and sit at the very end of the message.
regex = r'\s' + re.escape(word) + '$'

Enter input word: ok


In [504]:
# Print every message that matches the ends-with pattern and count them.
end_pattern = re.compile(regex)
matching_messages = [message for message in df['Message'] if end_pattern.search(message)]
for message in matching_messages:
    print(message)
count = len(matching_messages)

I asked you to call him now ok
Dont put your phone on silent mode ok
Mm have some kanji dont eat anything heavy ok
Send his number and give reply tomorrow morning for why you said that to him like that ok
Call him and say you not coming today ok and tell them not to fool me like this ok
dont make ne plans for nxt wknd coz she wants us to come down then ok
i can call in  &lt;#&gt;  min if thats ok
Go fool dont cheat others ok


In [505]:
# Report how many messages matched the ends-with pattern for the entered word.
print('Number of messages ending with the word \'%s\' are - %d' %(word, count))

Number of messages ending with the word 'ok' are - 8
