In [1]:
import re
import string

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.tokenize import RegexpTokenizer

In [10]:
# Load the spam dataset from a CSV file (path is relative to the working
# directory) and preview the first rows.
df = pd.read_csv('a01_spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# Summary of each column: number of rows, number of distinct values, the
# most frequent value and how often it occurs.
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [12]:
# Check for duplicates: df.duplicated() flags rows that repeat an earlier
# row; keep the flagged rows in `dup` and preview them.
dup = df[df.duplicated()]
dup.head()

Unnamed: 0,Category,Message
103,ham,As per your request 'Melle Melle (Oru Minnamin...
154,ham,As per your request 'Melle Melle (Oru Minnamin...
207,ham,"As I entered my cabin my PA said, '' Happy B'd..."
223,ham,"Sorry, I'll call later"
326,ham,No calls..messages..missed calls


In [13]:
# Remove the duplicate rows from df in place, then re-run describe() to
# confirm that the number of rows now equals the number of unique messages.
# NOTE: this mutates df, so the frames displayed by earlier cells are stale.
df.drop_duplicates(inplace=True)
df.describe()

Unnamed: 0,Category,Message
count,5157,5157
unique,2,5157
top,ham,"Go until jurong point, crazy.. Available only ..."
freq,4516,1


In [14]:
# Count the missing (null) values in each column.
df.isnull().sum()

Category    0
Message     0
dtype: int64

### Preprocessing complete; proceed with the tasks

# Task 1: Counting words

#### Tokenization of words using Regex tokenizer

#### Punctuation and numbers are removed for better identification of actual words.

In [25]:
# Tokenise every message into alphabetic words. The pattern matches runs of
# ASCII letters only, so digits and punctuation are dropped. As a side effect
# contractions are split at the apostrophe ("don't" -> "don", "t").
# RegexpTokenizer is imported in the notebook's import cell at the top.
tokenizer = RegexpTokenizer('[a-zA-Z]+')

# t1 holds, for each message, the list of word tokens found in it.
df['t1'] = df['Message'].apply(tokenizer.tokenize)
df.head()

Unnamed: 0,Category,Message,t1
0,ham,"Go until jurong point, crazy.. Available only ...","[Go, until, jurong, point, crazy, Available, o..."
1,ham,Ok lar... Joking wif u oni...,"[Ok, lar, Joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, in, a, wkly, comp, to, win, FA, ..."
3,ham,U dun say so early hor... U c already then say...,"[U, dun, say, so, early, hor, U, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[Nah, I, don, t, think, he, goes, to, usf, he,..."


In [27]:
# Classify every token by its first letter: vowel-initial versus the rest.
# Tokens contain letters only, so "the rest" means consonant-initial words.
vowels = ('a', 'e', 'i', 'o', 'u', 'A', 'E', 'I', 'O', 'U')
vowel_flags = [
    token.startswith(vowels)
    for token_list in df['t1']
    for token in token_list
]

vcount = sum(vowel_flags)             # each True contributes 1
ccount = len(vowel_flags) - vcount    # every remaining token

print('Total number of words starting with vowels:', vcount)
print('Total number of words starting with consonants:', ccount)

Total number of words starting with vowels: 19923
Total number of words starting with consonants: 60086


# Task 2: Capitalised words
#### We separate the data into ham and spam messages and, for each class, compute the percentage of words written entirely in upper case.

In [28]:
# Partition the messages by label so each class can be analysed on its own.
is_ham = df['Category'] == 'ham'
is_spam = df['Category'] == 'spam'
df_ham = df[is_ham]
df_spam = df[is_spam]

print('number of ham messages:', len(df_ham))
print('number of spam messages:', len(df_spam))

number of ham messages: 4516
number of spam messages: 641


In [38]:
# Ham messages: share of tokens written entirely in upper case.
ham_tokens = [token for token_list in df_ham['t1'] for token in token_list]

count = len(ham_tokens)                                        # all ham tokens
capcount = sum(1 for token in ham_tokens if token.isupper())   # upper-case ones

hpercent = capcount / count * 100
print('Percentage of capitalised word in ham messages: %f' % (hpercent), '%')

Percentage of capitalised word in ham messages: 6.656884 %


In [39]:
# Spam messages: share of tokens written entirely in upper case.
spam_tokens = [token for token_list in df_spam['t1'] for token in token_list]

count = len(spam_tokens)                                        # all spam tokens
capcount = sum(1 for token in spam_tokens if token.isupper())   # upper-case ones

spercent = capcount / count * 100
print('Percentage of capitalised word in spam messages: %f' % (spercent), '%')

Percentage of capitalised word in spam messages: 13.675624 %


# Task 3: Email IDs and Phone Numbers
#### Check original messages for email IDs and phone numbers

In [8]:
# Build the vocabulary: the set of distinct tokens across all messages.
# A single set is updated in place; concatenating lists row by row would copy
# the ever-growing list on each iteration (quadratic in the number of tokens).
vocab = set()
for token_list in df['t1']:
    vocab.update(token_list)

print('number of unique words: ', len(vocab))

number of unique words:  7785
