6610402230 ศิริสุข ทานธรรม

In [1]:
import pandas as pd
import numpy as np

sms_spam_data_set = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['Label', 'SMS'])
print("Examples of the data samples \n", sms_spam_data_set.head(3), "\n")
print("Dimension of the data set:\n", sms_spam_data_set.shape, "\n")
print("Distribution of the data set:\n", sms_spam_data_set['Label'].value_counts(normalize=True), "\n")

Examples of the data samples 
   Label                                                SMS
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina... 

Dimension of the data set:
 (5572, 2) 

Distribution of the data set:
 Label
ham     0.865937
spam    0.134063
Name: proportion, dtype: float64 



In [2]:
#Perform train/test split
sms_texts, labels = sms_spam_data_set.SMS, sms_spam_data_set.Label

from sklearn.model_selection import train_test_split
sms_texts_train, sms_texts_test, labels_train, labels_test = train_test_split(sms_texts, labels, test_size=0.2, random_state=123)

sms_texts_train = sms_texts_train.reset_index(drop=True)
labels_train = labels_train.reset_index(drop=True)

sms_texts_test = sms_texts_test.reset_index(drop=True)
labels_test = labels_test.reset_index(drop=True)

print("Distribtuion of the training data set:\n", labels_train.value_counts(normalize=True),  labels_train.shape[0], "\n")

print("Distribtuion of the testing data set:\n", labels_test.value_counts(normalize=True), labels_test.shape[0], "\n")

Distribtuion of the training data set:
 Label
ham     0.866726
spam    0.133274
Name: proportion, dtype: float64 4457 

Distribtuion of the testing data set:
 Label
ham     0.86278
spam    0.13722
Name: proportion, dtype: float64 1115 



In [3]:
#Function to convert input sms texts to feature vectors using BoW representation
def smsTextsToVectors(sms_texts):
    sms_texts = sms_texts.str.replace('\W', ' ') #Remove punctuation
    sms_texts = sms_texts.str.lower()
    sms_texts = sms_texts.str.split()

    vocabulary = []
    for sms in sms_texts:
        for word in sms:
            vocabulary.append(word)
        
    vocabulary = list(set(vocabulary))
    
    word_counts_per_sms = {unique_word: [0] * len(sms_texts) for unique_word in vocabulary}
    
    for index, sms in enumerate(sms_texts):
        for word in sms:
            word_counts_per_sms[word][index] += 1
            
    return word_counts_per_sms, vocabulary

word_counts_per_sms, vocabulary  = smsTextsToVectors(sms_texts_train) 
x_train = pd.DataFrame(word_counts_per_sms)
print("Features (the number of all possible words in the trainning data):\n", len(vocabulary), "\n")

training_data_set = pd.concat([labels_train, sms_texts_train, x_train], axis=1)


  sms_texts = sms_texts.str.replace('\W', ' ') #Remove punctuation


Features (the number of all possible words in the trainning data):
 11789 



In [4]:
print("Examples of the training data \n", training_data_set.head(3), "\n")

Examples of the training data 
   Label                                                SMS  tcs,  satisfy  \
0  spam  Double mins and txts 4 6months FREE Bluetooth ...     0        0   
1   ham  Did you get any gift? This year i didnt get an...     0        0   
2   ham  Ever green quote ever told by Jerry in cartoon...     0        0   

   crickiting  is..  foot  ?1,000  lily  penny  ...  depends.  those!  \
0           0     0     0       0     0      0  ...         0       0   
1           0     0     0       0     0      0  ...         0       0   
2           0     0     0       0     0      0  ...         0       0   

   recreation  sup,  typical  'doctors'  walk  ecstasy?  before?  motor  
0           0     0        0          0     0         0        0      0  
1           0     0        0          0     0         0        0      0  
2           0     0        0          0     0         0        0      0  

[3 rows x 11791 columns] 



In [5]:
x_train_spam = x_train[labels_train == 'spam']
x_train_ham = x_train[labels_train == 'ham']

#Estimate P(y=spam) and P(y=ham)
p_spam = len(x_train_spam)/len(x_train)
print("Our estimate of P(y=spam) is ", p_spam)

p_ham = len(x_train_ham)/len(x_train)
print("Our estimate of P(y=ham) is ", p_ham)



Our estimate of P(y=spam) is  0.13327350235584473
Our estimate of P(y=ham) is  0.8667264976441552


In [6]:
print(x_train.shape,len(vocabulary))
print(x_train[vocabulary[0]].sum())

(4457, 11789) 11789
2


In [7]:
#Initiate parameters
theta_spam = {unique_word:0 for unique_word in vocabulary}
theta_ham =  {unique_word:0 for unique_word in vocabulary}

#Estimate the probability distribution of selecting each word
# uncomment to implement the following
sum_spam_frequency = x_train_spam.sum(axis=1).sum()
sum_ham_frequency = x_train_ham.sum(axis=1).sum()
for word in vocabulary:
    # uncomment to implement the following
    sum_word_spam_frequency = x_train_spam[word].sum()
    theta_spam[word] = sum_word_spam_frequency/sum_spam_frequency
    # uncomment to implement the following
    sum_word_ham_frequency = x_train_ham[word].sum()
    theta_ham[word] = sum_word_ham_frequency/sum_ham_frequency

In [8]:
theta_spam,theta_ham

({'tcs,': 0.00014161297174821214,
  'satisfy': 0.00014161297174821214,
  'crickiting': 0.0,
  'is..': 0.0,
  'foot': 0.0,
  '?1,000': 7.080648587410607e-05,
  'lily': 7.080648587410607e-05,
  'penny': 0.0,
  'score': 0.0,
  'mila,': 0.00014161297174821214,
  'heat': 0.0,
  'through.': 0.0,
  'english': 0.0,
  'catches': 0.0,
  'lesson': 0.0,
  'thesedays': 0.0,
  'code:': 0.0006372583728669546,
  'rememberi': 0.0,
  'confirmed...': 0.0,
  'jeetey': 0.0,
  'see': 0.001062097288111591,
  'heads': 0.0,
  'radio': 0.0,
  'dhorte': 0.0,
  'moments': 0.0,
  'already,': 0.0,
  'retired': 0.0,
  'perform,': 0.0,
  'whereare': 0.0,
  'coach': 0.0,
  'emigrated': 0.0,
  '(get': 7.080648587410607e-05,
  'ibh': 7.080648587410607e-05,
  'book...': 0.0,
  'da:-)i': 0.0,
  'gamestar': 0.00014161297174821214,
  'itxt': 0.0,
  'eruku.': 0.0,
  'requests.': 0.0,
  'yourself,': 0.0,
  'zac': 0.0,
  'join...': 0.0,
  'punish': 0.0,
  'dan': 0.0,
  'humanities': 0.0,
  '1000': 0.0002124194576223182,
  'blo

In [9]:
#implment Naive Bayes classifier
import re, math
def textToVector(message):
    message = re.sub('\W', ' ', message) #Remove punctuation
    message = message.lower().split()

    vocabulary = []
    for word in  message:
        vocabulary.append(word)
        
    vocabulary = list(set(vocabulary))
    
    word_counts = {unique_word: 0 for unique_word in vocabulary}
    
    for word in message:
            word_counts[word] += 1
            
    return word_counts, vocabulary

def naive_bayes_classify(sms_text):
    x_test, vocabulary_test = textToVector(sms_text)
    
    # uncomment to implement the following
    sum_frequency = sum(x_test.values())
    log_p_spam_given_sms = [ math.comb(sum_frequency,x_test[word]) * pow((theta_spam[word] if word in theta_spam else 1e-10),x_test[word]) for word in vocabulary_test]
    log_p_spam_given_sms = sum([ math.log10(x if x != 0 else 1e-10) for x in log_p_spam_given_sms])
    
    log_p_ham_given_sms = [ math.comb(sum_frequency,x_test[word]) * pow((theta_ham[word] if word in theta_ham else 1e-10),x_test[word]) for word in vocabulary_test]
    log_p_ham_given_sms = sum([ math.log10(x if x != 0 else 1e-10) for x in log_p_ham_given_sms])

    print('Estimate of log(P(SPAM|message=',  sms_text, ')) =', log_p_spam_given_sms)
    print('Estimate of log(P(HAM|message=',  sms_text, ')) =', log_p_ham_given_sms)
    isSpam = True
    if(log_p_spam_given_sms > log_p_ham_given_sms):
        isSpam = True
    else:
        isSpam = False
    return isSpam
     

  message = re.sub('\W', ' ', message) #Remove punctuation


In [10]:
# x_test, vocabulary_test = textToVector("Sounds good, Tom, then u there")
x_test, vocabulary_test = textToVector("WINNER!! This is the secret code to unlock the money: C3421.")

In [11]:
sum(x_test.values())

11

In [12]:
vocabulary_test

['unlock',
 'c3421',
 'money',
 'to',
 'the',
 'winner',
 'secret',
 'this',
 'code',
 'is']

In [13]:
sum_frequency = sum(x_test.values())
word = vocabulary_test[-1]
print(word)
# sum([math.log10(math.comb(sum_frequency,x_test[word]) * pow(theta_spam[word],x_test[word])) for word in vocabulary_test if word in theta_spam])
# X = [theta_spam[word] for word in vocabulary_test if word in theta_spam]
# X = [math.log10(x) for x in X if x != 0]
X = [ math.comb(sum_frequency,x_test[word]) * pow((theta_spam[word] if word in theta_spam else 1e-10),x_test[word]) for word in vocabulary_test]
X = [ math.log10(x if x != 0 else 1e-10) for x in X]
X

is


[-8.958607314841775,
 -8.958607314841775,
 -2.631413019235703,
 -0.38262264166031723,
 -2.1676919239144694,
 -2.1542917645160404,
 -2.205444286963422,
 -1.2889903384134966,
 -1.9945909216485285,
 -1.0293530279077405]

In [14]:
p_spam

0.13327350235584473

In [15]:
print(naive_bayes_classify("WINNER!! This is the secret code to unlock the money: C3421."))
print(naive_bayes_classify("Sounds good, Tom, then u there"))

Estimate of log(P(SPAM|message= WINNER!! This is the secret code to unlock the money: C3421. )) = -31.771612553943267
Estimate of log(P(HAM|message= WINNER!! This is the secret code to unlock the money: C3421. )) = -41.74157740294062
True
Estimate of log(P(SPAM|message= Sounds good, Tom, then u there )) = -28.545194060554188
Estimate of log(P(HAM|message= Sounds good, Tom, then u there )) = -13.630981900746471
False


In [16]:
def score(sms_texts, labels):
    mistakes = 0
    for i, message in enumerate(sms_texts):
        isSpam = naive_bayes_classify(message)
        if isSpam and labels[i] != "spam":
            mistakes += 1
        elif not isSpam and labels[i] == "spam":
            mistakes += 1
    return (len(sms_texts)-mistakes)/len(sms_texts)
    

In [17]:
#Calculate loss on training data
print("Training accuracy:", score(sms_texts_train, labels_train))




Estimate of log(P(SPAM|message= Double mins and txts 4 6months FREE Bluetooth on Orange. Available on Sony, Nokia Motorola phones. Call MobileUpd8 on 08000839402 or call2optout/N9DX )) = -48.004957469730535
Estimate of log(P(HAM|message= Double mins and txts 4 6months FREE Bluetooth on Orange. Available on Sony, Nokia Motorola phones. Call MobileUpd8 on 08000839402 or call2optout/N9DX )) = -100.20471019558012
Estimate of log(P(SPAM|message= Did you get any gift? This year i didnt get anything. So bad )) = -40.04166246757207
Estimate of log(P(HAM|message= Did you get any gift? This year i didnt get anything. So bad )) = -21.548912697985955
Estimate of log(P(SPAM|message= Ever green quote ever told by Jerry in cartoon "A Person Who Irritates u Always Is the one Who Loves u Vry Much But Fails to Express It...!..!! :-) :-) gud nyt )) = -153.3733629686831
Estimate of log(P(HAM|message= Ever green quote ever told by Jerry in cartoon "A Person Who Irritates u Always Is the one Who Loves u Vry

In [18]:
#Calculate generalization loss
print("Generalization accuracy:", score(sms_texts_test, labels_test))

Estimate of log(P(SPAM|message= Aight text me when you're back at mu and I'll swing by, need somebody to get the door for me )) = -64.94299898648818
Estimate of log(P(HAM|message= Aight text me when you're back at mu and I'll swing by, need somebody to get the door for me )) = -31.404957026780426
Estimate of log(P(SPAM|message= Our Prashanthettan's mother passed away last night. pray for her and family. )) = -72.41036550579734
Estimate of log(P(HAM|message= Our Prashanthettan's mother passed away last night. pray for her and family. )) = -35.02679654703261
Estimate of log(P(SPAM|message= No it will reach by 9 only. She telling she will be there. I dont know )) = -49.66568049680433
Estimate of log(P(HAM|message= No it will reach by 9 only. She telling she will be there. I dont know )) = -24.489750379861878
Estimate of log(P(SPAM|message= Do you know when the result. )) = -18.889455734828694
Estimate of log(P(HAM|message= Do you know when the result. )) = -10.41827205229366
Estimate of l