In [None]:
import pandas as pd
import string
import numpy as np

In [None]:
# Load the raw SMS data; later cells read its `text` and `type` columns.
dataset = pd.read_csv('../input/spamraw.csv')

**PREPROCESSING**
The SMS text contains punctuation. We need to remove it to keep the implementation simple. I also convert the text to lower case so that word matching is not case-sensitive.

In [None]:
def text_preprocess(text):
    """Return *text* lower-cased with all ASCII punctuation removed.

    Args:
        text: The raw message string.

    Returns:
        A new string with every character of ``string.punctuation``
        deleted and the remainder converted to lower case.
    """
    # One pass with a translation table instead of one ``replace`` (plus a
    # redundant ``lower``) per punctuation character.
    return text.translate(str.maketrans('', '', string.punctuation)).lower()

Simply applying my function to all the text data in my dataset

In [None]:
# Replace every message with its normalised form (see `text_preprocess`).
dataset['text'] = dataset['text'].apply(text_preprocess)

I picked some words that, as far as I can see, are used mostly in spam texts. These words are **win**, **prize**, **award** and **free**.

In [None]:
# Hand-picked indicator words; each one gets its own `has<word>` column in the
# training subsets and its own branch in `predict`.
words = ['win','prize','award','free']

I divide my dataset into two sets: a train set (80% of the dataset) and a test set (20% of the dataset). I also divide my train set into two sets: a spam set and a nonspam set (I prefer to call 'ham' 'nonspam').

In [None]:
# Order-preserving (unshuffled) 80/20 split: the first 80% of the rows are
# used for training, the remaining rows for testing.
dataset_count = dataset.text.count()
train_size = int(dataset_count * 0.8)
# `.copy()` gives each subset its own data, so adding columns to it later does
# not raise pandas' SettingWithCopyWarning or depend on view-vs-copy behaviour.
train_set = dataset.head(train_size).copy()
test_set = dataset.tail(dataset_count - train_size).copy()
# Split the training rows by label ('ham' is what this notebook calls nonspam).
spam_set = train_set[train_set['type'] == 'spam'].copy()
nonspam_set = train_set[train_set['type'] == 'ham'].copy()

I count the sizes of the spam set and the nonspam set, to use these values for calculating the probabilities needed for the Bayesian approach

In [None]:
# Class sizes in the training data; later cells divide by these to obtain the
# per-class word frequencies and the class priors.
spam_count = spam_set['text'].count()
nonspam_count = nonspam_set['text'].count()
print('Spam count: ', spam_count, sep='')
print('Nonspam count: ', nonspam_count, sep='')

I implement a function to determine whether a given text contains a specified word or not

In [None]:
def check_word(word,text):
    """Tell whether *word* occurs in *text*.

    The check uses the ``in`` operator, so for strings it is a substring
    test (for example, 'win' is found in 'winner').

    Args:
        word: The word to look for.
        text: The text to search.

    Returns:
        True if *word* is contained in *text*, otherwise False.
    """
    return word in text

I check my four words if they are inside text or not and keep this result in my sets 

In [None]:
# Add one boolean `has<word>` column per keyword to both training subsets.
# `assign` builds a new frame instead of writing into a filtered slice of
# `train_set`, which avoids pandas' SettingWithCopyWarning.
contains_word = np.vectorize(check_word)  # built once, reused for every word
for i in words:
    spam_set = spam_set.assign(**{'has' + i: contains_word(i, spam_set.text)})
    nonspam_set = nonspam_set.assign(**{'has' + i: contains_word(i, nonspam_set.text)})

Let's check the sets if everything is OK

In [None]:
# Show the first rows of the spam training subset.
spam_set.head()

In [None]:
# Show the first rows of the nonspam training subset.
nonspam_set.head()

From here on, I know there is an easier way to calculate these, but I just hard-coded them

Calculating the probabilities for Bayes' rule, p(win|spam) and p(win|nonspam). The same is applied to the remaining three words

In [None]:
# Likelihood of 'win' in each class: the number of training messages of that
# class whose boolean `haswin` flag is set, divided by the class size.
p_win_spam = spam_set['haswin'].sum() / float(spam_count)
p_win_nonspam = nonspam_set['haswin'].sum() / float(nonspam_count)

In [None]:
# Likelihood of 'prize' in each class: the number of training messages of that
# class whose boolean `hasprize` flag is set, divided by the class size.
p_prize_spam = spam_set['hasprize'].sum() / float(spam_count)
p_prize_nonspam = nonspam_set['hasprize'].sum() / float(nonspam_count)

In [None]:
# Likelihood of 'award' in each class: the number of training messages of that
# class whose boolean `hasaward` flag is set, divided by the class size.
p_award_spam = spam_set['hasaward'].sum() / float(spam_count)
p_award_nonspam = nonspam_set['hasaward'].sum() / float(nonspam_count)

In [None]:
# Likelihood of 'free' in each class: the number of training messages of that
# class whose boolean `hasfree` flag is set, divided by the class size.
p_free_spam = spam_set['hasfree'].sum() / float(spam_count)
p_free_nonspam = nonspam_set['hasfree'].sum() / float(nonspam_count)

Calculating the probabilities p(spam) and p(nonspam)

In [None]:
# Class priors: the share of each class among all training messages.
p_spam = float(spam_count) / (spam_count + nonspam_count)
p_nonspam = float(nonspam_count) / (spam_count + nonspam_count)

Calculating the probabilities p(win) , p(prize) , p(award) and p(free) 

In [None]:
# Evidence terms p(word): the share of ALL training messages (spam and
# nonspam) that contain the word. Counting only the spam messages would give
# the joint probability p(word, spam) instead of p(word).
train_count = float(spam_count + nonspam_count)
p_win = (spam_set['haswin'].sum() + nonspam_set['haswin'].sum()) / train_count
p_prize = (spam_set['hasprize'].sum() + nonspam_set['hasprize'].sum()) / train_count
p_award = (spam_set['hasaward'].sum() + nonspam_set['hasaward'].sum()) / train_count
p_free = (spam_set['hasfree'].sum() + nonspam_set['hasfree'].sum()) / train_count

Calculating the probabilities p(spam|win) and p(nonspam|win), and doing the same for the remaining three words

In [None]:
# Bayes' rule for the word 'win': posterior = prior * likelihood / evidence.
p_spam_win = p_spam * p_win_spam / float(p_win)
p_nonspam_win = p_nonspam * p_win_nonspam / float(p_win)

In [None]:
# Bayes' rule for the word 'prize': likelihood * prior / evidence.
# The evidence term is p(prize), the probability of this word, not p(win).
p_spam_prize = (p_prize_spam * p_spam)/float(p_prize)
p_nonspam_prize = (p_prize_nonspam * p_nonspam)/float(p_prize)

In [None]:
# Bayes' rule for the word 'award': likelihood * prior / evidence.
# The evidence term is p(award), the probability of this word, not p(win).
p_spam_award = (p_award_spam * p_spam)/float(p_award)
p_nonspam_award = (p_award_nonspam * p_nonspam)/float(p_award)

In [None]:
# Bayes' rule for the word 'free': likelihood * prior / evidence.
# The evidence term is p(free), the probability of this word, not p(win).
p_spam_free = (p_free_spam * p_spam)/float(p_free)
p_nonspam_free = (p_free_nonspam * p_nonspam)/float(p_free)

Looking at my test set before the action :)

In [None]:
# Show the first rows of the test set.
test_set.head()

Implementing a function to predict whether a given text is spam or not. It checks whether the text contains any of the four words specified earlier. If it contains one of them, it compares the probabilities of being spam and nonspam for the first matching word. Otherwise, it simply assumes the text is nonspam: if you cannot predict something, you should not just block it, because the user wants to know what it is

In [None]:
def predict(text):
    """Label *text* as 'spam' or 'ham' from the first keyword it contains.

    The keywords are tested in the fixed order 'win', 'prize', 'award',
    'free'. For the first one found in *text* (substring test), the
    module-level posteriors for that word are compared and the label with
    the strictly larger spam posterior wins; ties go to 'ham'. A text with
    none of the keywords is labelled 'ham'.

    Args:
        text: The preprocessed message.

    Returns:
        The string 'spam' or 'ham'.
    """
    if 'win' in text:
        return 'spam' if p_spam_win > p_nonspam_win else 'ham'
    if 'prize' in text:
        return 'spam' if p_spam_prize > p_nonspam_prize else 'ham'
    if 'award' in text:
        return 'spam' if p_spam_award > p_nonspam_award else 'ham'
    if 'free' in text:
        return 'spam' if p_spam_free > p_nonspam_free else 'ham'
    # No keyword present: nothing to base a spam verdict on.
    return 'ham'

Predicting...

In [None]:
# Classify every test message. `assign` returns a new frame carrying the
# `predict` column, so nothing is written into the `dataset.tail(...)` slice
# (which would trigger pandas' SettingWithCopyWarning).
test_set = test_set.assign(predict=test_set.text.apply(predict))

* Implementing a function to keep result of prediction in the set

In [None]:
def result_prediction(ideal,predict):
    """Return whether the predicted label matches the true label.

    Args:
        ideal: The true label.
        predict: The predicted label.

    Returns:
        True if the two labels compare equal, otherwise False.
    """
    return bool(ideal == predict)

Before calculating accuracy, let's look at test set

In [None]:
# Show the first 15 rows of the test set.
test_set.head(15)

Applying prediction result function

In [None]:
# Flag each test row as correctly (True) or incorrectly (False) predicted.
# `assign` returns a new frame, so nothing is written into the
# `dataset.tail(...)` slice (which would trigger SettingWithCopyWarning).
test_set = test_set.assign(
    result=np.vectorize(result_prediction)(test_set['type'], test_set['predict'])
)

In [None]:
# Show the first rows of the test set.
test_set.head()

Counting True and False results

In [None]:
# Tally correct and incorrect predictions from the boolean `result` column.
true_pre = test_set['result'].sum()
false_pre = (~test_set['result']).sum()

Calculating accuracy score

In [None]:
# Accuracy = correct predictions / all predictions on the test set.
accuracy_score = true_pre / float(true_pre + false_pre)
print('Accuracy Score: ', accuracy_score, sep='')

**NOTE**

This kernel is for beginners, to give them an idea about the topic. Here I declared my own words, like 'win' or 'award'. In reality, however, a system should detect new words and update its word set regularly to increase performance.