In [125]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [126]:
# Read only the two columns used below from the CSV (it is not UTF-8, hence
# latin-1), then rename the loaded columns to "label" and "message".
df = pd.read_csv(
    "spam.csv",
    usecols=["v1", "v2"],
    encoding="latin-1",
)
df.columns = ["label", "message"]

In [127]:
# Inputs are the raw message texts; targets are the "spam"/"ham" labels.
X, y = df["message"], df["label"]

# Hold out 25% of the messages for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# Bag-of-words vocabulary: English stop words removed, terms must appear in at
# least 1% of the training messages, and at most 300 terms are kept.
cv = CountVectorizer(stop_words='english', min_df=0.01, max_features=300)

# Learn the vocabulary from the training split only and encode it in the same
# step, then encode the held-out split with that same vocabulary.
X_train = cv.fit_transform(X_train)
X_test = cv.transform(X_test)

In [128]:
# Densify the sparse count matrices so individual rows can be indexed and
# summed directly. .toarray() already returns a new ndarray, so wrapping it in
# np.array() only made a second, redundant copy.
x_train = X_train.toarray()
x_test = X_test.toarray()

In [129]:
# (number of training messages, number of vocabulary terms kept by the vectorizer)
x_train.shape

(4179, 104)

In [130]:
# Turn both label collections into plain NumPy arrays so that the code below
# can address labels by row position.
y_train, y_test = (np.array(labels) for labels in (y_train, y_test))

In [131]:
# One row per class (row 0 = spam, row 1 = ham) and one column per vocabulary
# term. The width is read from the training matrix instead of being
# hard-coded, so the cell keeps working if the vectorizer settings or the
# data change the vocabulary size.
probability_matrix = np.zeros((2, x_train.shape[1]), dtype=float)

In [132]:
# Display the freshly allocated matrix; it has not been filled in yet.
probability_matrix

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.]])

In [133]:
# Total number of vocabulary-term occurrences over all training messages of
# each class. A boolean mask picks the rows of one class and .sum() adds up
# every count in them, replacing the row-by-row Python loops.
total_sum_spam = x_train[y_train == "spam"].sum()
total_sum_ham = x_train[y_train == "ham"].sum()

In [134]:
# Sum of all term counts over the training messages labelled "spam".
total_sum_spam

2174

In [135]:
# Sum of all term counts over the training messages labelled "ham".
total_sum_ham

7862

In [136]:
# Laplace-smoothed estimate of P(term | spam) for every vocabulary term:
#   (occurrences of the term in spam + alpha)
#   / (all term occurrences in spam + vocabulary size * alpha)
# Computed as one column-wise sum over the spam rows; this avoids the nested
# Python loops and no longer rebinds the builtin `sum` or the label variable
# `y` as loop temporaries.
alpha = 1
spam_term_counts = x_train[y_train == 'spam'].sum(axis=0)
probability_matrix[0] = (spam_term_counts + alpha) / (total_sum_spam + x_train.shape[1] * alpha)

In [137]:
# Laplace-smoothed estimate of P(term | ham) for every vocabulary term:
#   (occurrences of the term in ham + alpha)
#   / (all term occurrences in ham + vocabulary size * alpha)
# Computed as one column-wise sum over the ham rows; this avoids the nested
# Python loops and no longer rebinds the builtin `sum` or the label variable
# `y` as loop temporaries.
alpha = 1
ham_term_counts = x_train[y_train == 'ham'].sum(axis=0)
probability_matrix[1] = (ham_term_counts + alpha) / (total_sum_ham + x_train.shape[1] * alpha)

In [138]:
# Display the smoothed per-term probabilities (row 0 = spam, row 1 = ham).
probability_matrix

array([[0.02326602, 0.01931519, 0.00043898, 0.00043898, 0.00263389,
        0.00263389, 0.02414399, 0.03775241, 0.01755926, 0.00131694,
        0.01755926, 0.00043898, 0.00043898, 0.00877963, 0.00526778,
        0.00131694, 0.00043898, 0.00307287, 0.0048288 , 0.00043898,
        0.07638279, 0.00175593, 0.00395083, 0.00351185, 0.00395083,
        0.00043898, 0.00043898, 0.00921861, 0.00263389, 0.00570676,
        0.00131694, 0.00175593, 0.00219491, 0.02809482, 0.00877963,
        0.00087796, 0.00043898, 0.00131694, 0.00131694, 0.00175593,
        0.00526778, 0.00131694, 0.00043898, 0.00043898, 0.00351185,
        0.00043898, 0.00395083, 0.00307287, 0.00965759, 0.01273047,
        0.00307287, 0.04126427, 0.00219491, 0.00043898, 0.01931519,
        0.00395083, 0.0215101 , 0.00263389, 0.01009658, 0.00087796,
        0.00175593, 0.01492537, 0.00087796, 0.00175593, 0.00351185,
        0.02985075, 0.00087796, 0.03555751, 0.00087796, 0.00043898,
        0.00043898, 0.02326602, 0.00351185, 0.00

In [139]:
import math
def calc_probability(input_array, probability_matrix, Class, priors=(550/4179, 3629/4179)):
  """Return the unnormalised naive Bayes score of one message for one class.

  The score is ``priors[Class]`` multiplied by the product, over all
  vocabulary terms, of ``probability_matrix[Class][i] ** input_array[i]``.
  Scores for the two classes are only comparable to each other; divide each
  by their sum to obtain posterior probabilities.

  Args:
    input_array: 1-D sequence of term counts for a single message, one entry
      per column of ``probability_matrix``.
    probability_matrix: 2-D array whose row ``Class`` holds the per-term
      probabilities for that class.
    Class: class index, 0 for spam or 1 for ham.
    priors: ``(spam prior, ham prior)``. The default keeps the values that
      were previously hard-coded (presumably the spam/ham message counts of
      the training split over its size -- confirm if the data changes).

  Returns:
    The score as a float.

  Raises:
    ValueError: if ``Class`` is not 0 or 1, or if ``input_array`` does not
      have exactly one count per term of the selected matrix row.
  """
  if Class not in (0, 1):
    raise ValueError("Class must be 0 (spam) or 1 (ham), got " + repr(Class))

  counts = np.asarray(input_array)
  term_probs = np.asarray(probability_matrix[Class], dtype=float)
  # The length comes from the arguments themselves, not from a notebook
  # global, so the function also works on inputs built outside the training
  # cells; a mismatch is reported instead of being silently broadcast.
  if counts.shape != term_probs.shape:
    raise ValueError(
        "input_array has shape " + str(counts.shape)
        + " but the probability row has shape " + str(term_probs.shape))

  # NOTE: for messages with many terms this product can underflow towards 0.0;
  # summing logs would be more robust but would change the returned scale.
  likelihood = np.prod(np.power(term_probs, counts))
  return likelihood * priors[Class]

In [140]:
# Score one held-out message (row 39 of the test split) under both classes,
# normalise the two scores so they sum to 1, and compare the prediction with
# the true label.
x = 39
spam, ham = (calc_probability(x_test[x], probability_matrix, Class) for Class in (0, 1))
evidence = spam + ham
probability_spam, probability_ham = spam / evidence, ham / evidence
print("probability of spam:", probability_spam)
print("probability of ham:", probability_ham)
predicted_label = "spam" if probability_spam > probability_ham else "ham"
print("predicted result is: " + predicted_label)
print("actaul result:", y_test[x])

probability of spam: 0.9999507712897499
probability of ham: 4.922871025008726e-05
predicted result is: spam
actaul result: spam


In [141]:
# Classify every held-out message: each prediction starts as "ham" and is
# switched to "spam" only when the spam score is strictly larger.
pr = np.full(len(y_test), "ham", dtype=object)
for x in range(len(y_test)):
  spam_score = calc_probability(x_test[x], probability_matrix, 0)
  ham_score = calc_probability(x_test[x], probability_matrix, 1)
  if spam_score > ham_score:
    pr[x] = "spam"

In [142]:
from sklearn.metrics import accuracy_score
# Fraction of held-out messages whose predicted label equals the true label.
score = accuracy_score(y_true=y_test, y_pred=pr)

In [143]:
# Accuracy of the hand-written classifier on the held-out split.
score

0.9447236180904522

In [149]:
import pandas as pd

# Two hand-written messages used to try the classifier on unseen text.
data = {
    'message': [
        "once in a life opportunity is waiting for you",
        "hi what is your name",
    ],
}

# Wrap them in a one-column DataFrame and display it.
df_test = pd.DataFrame(data)
df_test

Unnamed: 0,message
0,once in a life opportunity is waiting for you
1,hi what is your name


In [150]:
# Encode the hand-written messages with the vocabulary already fitted on the
# training split (transform only -- the vectorizer is not refitted here).
test = cv.transform(df_test["message"])

In [151]:
# Encode the hand-written messages and densify them in one step. Building the
# array from df_test each time keeps this cell re-runnable: the previous
# `test = test.toarray()` failed on a second run because `test` was already
# a dense ndarray with no .toarray() method.
test = cv.transform(df_test["message"]).toarray()
test

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [152]:
# Score the first hand-written message under both classes, normalise the two
# scores so they sum to 1, and report the more probable class.
spam, ham = (calc_probability(test[0], probability_matrix, Class) for Class in (0, 1))
evidence = spam + ham
probability_spam, probability_ham = spam / evidence, ham / evidence
print("probability of spam:", probability_spam)
print("probability of ham:", probability_ham)
predicted_label = "spam" if probability_spam > probability_ham else "ham"
print("predicted result is: " + predicted_label)

probability of spam: 0.031120608335619394
probability of ham: 0.9688793916643806
predicted result is: ham
