In [None]:
# -*- coding: utf-8 -*-

# Importing the libraries
import io
import os
from math import exp
from math import pi
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
from pandas import DataFrame
from pandas import concat
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


In [None]:
#NAIVE BAYES ALGORITHM
# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset):
	"""Group the rows of ``dataset`` by the value stored in their last position.

	Returns a dict mapping each distinct last-element value to the list of
	rows that carry it, in the order the rows appear in ``dataset``.
	"""
	separated = {}
	for vector in dataset:
		# Create the bucket on first sight of a class value, then add the row.
		separated.setdefault(vector[-1], []).append(vector)
	return separated
 
# Calculate the mean of a list of numbers
def mean(numbers):
	"""Return the sum of ``numbers`` divided by their count (as a float)."""
	count = float(len(numbers))
	return sum(numbers) / count
 
# Calculate the standard deviation of a list of numbers
def stdev(numbers):
	"""Return the sample standard deviation of ``numbers``.

	The summed squared deviations from the mean are divided by
	``len(numbers) - 1``, so a single-element input raises ZeroDivisionError.
	"""
	center = mean(numbers)
	squared_deviations = [(value - center) ** 2 for value in numbers]
	variance = sum(squared_deviations) / float(len(numbers) - 1)
	return sqrt(variance)
 
# Calculate the mean, stdev and count for each column in a dataset
def summarize_dataset(dataset):
	"""Compute ``(mean, stdev, count)`` for every column except the last.

	``dataset`` is a sequence of equally long rows whose last element is the
	class label. The label column is dropped *before* any statistics are
	computed, so labels do not have to be numeric (e.g. ``'spam'``/``'ham'``)
	and no work is wasted on a column whose summary is discarded anyway.

	Returns a list with one ``(mean, stdev, count)`` tuple per feature column,
	or an empty list when ``dataset`` has no rows.
	"""
	# Transpose rows into columns; the final column holds the class labels.
	columns = list(zip(*dataset))
	feature_columns = columns[:-1]
	return [(mean(column), stdev(column), len(column)) for column in feature_columns]
 
# Split dataset by class then calculate statistics for each column
def summarize_by_class(dataset):
	"""Return per-class column statistics for ``dataset``.

	The rows are grouped by class value with ``separate_by_class`` and each
	group is summarized with ``summarize_dataset``; the result maps every
	class value to that group's list of column summaries.
	"""
	return {
		label: summarize_dataset(class_rows)
		for label, class_rows in separate_by_class(dataset).items()
	}
 
# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
	"""Evaluate the Gaussian probability density at ``x``.

	``mean`` and ``stdev`` are the distribution parameters. A ``stdev`` of
	zero results in a division by zero.
	"""
	squared_deviation = (x - mean) ** 2
	twice_variance = 2 * stdev ** 2
	exponent = exp(-(squared_deviation / twice_variance))
	normalizer = sqrt(2 * pi) * stdev
	return (1 / normalizer) * exponent
 
# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
	"""Score ``row`` against every class described in ``summaries``.

	``summaries`` maps each class value to a list of ``(mean, stdev, count)``
	tuples, one per feature column. For each class the score starts at the
	class prior (its row count divided by the total row count) and is then
	multiplied by the Gaussian density of every feature value in ``row``.
	The returned scores are not normalized to sum to one.
	"""
	# The count stored with the first column is the number of rows in that class.
	total_rows = sum(class_summaries[0][2] for class_summaries in summaries.values())
	probabilities = {}
	for class_value, class_summaries in summaries.items():
		score = class_summaries[0][2] / float(total_rows)
		for index, (mu, sigma, _count) in enumerate(class_summaries):
			score *= calculate_probability(row[index], mu, sigma)
		probabilities[class_value] = score
	return probabilities
 
# Test calculating class probabilities
# Each row holds two numeric features followed by its class label (0 or 1).
dataset = [
	[3.393533211, 2.331273381, 0],
	[3.110073483, 1.781539638, 0],
	[1.343808831, 3.368360954, 0],
	[3.582294042, 4.67917911, 0],
	[2.280362439, 2.866990263, 0],
	[7.423436942, 4.696522875, 1],
	[5.745051997, 3.533989803, 1],
	[9.172168622, 2.511101045, 1],
	[7.792783481, 3.424088941, 1],
	[7.939820817, 0.791637231, 1],
]

# Fit the per-class statistics, then score the first row against each class.
summaries = summarize_by_class(dataset)
probabilities = calculate_class_probabilities(summaries, dataset[0])
print(probabilities)

{0: 0.05032427673372076, 1: 0.00011557718379945765}


In [None]:
# Creating functions to read data
def readData(path):
    """Yield ``(file_path, message)`` for every file found under ``path``.

    The directory tree rooted at ``path`` is walked recursively. Each file is
    read as latin1 text; everything up to and including the first empty line
    is skipped, and the remaining lines form the message.

    Parameters:
        path: root directory to scan.

    Yields:
        Tuples of the file's full path and its message text.
    """
    for root, dirsnames, filenames in os.walk(path):
        for filename in filenames:
            file_path = os.path.join(root, filename)

            body_lines = []
            in_body = False
            # ``with`` guarantees the handle is closed even if reading fails.
            with io.open(file_path, 'r', encoding='latin1') as f:
                for line in f:
                    if in_body:
                        body_lines.append(line)
                    elif line == '\n':
                        # First empty line: everything after it is the body.
                        in_body = True

            # Lines keep their trailing newline, so joining with '\n' leaves
            # an empty line between them. Kept as-is so the message text
            # stays identical to what earlier runs produced.
            yield file_path, '\n'.join(body_lines)

def getData(path, classification):
    """Load every message under ``path`` into a DataFrame.

    Each message yielded by ``readData(path)`` becomes one row with the
    message text in ``'message'`` and ``classification`` in ``'class'``.
    """
    records = [
        {'message': message, 'class': classification}
        for _filename, message in readData(path)
    ]
    return DataFrame(records)

def readFromData(data_train):
    """Rebuild a ``'message'``/``'class'`` DataFrame from ``data_train``.

    The two columns are read as arrays and paired up row by row, so the
    returned frame holds the same values under a fresh default index.
    """
    messages = data_train['message'].values
    labels = data_train['class'].values
    records = [
        {'message': text, 'class': label}
        for text, label in zip(messages, labels)
    ]
    return DataFrame(records)
    
# Empty frame with the two columns used throughout: 'message' and 'class'.
data = DataFrame({'message': [], 'class': []})
# Disabled leftover lines (not executed):
#p=" "
#dir = os.path.dirname(os.path.realpath(p))



In [None]:
# Load every spam email into a frame labelled 'spam'.
# getData already returns a DataFrame with 'message' and 'class' columns, so it
# is used directly instead of being appended to an empty frame
# (DataFrame.append was removed in pandas 2.0).
data_spam = getData("/content/drive/My Drive/emails/spam", 'spam')



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    Mon, 23 Sep 2002 16:24:10 +0100

Message-Id: <200209231524.QAA15255@webnote.net>

Received: from 184.244.108.80 ([184.244.108.80]) by rly-xr02.mx.aol.com

    with SMTP; Sep, 23 2002 9:13:52 AM +0300

Received: from unknown (6.61.10.17) by rly-xr02.mx.aol.com with NNFMP;

    Sep, 23 2002 8:13:09 AM -0200

Received: from 30.215.79.204 ([30.215.79.204]) by m10.grp.snv.yahoo.com

    with SMTP; Sep, 23 2002 7:06:08 AM +0700

Received: from ssymail.ssy.co.kr ([115.212.44.160]) by hd.regsoft.net with

    asmtp; Sep, 23 2002 6:25:55 AM +1100

From: "iaic_adv@hellerwhirligigs.com" <hlbi_adv@hellerwhirligigs.com>

To: zzzz@spamassassin.taint.org

Cc: 

Subject: Garden Ornaments | ppu

Sender: "iaic_adv@hellerwhirligigs.com" <hlbi_adv@hellerwhirligigs.com>

MIME-Version: 1.0

Content-Type: text/plain; charset="iso-8859-1"

Date: Mon, 23 Sep 2002 09:26:16 -0600

X-Mailer: Microsoft Outlook Express 5.50.4522.1200



Our deligh

In [None]:
# Load every ham (non-spam) email into a frame labelled 'ham'.
# getData already returns a DataFrame with 'message' and 'class' columns, so it
# is used directly instead of being appended to an empty frame
# (DataFrame.append was removed in pandas 2.0).
data_ham = getData("/content/drive/My Drive/emails/ham", 'ham')



[1;30;43mStreaming output truncated to the last 5000 lines.[0m

(a) Criminal Proceedings.--No criminal proceeding shall be maintained

under the provisions of this title unless it is commenced within five

years after the cause of action arose.



(b) Civil Actions.--No civil action shall be maintained under the

provisions of this title unless it is commenced within three years after

the claim accrued.



The penalties are too extensive to list here, but they can be found in

Section 2319: Criminal infringement of a copyright. In general, first-time

criminal offenses will carry a maximum prison sentence of 1 year.



I'm still not sure where the DOJ would start in choosing people to

prosecute because of the aforementioned "schooling" effect, but my guess

would be that, just like speeding, primarily the most prominent

individuals who operate large servers or transfer the most data will be

targeted in order to discourage more recreational file sharers. Thanks to

MonaLisaOverdri

In [None]:
# Hold out 20% of each class for testing. Spam and ham are split separately so
# both classes are represented in the train and test sets.
# A fixed random_state makes the split, and therefore the reported metrics,
# reproducible from one run to the next.
data_spam_train, data_spam_test = train_test_split(data_spam, test_size=0.2, random_state=42)
data_ham_train, data_ham_test = train_test_split(data_ham, test_size=0.2, random_state=42)



In [None]:
# Full corpus: spam rows followed by ham rows.
# The frames loaded in the earlier cells are reused rather than reading every
# file from disk a second time, and concat replaces DataFrame.append (removed
# in pandas 2.0). Building from data_spam/data_ham also keeps this cell
# idempotent: re-running it no longer grows `data`.
data = concat([data_spam, data_ham])



[1;30;43mStreaming output truncated to the last 5000 lines.[0m

(a) Criminal Proceedings.--No criminal proceeding shall be maintained

under the provisions of this title unless it is commenced within five

years after the cause of action arose.



(b) Civil Actions.--No civil action shall be maintained under the

provisions of this title unless it is commenced within three years after

the claim accrued.



The penalties are too extensive to list here, but they can be found in

Section 2319: Criminal infringement of a copyright. In general, first-time

criminal offenses will carry a maximum prison sentence of 1 year.



I'm still not sure where the DOJ would start in choosing people to

prosecute because of the aforementioned "schooling" effect, but my guess

would be that, just like speeding, primarily the most prominent

individuals who operate large servers or transfer the most data will be

targeted in order to discourage more recreational file sharers. Thanks to

MonaLisaOverdri

In [None]:
# Join data
# Training set: spam rows followed by ham rows.
# concat replaces DataFrame.append, which was removed in pandas 2.0.
data_train = concat([
    readFromData(data_spam_train),
    readFromData(data_ham_train),
])

# Test set, assembled the same way from the held-out rows.
data_test = concat([
    readFromData(data_spam_test),
    readFromData(data_ham_test),
])

In [None]:
# Initialize classifier and vectorizer
classifier = MultinomialNB()  # scikit-learn's multinomial Naive Bayes (not the hand-written Gaussian functions defined earlier)
vectorizer = CountVectorizer()  # feature extraction: turns message text into token counts

In [None]:
# Train data
# Fit the vectorizer on the training messages and transform them in one step.
counts = vectorizer.fit_transform(data_train['message'].values)
# Class labels taken from the same frame, so they line up row-for-row with `counts`.
targets = data_train['class'].values
# Fit the classifier on the vectorized messages and their labels.
classifier.fit(counts, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# Predict Data and confusion matrix
# Vectorize the test messages with the fitted vectorizer and classify them.
predictions = classifier.predict(
    vectorizer.transform(data_test['message'].values)
)

# Compare the predictions against the true test labels.
cm = confusion_matrix(data_test['class'].values, predictions)
print("Confusion Matrix:")
print(cm)

# Accuracy, expressed as a percentage
accuracy = accuracy_score(data_test['class'].values, predictions) * 100
print("Accuracy : ",accuracy,"%")


Confusion Matrix:
[[499   1]
 [ 26  74]]
Accuracy :  95.5 %


In [None]:
# Operations on confusion matrix
def matrixcm(cm):
    """Return the entries of the 2-D array ``cm`` as a flat list.

    Entries are visited in the order produced by ``np.ndindex(cm.shape)``,
    i.e. row by row.
    """
    return [cm[row, col] for row, col in np.ndindex(cm.shape)]

# calculating precision using confusion matrix
def precision(TP,FP):
    """Return the precision ``TP / (TP + FP)``.

    Parameters:
        TP: number of true positives.
        FP: number of false positives.

    Returns:
        The precision, or 0.0 when ``TP + FP`` is zero (nothing was predicted
        positive), instead of dividing by zero.
    """
    predicted_positive = TP + FP
    if predicted_positive == 0:
        # Precision is undefined without positive predictions; report 0.0.
        return 0.0
    return TP / predicted_positive

# Calculating recall using confusion matrix
def recal(TP,FN):
    """Return the recall ``TP / (TP + FN)``.

    Parameters:
        TP: number of true positives.
        FN: number of false negatives.

    Returns:
        The recall, or 0.0 when ``TP + FN`` is zero (there were no actual
        positives), instead of dividing by zero.
    """
    actual_positive = TP + FN
    if actual_positive == 0:
        # Recall is undefined without actual positives; report 0.0.
        return 0.0
    return TP / actual_positive

# Calculating F1 score using confusion matrix
def f1scre(pr,re):
    """Return the F1 score, the harmonic mean of precision and recall.

    Computed as ``2 * (re * pr) / (re + pr)``.

    Parameters:
        pr: precision.
        re: recall.

    Returns:
        The F1 score, or 0.0 when ``pr + re`` is zero, instead of dividing
        by zero.
    """
    denominator = re + pr
    if denominator == 0:
        # Both inputs are zero, so the harmonic mean is undefined; report 0.0.
        return 0.0
    return 2*(re * pr) / denominator

# confusion matrix
# Re-run the classifier on the test set and rebuild the confusion matrix.
predictions = classifier.predict(
    vectorizer.transform(data_test['message'].values)
)
cm = confusion_matrix(data_test['class'].values, predictions)

# Flatten the matrix row by row and name its four cells:
# true negative, false positive, false negative, true positive.
l = matrixcm(cm)
TN, FP, FN, TP = l[0], l[1], l[2], l[3]

# Precision, recall and F1 score derived from the confusion-matrix cells
precise = precision(TP, FP)
sensitive = recal(TP, FN)
f1 = f1scre(precise, sensitive)

# Accuracy, expressed as a percentage
accuracy = accuracy_score(data_test['class'].values, predictions) * 100



# Driver program

# Ask how many emails to classify, then read that many emails from the user.
nmbrofmail = int(input("How many emails you wanna scan? : "))
mails = [input("\nEnter mail : ") for _ in range(nmbrofmail)]

# Classify each entered email as spam or ham.
# scikit-learn rejects an input with zero samples, so classification is
# skipped when no email was entered (count of 0 or less).
if mails:
    predictions = classifier.predict(vectorizer.transform(mails))
else:
    predictions = []
print("\nClassification of emails in their sequence : ",predictions)
print("\n")
print("----------------------------------------------------")
print("\n")

# Report the test-set metrics computed earlier.
print("Confusion Matrix:")
print(cm)
print("Accuracy : ",accuracy,"%")
print("Precision : ",precise)
print("Recall : ",sensitive)
print("F1 Score : ",f1)

How many emails you wanna scan? : 1

Enter mail : View this email in your browser   Log in to your site ›  WordPress.com Newsletter Your weekly dose of marketing tips,  tutorials, and inspiration!  Six ways to monetize your site Six Ways to Monetize Your Site  Looking to turn your website into a reliable source of income? Discover our top six ways you can start earning money from your fans and followers right away.  Discover ways to earn Alternate text Monetize Your Site  Access advanced monetization tools when you upgrade to a Premium, Business, or eCommerce plan!  View plans   Earn with simple payments   Collect One-Time and Recurring Payments   Create a reliable revenue stream from your visitors and fans. Powered by Stripe, you can collect one-time, monthly, or yearly credit and debit card payments and turn your site visitors into clients, customers, and financial supporters.  Earn with Simple Payments   Charge for access to exclusive content   Charge for Access to Exclusive Content