<a href="https://colab.research.google.com/github/rajiul123/Email_classification/blob/main/email_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import os
import codecs

In [5]:
def read_in(folder):
  files = os.listdir(folder)
  a_list = []
  for a_file in files:
    with codecs.open(os.path.join(folder, a_file), "r", encoding="ISO-8859-1", errors="ignore") as f:
      a_list.append(f.read())

  return a_list

In [6]:
ham_list = read_in("ham/")
spam_list = read_in("spam/")

print(len(ham_list))
print(len(spam_list))

1000
760


In [7]:
import random

all_emails = [(email_content, "spam") for email_content in spam_list]
all_emails += [(email_content, "ham") for email_content in ham_list]

random.seed(42)
random.shuffle(all_emails)

print(len(all_emails))

1760


In [12]:
import nltk
from nltk import word_tokenize
nltk.download("punkt_tab")

def get_features(text):
  features = {}
  word_list = [word for word in word_tokenize(text.lower())]
  for word in word_list:
    features[word] = True
  return features

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [15]:
all_features = [(get_features(email), label) for (email, label) in all_emails]
print(len(all_features))
print(len(all_features[0]))
print(len(all_features[0][0]))

1760
2
136


In [16]:
from nltk import NaiveBayesClassifier, classify

def train(features, proportion):
  train_set = features[:int(len(features)*proportion)]
  test_set = features[int(len(features)*proportion):]
  print(len(train_set))
  print(len(test_set))
  classifier = NaiveBayesClassifier.train(train_set)
  return train_set, test_set, classifier

In [17]:
train_set, test_set, classifier = train(all_features, 0.8)

1408
352


In [22]:
def evaluate(train_set, test_set, classifier):
  print(f"Train Accuracy: {classify.accuracy(classifier, train_set)}")
  print(f"Test Accuracy: {classify.accuracy(classifier, test_set)}")
  classifier.show_most_informative_features(50)

In [23]:
evaluate(train_set, test_set, classifier)

Train Accuracy: 0.9886363636363636
Test Accuracy: 0.9801136363636364
Most Informative Features
                      cc = True              ham : spam   =     74.0 : 1.0
                      pm = True              ham : spam   =     43.5 : 1.0
                  volume = True              ham : spam   =     41.9 : 1.0
                     ami = True              ham : spam   =     38.6 : 1.0
                 changes = True              ham : spam   =     37.9 : 1.0
                attached = True              ham : spam   =     36.1 : 1.0
               microsoft = True             spam : ham    =     35.4 : 1.0
                 windows = True             spam : ham    =     34.5 : 1.0
                pipeline = True              ham : spam   =     30.2 : 1.0
                   paste = True             spam : ham    =     30.2 : 1.0
                     gas = True              ham : spam   =     29.2 : 1.0
                  jackie = True              ham : spam   =     28.2 : 1.0
     

In [24]:
from nltk.text import Text

def concordance(text, search_word):
  for email in text:
    word_list = [word for word in word_tokenize(email.lower())]
    text_list = Text(word_list)
    if search_word in text_list:
      text_list.concordance(search_word)

In [25]:
print("STOCKS in HAM:")
concordance(ham_list, "stocks")

STOCKS in HAM:


In [26]:
print("STOCKS in SPAM:")
concordance(spam_list, "stocks")

STOCKS in SPAM:
Displaying 3 of 3 matches:
 statements . as with many microcap stocks , todays company has additional ris
blication pertaining to investing , stocks , securities must be understood as 
ntative before deciding to trade in stocks featured within this publication . 
Displaying 4 of 4 matches:
nt opportunity drummond , small cap stocks alert newsletter must read - alert 
his email pertaining to investing , stocks , securities must be understood as 
ntative before deciding to trade in stocks featured within this email . none o
 lose money from investing in penny stocks . - - - - - - - - - - - - - - - - -
Displaying 1 of 1 matches:
fessionally not multi - level - not stocks - not real estate no cost tele - se
Displaying 1 of 1 matches:
ecializing in undervalued small cap stocks for immediate breakout erhc and exx
Displaying 2 of 2 matches:
ng their gains . select gold mining stocks are the hot flyers of the otc . his
is letter cautions that micro - cap stocks are high - risk 

In [27]:
while True:
  email = input("Enter an email (or press ENTER button to quite): ")
  if len(email) == 0:
    break
  prediction = classifier.classify(get_features(email))
  print(f"This email is likely {prediction}")

Enter an email (or press ENTER button to quite): Font Awesome logo and wordmark in a neon style    Hey there,       We wanna make sure you don't miss a lifetime discount on Font Awesome 7. The pre-order sale ends next week!   
This email is likely spam
Enter an email (or press ENTER button to quite): Dear Concern,  Please take necessary initiative to create mentioned employee's E-sender & others access.
This email is likely ham
Enter an email (or press ENTER button to quite): 
