In [None]:
import random

def read_file(path):
    """
    open the file at `path` in text mode and return its whole content as one string
    """
    with open(path, 'r') as source:
        contents = source.read()
    return contents

def load_data(data_path="data/SpamDetectionData.txt"):
    """
    load the labelled emails from the dataset file

    Each line of the file is one sample in the form "<tag>,<email text>",
    where <tag> is either "Spam" or "Ham",
    e.g. Spam,<p>His honeyed and land....

    Args:
        data_path: location of the dataset file; defaults to the path the
            notebook has always used.

    Returns:
        (features, labels): two parallel lists. `features` holds the email
        text of every valid sample, `labels` holds 1 for spam and 0 for ham.
    """
    label_by_tag = {'Spam': 1, 'Ham': 0}

    features = []
    labels = []
    # split the data into lines, each line is a single sample
    for line in read_file(data_path).split('\n'):
        # Split on the first comma only, so commas inside the email text are
        # kept. Requiring the comma means a line that merely starts with the
        # letters "Spam"/"Ham" (e.g. "Hamster ...") is no longer mistaken
        # for a sample.
        tag, comma, text = line.partition(',')
        if comma and tag in label_by_tag:
            labels.append(label_by_tag[tag])
            features.append(text)
        # anything else (markers, empty lines, malformed lines) is ignored

    return features, labels
    
features, labels = load_data()

print("total no. of samples: {}".format(len(labels)))
print("total no. of spam samples: {}".format(labels.count(1)))
print("total no. of ham samples: {}".format(labels.count(0)))

print("\nPrint a random sample for inspection:")
if labels:
    # randrange excludes its upper bound, so the index is always valid
    # (randint includes it and could pick len(labels), one past the end)
    random_idx = random.randrange(len(labels))
    print("example feature: {}".format(features[random_idx]))
    print("example label: {} ({})".format(labels[random_idx], 'spam' if labels[random_idx] else 'ham'))
else:
    # nothing was parsed from the data file, so there is no sample to show
    print("no samples loaded")

In [None]:
from sklearn.model_selection import train_test_split

# read the dataset again so this cell has its own copy of features and labels
features, labels = load_data()

# hold out 20% of the samples for testing (test_size=0.2);
# the fixed random_state makes the split the same on every run
(features_train,
 features_test,
 labels_train,
 labels_test) = train_test_split(features, labels, test_size=0.2, random_state=42)

# report the size of each part of the split
print(f"no. of train features: {len(features_train)}")
print(f"no. of train labels: {len(labels_train)}")
print(f"no. of test features: {len(features_test)}")
print(f"no. of test labels: {len(labels_test)}")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the raw email text into TF-IDF feature matrices.
# TfidfVectorizer behaves like CountVectorizer followed by TfidfTransformer.
# Options used: the input is the actual text (input='content'), text is
# lower-cased before tokenizing, and English stop words are removed.
vectorizer = TfidfVectorizer(input='content', lowercase=True, stop_words='english')

# fit on the training emails only, then apply the fitted vectorizer to the test emails
features_train_transformed = vectorizer.fit_transform(features_train)
features_test_transformed = vectorizer.transform(features_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB
import pickle

def save(vectorizer, classifier):
    '''
    pickle the (vectorizer, classifier) pair into model.pkl in the working directory
    '''
    bundle = (vectorizer, classifier)
    with open('model.pkl', 'wb') as target:
        pickle.dump(bundle, target)
        
def load():
    '''
    read model.pkl from the working directory and return the
    (vectorizer, classifier) pair stored in it

    note: unpickling runs code embedded in the file, so only load a
    model.pkl that you created yourself
    '''
    with open('model.pkl', 'rb') as source:
        bundle = pickle.load(source)
    vectorizer, classifier = bundle
    return vectorizer, classifier

# fit a multinomial naive Bayes model on the TF-IDF training matrix
# (fit returns the estimator itself, so construction and training can be chained)
classifier = MultinomialNB().fit(features_train_transformed, labels_train)

# persist the vectorizer together with the trained model
save(vectorizer, classifier)

# report the accuracy on the held-out test set as a percentage
print(f"classifier accuracy {classifier.score(features_test_transformed, labels_test) * 100:.2f}%")

In [None]:
import numpy as np
from sklearn import metrics
# predict the test emails and summarise the result as a macro-averaged F1 score
prediction = classifier.predict(features_test_transformed)
fscore = metrics.f1_score(y_true=labels_test, y_pred=prediction, average='macro')
print(f"F score {fscore:.2f}")

In [None]:
from __future__ import print_function

import os.path
import base64
import re

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

# OAuth scopes requested when authorising against Gmail; main() passes this list
# to both the stored-token loader and the interactive login flow.
# Only the gmail.readonly scope is requested.
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

In [None]:
def _header_value(headers, name):
    """Return the value of the last header called `name`, or None when there is none."""
    value = None
    for header in headers:
        if header.get('name') == name:
            value = header.get('value')
    return value


def _decode_body(payload):
    """Return the message body as text, or None when the payload has no inline body data."""
    parts = payload.get('parts')
    # Use the first part when the payload has parts; otherwise read the body
    # stored directly on the payload (messages without parts were previously dropped).
    part = parts[0] if parts else payload
    data = part.get('body', {}).get('data')
    if not data:
        return None
    # The data uses the URL-safe base64 alphabet ("-" and "_" instead of "+" and "/").
    # Re-add any stripped padding so the decoder accepts it.
    raw = base64.urlsafe_b64decode(data + '=' * (-len(data) % 4))
    text = raw.decode('utf-8')
    # Flatten line breaks and the right single quotation mark (U+2019) to spaces.
    # The text is already decoded here, so the character itself must be matched,
    # not its UTF-8 bytes \xe2\x80\x99 (which would strip unrelated characters such as "â").
    return re.sub(r'[\r\n\u2019]', ' ', text)


def main():
    """Fetch the user's Gmail messages, print them, and return their body texts.

    Credentials are read from token.json when that file exists. If they are
    missing or invalid they are refreshed, or obtained through the local-server
    login flow configured by credentials.json, and then written back to
    token.json.

    Returns:
        list of str: the decoded body text of every message that could be
        read, in the order the API listed them. The list is empty when there
        are no messages; if a Gmail API call fails, the error is printed and
        the texts collected so far are returned.
    """
    creds = None
    # The file token.json stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.json', 'w') as token:
            token.write(creds.to_json())

    email_texts = []
    try:
        service = build('gmail', 'v1', credentials=creds)
        result = service.users().messages().list(userId='me').execute()

        # .get() yields None when the response has no 'messages' key, so fall
        # back to an empty list instead of iterating over None.
        for msg in result.get('messages') or []:
            txt = service.users().messages().get(userId='me', id=msg['id']).execute()
            try:
                payload = txt['payload']
                headers = payload['headers']
                # Looked up per message so a value from an earlier message is never reused.
                subject = _header_value(headers, 'Subject')
                sender = _header_value(headers, 'From')
                text = _decode_body(payload)
            except (KeyError, TypeError, ValueError) as error:
                # Best effort: a malformed or undecodable message is reported
                # and skipped rather than aborting the whole run.
                print(f"Skipping message {msg['id']}: {error!r}")
                continue

            if text is None:
                print(f"Skipping message {msg['id']}: no inline body data")
                continue

            print("Subject: ", subject)
            print("From: ", sender)
            print("Message: ", text)
            print('\n')
            email_texts.append(text)

    except HttpError as error:
        # TODO(developer) - Handle errors from gmail API.
        print(f'An error occurred: {error}')

    return email_texts

In [None]:
# Reload the vectorizer and model that were saved to model.pkl.
# NOTE(review): `classifer` is misspelled; the name is kept unchanged in case
# other cells refer to it.
vectorizer, classifer = load()

print('\nPerform a test')
# main() supplies the emails to classify. To test a hand-written email instead,
# assign a string (or a list of strings) to email_input here.
email_input = main() or []

# vectorizer.transform expects a collection of documents, so wrap a single string
if isinstance(email_input, str):
    email_input = [email_input]

if email_input:
    email_input_transformed = vectorizer.transform(email_input)
    prediction = classifer.predict(email_input_transformed)

    # predict returns one label per email, so report each email with its own verdict
    for email_text, label in zip(email_input, prediction):
        print('EMAIL:', email_text)
        print('The email is', 'SPAM' if label else 'HAM')
else:
    # main() returned nothing usable; there is nothing to vectorize
    print('No emails to classify')