<a href="https://colab.research.google.com/github/samclein/Collab-Projects/blob/master/Email-Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a><img src="https://i.ibb.co/DRrztwp/Digital-Studio-logo-1.png" alt="Digital-Studio-logo-1" border="0"></a>
# **iXBRL Email Classification Tool**
When the DA&A iXBRL team sends quotes to clients offering its services, it receives hundreds of emails back, each either accepting or rejecting the quote.

Digital Studio is developing a tool that will automatically read these emails and decide if the client wants our services or not. This tool gets better with each new email it reads. Here are a few examples:

**Positive** i.e. the client wants the service:
> "Yes, I will need your services again this year. Nadine can provide the financials to you when needed. Let me know if you have any questions for me."

> "Thank you for getting back to me, I can confirm we are happy to move forward with the filing. Please let me know if you require any further authorisation from us in order to proceed. Kind regards."



**Negative** i.e. the client does not want the service:
> "I am afraid that the quote is just too much and we have an alternative supplier who can do it for much less and we won’t need PwC’s services on this. Kind regards."

> "Thank you for your email and the information. As you might know, Example Limited has changed auditors recently. We will therefore make sure that our new auditors take this into consideration."

**We need your help teaching our robot.**

All you have to do is type your own version of the above emails into the window below. Be as original as you like, but remember to stay true to the context. 

Once your responses are ready, start the machine (see below) and then hit the **Click to Classify!** button that will appear at the bottom of your screen.

Our robot will try to guess whether you (the client) accept our services or not!

In [None]:
#@title On your keyboard press **Control + F9** to start the machine!
%%capture
#{ display-mode: "form" }
import os
import csv
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import metrics
from time import time
from sklearn.utils.extmath import density
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
!wget https://www.dropbox.com/s/dhwscexnysf2rfo/train_positive.txt
!wget https://www.dropbox.com/s/qt4l9dg1a73eccj/test_positive.txt
!wget https://www.dropbox.com/s/f1jlg7f2o9iew2y/real_negative.txt
!wget https://www.dropbox.com/s/3zt9n8mjobnmhfy/gen_neg.txt
# Read each downloaded corpus fully into memory. Context managers make sure
# the file handles are closed instead of being left for the garbage collector.
# latin-1 maps every byte to a character, so these reads cannot fail on
# decoding.
with open("train_positive.txt", "r", encoding='latin-1') as corpus_file:
    email_pos = corpus_file.read()
with open("gen_neg.txt", "r", encoding='latin-1') as corpus_file:
    email_neg = corpus_file.read()
with open("test_positive.txt", "r", encoding='latin-1') as corpus_file:
    email_pos_test = corpus_file.read()
with open("real_negative.txt", "r", encoding='latin-1') as corpus_file:
    email_neg_test = corpus_file.read()

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to lemmatize ``word`` with.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    letter of that tag picks the WordNet category (J -> adjective, N -> noun,
    V -> verb, R -> adverb). Any other tag falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are treated as nouns.
    return wordnet.NOUN

# Single WordNet lemmatizer instance, reused by lemmatize_corpus for every word.
lemmatizer = WordNetLemmatizer()


def lemmatize_corpus(input_corpus, tag):
    """Lemmatize a corpus line by line and pair every line with a class tag.

    Each newline-separated line of ``input_corpus`` is treated as one email.
    The line is tokenized, every token is lemmatized using the part of speech
    reported by ``get_wordnet_pos``, and only purely alphabetic lemmas are
    kept.

    Args:
        input_corpus: Text holding one email per line.
        tag: Label attached to every email, e.g. ``"positive"``.

    Returns:
        A list of ``(lemmatized_email, tag)`` tuples, one per line. Every kept
        lemma is preceded by a single space, so a non-empty result starts with
        a space; a line without alphabetic lemmas yields ``""``.
    """
    output_corpus = []
    # NOTE: split('\n') also produces an entry for blank lines (including the
    # empty string after a trailing newline); those become ("", tag) records.
    for email in input_corpus.split('\n'):
        lemmas = (
            lemmatizer.lemmatize(word, get_wordnet_pos(word))
            for word in nltk.word_tokenize(email)
        )
        # join() builds the string in one pass instead of repeated "+=".
        lemmatized_email = "".join(
            " " + lemma for lemma in lemmas if lemma.isalpha()
        )
        output_corpus.append((lemmatized_email, tag))
    return output_corpus

# Lemmatize the four corpora into lists of (text, tag) tuples.
# Note the naming: the "*_test_*" negatives come from real_negative.txt and
# the plain negatives from gen_neg.txt.
pos_emails_lemma = lemmatize_corpus(email_pos, "positive")
neg_emails_lemma = lemmatize_corpus(email_neg, "negative")
pos_emails_test_lemma = lemmatize_corpus(email_pos_test, "positive")
neg_emails_test_lemma = lemmatize_corpus(email_neg_test, "negative")

# results_out_csv is not used anywhere in the visible code.
results_out_csv = []
# gp: fraction of the gen_neg.txt negatives used for training.
# rp: fraction of the real_negative.txt negatives used for training.
gp = 0.20
rp = 0.8

size_real = len(neg_emails_test_lemma)
num_real = int(rp*size_real)
size_gen = len(neg_emails_lemma)
num_gen = int(gp*size_gen)
# Negative training set: the first rp of the real negatives plus the first gp
# of the generated negatives. The remaining real negatives form the test set.
neg_train = neg_emails_test_lemma[:num_real] +neg_emails_lemma[:num_gen]
neg_test = neg_emails_test_lemma[num_real:]
# Converting the tuple lists to arrays gives two columns: 0 = text, 1 = tag.
emails_array = np.array(pos_emails_lemma + neg_train)
emails_test_array = np.array(pos_emails_test_lemma + neg_test)
text_train, text_test, label_train, label_test = emails_array[:,0], emails_test_array[:,0], emails_array[:,1], emails_test_array[:,1]

# TF-IDF features with sublinear term-frequency scaling; English stop words
# and terms found in more than half of the documents are ignored.
vectorizer = TfidfVectorizer(sublinear_tf = True, max_df = 0.5, stop_words = 'english')
text_train_list = text_train.tolist()
text_test_list = text_test.tolist()
# Encode the string tags as integers: 1 for 'positive', 0 for everything else.
label_train_list = [1 if x == 'positive' else 0 for x in label_train.tolist()]
label_test_list = [1 if x == 'positive' else 0 for x in label_test.tolist()]
# NOTE(review): the vectorizer is fitted on train and test text together, so
# the vocabulary and IDF weights see the test emails. Fit on the training text
# only if the test split is ever used for an unbiased evaluation.
text_all = text_train_list + text_test_list
text_all_v = vectorizer.fit_transform(text_all)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older versions.
if hasattr(vectorizer, "get_feature_names_out"):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()
# Rows were stacked train-first, so slice the matrix back into the two sets.
text_train_v = text_all_v[:len(label_train_list), :]
text_test_v = text_all_v[len(label_train_list):, :]

saved_vectorizer = vectorizer

# Use Chi^2 test to remove features independent across classes?

# k is the full column count (the multiplier is 1), so the selector is asked
# to keep every feature; lower the multiplier to actually prune the vocabulary.
ch2 = SelectKBest(chi2, k = int(np.floor(text_train_v.shape[1]*1)))
text_train_v = ch2.fit_transform(text_train_v, label_train_list)
text_test_v = ch2.transform(text_test_v)
# Keep the feature-name list aligned with the columns the selector retained.
features = [features[i] for i in ch2.get_support(indices=True)]
features = np.asarray(features)

#for alp in np.arange(0.1, 1.1, 0.1):
# Smoothing parameter passed to the classifier below.
alp = 0.1
# Despite the variable name, this is a Complement Naive Bayes classifier.
cnn = ComplementNB(alpha=alp)
cnn.fit(text_train_v, label_train_list)

In [None]:
#@title Type your custom emails: { run: "auto" }
Positive = "Yes that sounds good lets start." #@param {type:"string"}
Negative = "This service is outrageously priced. I have zero interest in what you are offering." #@param {type:"string"}
# Persist the two form inputs to disk and read them back for classification.
# Both directions use UTF-8 explicitly: writing with the platform default and
# reading as latin-1 corrupts non-ASCII characters (e.g. curly apostrophes).
with open("custom_pos.txt", "w", encoding="utf-8") as f:
    f.write(Positive)
with open("custom_neg.txt", "w", encoding="utf-8") as f:
    f.write(Negative)
with open("custom_pos.txt", "r", encoding="utf-8") as f:
    cust_pos = f.read()
with open("custom_neg.txt", "r", encoding="utf-8") as f:
    cust_neg = f.read()

#@markdown ### Click the 'Click to Classify!' button when it appears to see the machine's prediction

import csv
import ipywidgets as widgets
from IPython.display import display, clear_output

# Button that triggers classification, and the output widget its results are
# rendered into.
button = widgets.Button(description="Click to Classify!")
output = widgets.Output()

def send_output():
    """Email ``btm_output.csv`` as an attachment via Gmail's SMTP server.

    The message is sent from and to the same mailbox. The attachment's MIME
    type is guessed from the file name; text types are attached as text,
    anything else as a base64-encoded binary payload.

    Raises:
        OSError: If ``btm_output.csv`` cannot be opened or the connection to
            the SMTP server fails.
        smtplib.SMTPException: If the TLS handshake, login or send fails.
    """
    import mimetypes
    import smtplib
    from email import encoders
    from email.mime.base import MIMEBase
    from email.mime.multipart import MIMEMultipart
    from email.mime.text import MIMEText

    emailfrom = "xbrlbot@gmail.com"
    emailto = "xbrlbot@gmail.com"
    fileToSend = "btm_output.csv"
    username = "xbrlbot@gmail.com"
    # NOTE(review): credential hard-coded in source; load it from a secret
    # store or environment variable rather than committing it to the notebook.
    password = "*****"

    msg = MIMEMultipart()
    msg["From"] = emailfrom
    msg["To"] = emailto
    msg["Subject"] = "Beat The Machine Output"
    msg.preamble = "See attached; output from 'Beat the Machine'."

    ctype, encoding = mimetypes.guess_type(fileToSend)
    # Unknown or compressed content is sent as a generic binary stream.
    if ctype is None or encoding is not None:
        ctype = "application/octet-stream"

    maintype, subtype = ctype.split("/", 1)

    # Context managers close the file even if reading or encoding fails.
    if maintype == "text":
        with open(fileToSend) as fp:
            # Note: we should handle calculating the charset
            attachment = MIMEText(fp.read(), _subtype=subtype)
    else:
        with open(fileToSend, "rb") as fp:
            attachment = MIMEBase(maintype, subtype)
            attachment.set_payload(fp.read())
        encoders.encode_base64(attachment)
    attachment.add_header("Content-Disposition", "attachment", filename=fileToSend)
    msg.attach(attachment)

    # Leaving the with-block sends QUIT and closes the socket, including when
    # starttls/login/sendmail raises.
    with smtplib.SMTP("smtp.gmail.com:587") as server:
        server.starttls()
        server.login(username, password)
        server.sendmail(emailfrom, emailto, msg.as_string())

def on_button_clicked(b):
    """Classify the two custom emails, print the verdicts and save them as CSV.

    The texts read from the custom email files are lemmatized, vectorized with
    the fitted TF-IDF vectorizer, passed through the fitted chi2 selector and
    classified. One line per email is printed into the output widget and the
    same results are written to ``btm_output.csv``.

    Args:
        b: The clicked button, as passed by ipywidgets; not used.
    """
    # Everything printed below is rendered inside the output widget.
    with output:
        labelled_emails = (lemmatize_corpus(cust_pos, "positive")
                           + lemmatize_corpus(cust_neg, "negative"))
        custom_all_array = np.array(labelled_emails)

        custom_test_list = custom_all_array[:, 0].tolist()
        custom_labels_list = [
            1 if x == 'positive' else 0
            for x in custom_all_array[:, 1].tolist()
        ]

        # The classifier was fitted on the output of the chi2 selector, so the
        # same selector must be applied here to keep the columns aligned.
        custom_test_v = ch2.transform(vectorizer.transform(custom_test_list))

        pred = cnn.predict(custom_test_v)
        # Rows are actual classes, columns predicted ones (0 = negative,
        # 1 = positive); labels is pinned so the matrix is always 2x2.
        c_mat = metrics.confusion_matrix(custom_labels_list, pred, labels=[0, 1])

        results = [['Input', 'Actual', 'Prediction', 'Correct']]

        # (matrix cell, input text, actual, prediction, message suffix, correct)
        outcomes = [
            (c_mat[1, 1], Positive, 'Positive', 'Positive', 'Correct!', 'Yes'),
            (c_mat[1, 0], Positive, 'Positive', 'Negative', 'Incorrect :(', 'No'),
            (c_mat[0, 1], Negative, 'Negative', 'Positive', 'Incorrect :(', 'No'),
            (c_mat[0, 0], Negative, 'Negative', 'Negative', 'Correct!', 'Yes'),
        ]

        clear_output()
        for count, text, actual, prediction, suffix, correct in outcomes:
            # With one email per class, exactly one cell per row equals 1.
            if count == 1:
                print('[Prediction]' + prediction + ': "' + text + '" ' + suffix)
                results.append([text, actual, prediction, correct])

        # newline='' lets the csv module control line endings itself; the
        # with-block closes the file, so no explicit close() is needed.
        with open('btm_output.csv', 'w', newline='') as csvfile:
            csv.writer(csvfile).writerows(results)
        # Emailing the results is currently disabled.
        # send_output()

# Register the click handler, then display the button followed by the output
# widget.
button.on_click(on_button_clicked)
display(button, output)
