In [0]:
#Downgrading tensorflow in order to make BERT processing compatible
pip install tensorflow==1.1.0

In [0]:
# Mount Google Drive at /content/drive (Colab only); the dataset CSV read
# further down lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
#Importing Dependencies
import numpy as np
import pandas as pd
import re
import random
import email
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn import metrics 

from sklearn.model_selection import train_test_split
%tensorflow_version 1.1.0
import tensorflow as tf

In [0]:
# Load the raw e-mail table from the mounted Drive folder.
# Later cells read its 'file' and 'message' columns.
dataset = pd.read_csv('/content/drive/My Drive/ALDA_Project/emails.csv')

In [0]:
# Keep only rows whose 'file' path contains the substring 'sent'.
# This is a plain substring match, so any path containing those letters qualifies.
dataset_sent_mails = dataset[dataset['file'].str.contains('sent')]

In [0]:
# Derive the sender from each file path (everything before the last "/...sent"
# segment), store it in a new 'sender' column and drop the original 'file' column.
dataset_sent_mails = (
    dataset_sent_mails
    .assign(sender=[
        re.search("(.*)/.*sent", path).group(1)
        for path in dataset_sent_mails["file"]
    ])
    .drop("file", axis=1)
)
# Show the 15 senders with the most sent mails.
print(dataset_sent_mails["sender"].value_counts().head(15))


In [0]:
def email_preprocessing(email_message):
    """Parse a raw e-mail string into a flat dict of headers plus body text.

    Args:
        email_message: The complete raw message (headers and body) as a string.

    Returns:
        A dict with one entry per header name (value as returned by
        ``msg[name]``) and a ``"content"`` entry holding the payloads of all
        ``text/plain`` parts concatenated without a separator.
    """
    parsed = email.message_from_string(email_message)

    # Collect only the plain-text parts; other content types are ignored.
    plain_parts = [
        section.get_payload()
        for section in parsed.walk()
        if section.get_content_type() == 'text/plain'
    ]

    fields = {header: parsed[header] for header in parsed.keys()}
    fields["content"] = ''.join(plain_parts)
    return fields

# Cached English stop-word set. It is filled on first use so the NLTK corpus is
# read once rather than once per e-mail.
_ENGLISH_STOPWORDS = None


def content_preprocessing(content):
    """Reduce raw text to a space-separated string of lower-case content words.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, and English stop words (from
    ``nltk.corpus.stopwords``) are removed.

    Args:
        content: The text to clean, as a string.

    Returns:
        The remaining words joined by single spaces ("" if none remain).
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))

    letters_only = re.sub("[^a-zA-Z]", " ", content)
    words = letters_only.lower().split()
    return ' '.join(w for w in words if w not in _ENGLISH_STOPWORDS)

In [0]:
# Parse every sent message into a row of headers plus body text.
# This reads from dataset_sent_mails directly: the sent_user_dataset alias is only
# created in a later cell, so using it here raises NameError on a fresh kernel.
final_data = pd.DataFrame(list(map(email_preprocessing, dataset_sent_mails.message)))

In [0]:
# Classification over all users: give every sender an integer class id,
# numbered in the order returned by value_counts() (most sent mails first).
users = dataset_sent_mails["sender"].value_counts().index.values
mapping = {user: class_id for class_id, user in enumerate(users)}

# Parse each raw message into a row of headers plus body text.
sent_user_dataset = dataset_sent_mails
final_data = pd.DataFrame(
    [email_preprocessing(raw_message) for raw_message in sent_user_dataset.message]
)

In [0]:
# Join subject and body with a space, clean the text, and keep it as the single
# 'content' column of the model table.
final_data = pd.DataFrame(
    [
        content_preprocessing(text)
        for text in final_data[['Subject', 'content']].apply(lambda row: ' '.join(row), axis=1)
    ],
    columns=["content"],
)
# Attach each row's sender and translate the name into its integer class id.
final_data = (
    final_data
    .assign(user_number=sent_user_dataset["sender"].values)
    .replace({'user_number': mapping})
)


In [0]:
# Download the pretrained BERT checkpoint archive (uncased_L-12_H-768_A-12)
# into the current working directory; a later cell unzips it.
# Google's pretrained BERT model is used for e-mail classification.
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
# Fetch the BERT reference modules that the next cell imports by name.
# NOTE(review): these come from the repository's master branch, so they are not
# pinned to a specific revision.
!wget https://raw.githubusercontent.com/google-research/bert/master/modeling.py 
!wget https://raw.githubusercontent.com/google-research/bert/master/optimization.py 
!wget https://raw.githubusercontent.com/google-research/bert/master/run_classifier.py 
!wget https://raw.githubusercontent.com/google-research/bert/master/tokenization.py

In [0]:
#Importing BERT dependencies
import modeling
import optimization
import run_classifier
import tokenization

In [0]:
import zipfile
import os
import tensorflow as tf
import datetime

In [0]:
#Splitting the dataset
X = final_data.content.values
y = final_data.user_number.values
# Fixed seed so the train/test membership, and therefore the reported accuracy,
# is the same after every kernel restart.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED)


#Function to generate data for the BERT model
def create_examples(lines, set_type, labels=None):
    """Wrap raw texts in ``run_classifier.InputExample`` objects.

    Args:
        lines: Iterable of texts; each becomes ``text_a`` of one example.
        set_type: Name of the split. It is used verbatim as the ``guid`` of
            every example, and the value ``'train'`` selects labelled mode.
        labels: Iterable of labels aligned with ``lines``. Only read when
            ``set_type`` is ``'train'``; each label is converted with ``str``.

    Returns:
        A list of ``InputExample`` objects. For any split other than
        ``'train'`` every example carries the placeholder label ``'0'``.
    """
    guid = f'{set_type}'
    if guid == 'train':
        text_label_pairs = ((text, str(tag)) for text, tag in zip(lines, labels))
    else:
        text_label_pairs = ((text, '0') for text in lines)

    return [
        run_classifier.InputExample(guid=guid, text_a=text, text_b=None, label=tag)
        for text, tag in text_label_pairs
    ]


# Unpack the downloaded BERT archive into `folder`; the same folder name is
# reused below to build the pretrained-model and output paths.
folder = 'model_folder'
with zipfile.ZipFile("uncased_L-12_H-768_A-12.zip","r") as zip_ref:
    zip_ref.extractall(folder)

In [0]:
#Model Specification
BERT_MODEL = 'uncased_L-12_H-768_A-12'
# Derive the directory from BERT_MODEL so that changing the model name in one
# place keeps the path and the lower-casing flag (computed from BERT_MODEL) in sync.
BERT_PRETRAINED_DIR = f'{folder}/{BERT_MODEL}'
# Checkpoints written during fine-tuning go here.
OUTPUT_DIR = f'{folder}/outputs'
print(f'>> Model output directory: {OUTPUT_DIR}')
print(f'>>  BERT pretrained directory: {BERT_PRETRAINED_DIR}')

In [0]:
# Model Hyper Parameters
TRAIN_BATCH_SIZE = 64
EVAL_BATCH_SIZE = 16
# BERT fine-tuning needs a small Adam learning rate; the BERT authors recommend
# values in the 2e-5 to 5e-5 range. The previous 1e-2 is several hundred times
# larger and destroys the pretrained weights instead of adapting them.
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 5.0
# Fraction of the training steps used for learning-rate warm-up.
WARMUP_PROPORTION = 0.1
# Maximum sequence length handed to the BERT feature converter.
MAX_SEQ_LENGTH = 100
# Model configs
SAVE_CHECKPOINTS_STEPS = 100000
ITERATIONS_PER_LOOP = 100000
NUM_TPU_CORES = 8
VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')
CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json')
INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt')
# The uncased checkpoints expect lower-cased input text.
DO_LOWER_CASE = BERT_MODEL.startswith('uncased')

# One label per sender. `mapping` assigns the ids 0..len(mapping)-1 and
# create_examples() stringifies them, so build the label list from the data
# rather than from a hard-coded class count.
label_list = [str(num) for num in range(len(mapping))]
tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)
train_examples = create_examples(X_train, 'train', labels=y_train)

# No TPU cluster is attached; the estimator below is built with use_tpu=False.
tpu_cluster_resolver = None 
# Run configuration: where checkpoints are written, how often, and the TPU
# settings that the TPU-style config object takes.
run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    model_dir=OUTPUT_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=ITERATIONS_PER_LOOP,
        num_shards=NUM_TPU_CORES,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

# Total training steps = (number of examples / batch size) * epochs, truncated to int.
num_train_steps = int(
    len(train_examples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
# Warm-up steps are the WARMUP_PROPORTION fraction of the training steps.
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

# Build the BERT classifier model function with one output class per entry in label_list.
model_fn = run_classifier.model_fn_builder(
    bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),
    num_labels=len(label_list),
    init_checkpoint=INIT_CHECKPOINT,
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=False, #If False training will fall on CPU or GPU, depending on what is available  
    use_one_hot_embeddings=True)

# Estimator used for both training and prediction in the cells that follow.
# NOTE(review): no predict_batch_size is passed here — confirm that is intended.
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=False, #If False training will fall on CPU or GPU, depending on what is available 
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE)

In [0]:
#Model training
# Tokenise the training examples and pad/truncate them to MAX_SEQ_LENGTH.
train_features = run_classifier.convert_examples_to_features(
    train_examples,
    label_list,
    MAX_SEQ_LENGTH,
    tokenizer,
)

# Report the run parameters before training starts.
print(f'>> Started training at {datetime.datetime.now()} ')
print(f'  Num examples = {len(train_examples)}')
print(f'  Batch size = {TRAIN_BATCH_SIZE}')
tf.logging.info("  Num steps = %d", num_train_steps)

# Input function that feeds the in-memory training features to the estimator.
train_input_fn = run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=True,
)


In [0]:
# Force a garbage-collection pass before the training cell runs.
import gc
gc.collect()

In [0]:
# Fine-tune with num_train_steps as the step limit, then print the finish time.
estimator.train(
    input_fn=train_input_fn,
    max_steps=num_train_steps,
)
print(f'>> Finished training at {datetime.datetime.now()}')

In [0]:
#Function for creating input closure to be passed to TPUEstimator
def input_fn_builder(features, seq_length, is_training, drop_remainder, batch_size=500):
    """Build an ``input_fn`` closure over in-memory BERT features.

    Args:
        features: Sized iterable of objects exposing ``input_ids``,
            ``input_mask``, ``segment_ids`` and ``label_id``.
        seq_length: Length of each id/mask/segment sequence; used as the second
            dimension of the constant tensors.
        is_training: When true, the dataset is repeated indefinitely and
            shuffled with a buffer of 100 elements.
        drop_remainder: Forwarded to ``Dataset.batch``.
        batch_size: Batch size used when the estimator does not supply
            ``params["batch_size"]``. Defaults to 500, the value that was
            previously hard-coded.

    Returns:
        A callable ``input_fn(params)`` that returns the batched
        ``tf.data.Dataset``.
    """
    all_input_ids = []
    all_input_mask = []
    all_segment_ids = []
    all_label_ids = []

    for feature in features:
        all_input_ids.append(feature.input_ids)
        all_input_mask.append(feature.input_mask)
        all_segment_ids.append(feature.segment_ids)
        all_label_ids.append(feature.label_id)

    #Actual input function
    def input_fn(params):
        # Prefer the batch size the estimator passes in `params`; fall back to
        # the builder's default when it provides none (for example, prediction
        # without a configured predict batch size).
        effective_batch_size = (params or {}).get("batch_size", batch_size)

        num_examples = len(features)

        d = tf.data.Dataset.from_tensor_slices({
            "input_ids":
                tf.constant(
                    all_input_ids,
                    shape=[num_examples, seq_length],
                    dtype=tf.int32),
            "input_mask":
                tf.constant(
                    all_input_mask,
                    shape=[num_examples, seq_length],
                    dtype=tf.int32),
            "segment_ids":
                tf.constant(
                    all_segment_ids,
                    shape=[num_examples, seq_length],
                    dtype=tf.int32),
            "label_ids":
                tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32),
        })

        if is_training:
            d = d.repeat()
            d = d.shuffle(buffer_size=100)

        d = d.batch(batch_size=effective_batch_size, drop_remainder=drop_remainder)
        return d

    return input_fn

In [0]:
#Predicting the test dataset
# Use X_test, the held-out split from train_test_split. The name used before
# (X_test_temp) is never defined, and accuracy is scored against y_test, so the
# predictions must be made on the matching X_test rows.
predict_examples = create_examples(X_test, 'test')
predict_features = run_classifier.convert_examples_to_features(
    predict_examples, label_list, MAX_SEQ_LENGTH, tokenizer)

# Not training: no repeat/shuffle, and the final partial batch is kept so every
# test row gets a prediction.
predict_input_fn = input_fn_builder(
    features=predict_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

# The predictions are consumed by iterating over `result` in the next cell.
result = estimator.predict(input_fn=predict_input_fn)

In [0]:
# For every prediction take the index of the highest class probability.
preds = [np.argmax(prediction['probabilities']) for prediction in result]

In [0]:
#Calculating the accuracy
from sklearn.metrics import accuracy_score

# Fraction of test e-mails whose predicted sender id equals the true one.
print(f"Accuracy of BERT is: {accuracy_score(y_test, preds)}")