In [52]:
import pandas as pd
import re
import email
from generate_data import generate_train_and_test_data
import pandas as pd
import math
import numpy as np

#### these are the 10 people whose writing styles we will analyze

In [112]:
#filter_list=['kay.mann@enron.com','vince.kaminski@enron.com','jeff.dasovich@enron.com',
#                 'chris.germany@enron.com','sara.shackleton@enron.com','tana.jones@enron.com',
#                'eric.bass@enron.com','matthew.lenhart@enron.com','kate.symes@enron.com','sally.beck@enron.com']
filter_list=['kay.mann@enron.com']

#### the enron emails often contain forwarded messages, or the orginal text in a reply to email. we remove all this data since we are trying to understand a particular person's writing style.
#### we also split the data into train and test. the test data is what we will try and predict on.

In [113]:
df,df_train,df_test=generate_train_and_test_data('data/emails.csv',filter_list,min_words=0)

In [114]:
all_data=df.pivot_table(index="From",values='NumWords',aggfunc="mean")
train_data=df_train.pivot_table(index="From",values='NumWords',aggfunc='mean')
test_data=df_test.pivot_table(index="From",values='NumWords',aggfunc='mean')
df_summary=all_data.join(train_data,lsuffix='All',rsuffix='Train')
df_summary=df_summary.join(test_data,lsuffix='',rsuffix='Test')
df_summary.columns=['All','Train','Test']
df_summary

Unnamed: 0_level_0,All,Train,Test
From,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
kay.mann@enron.com,35.149455,35.082358,35.752754


In [115]:
all_data=df.pivot_table(index="From",values='MessageLength',aggfunc="mean")
train_data=df_train.pivot_table(index="From",values='MessageLength',aggfunc='mean')
test_data=df_test.pivot_table(index="From",values='MessageLength',aggfunc='mean')
df_summary=all_data.join(train_data,lsuffix='All',rsuffix='Train')
df_summary=df_summary.join(test_data,lsuffix='',rsuffix='Test')
df_summary.columns=['All','Train','Test']
df_summary

Unnamed: 0_level_0,All,Train,Test
From,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
kay.mann@enron.com,202.101188,201.37939,208.591187


#### take all the text in the training set, and count the total number of words
#### then take the text for each author and count the total number of words by author
#### finally we take the n_most_frequent words 

In [116]:
from nltk.tokenize import sent_tokenize,word_tokenize
from collections import defaultdict
import nltk

#this is the most frequent count
n_most_frequent=50

#define a dictionary to hold the word count for each author
author_subcorpus_count={}
for item in filter_list:
    author_subcorpus_count[item]=defaultdict(int)


all_text=df_train['FormattedMessage'].tolist()
from_list=df_train['From'].tolist()

#word counts for the combined corpus
word_counts=defaultdict(int)

#go through the entire corpus, count words for the combined corpus and for each author
for i,text in enumerate(all_text):
    sentences=sent_tokenize(text.lower())
    for sentence in sentences:
        words=word_tokenize(sentence)
        fdist = nltk.FreqDist(words)
        for word in fdist:
            word_counts[word]+=fdist[word]
            author_subcorpus_count[from_list[i]][word]+=fdist[word]
            
#create a list of most frequent words
#also check what total word count is (to validate data)
freq_list=[]
i=0
totalWords=0
for w in word_counts:
    totalWords+=word_counts[w]
for w in sorted(word_counts, key=word_counts.get, reverse=True):
    if i<n_most_frequent: freq_list.append((w,word_counts[w]))
    i+=1
totalWords

310134

In [117]:
freq_list

[('.', 16543),
 (',', 13414),
 ('the', 12620),
 ('i', 9368),
 ('to', 8849),
 ('a', 4945),
 ('of', 4383),
 ('kay', 4313),
 ('and', 4142),
 ('you', 3851),
 ('is', 3729),
 ('it', 3291),
 ('in', 3141),
 ('for', 2963),
 ('?', 2929),
 ('we', 2605),
 ('that', 2578),
 ('be', 2572),
 ('have', 2513),
 ('this', 2489),
 ('on', 2365),
 ("n't", 2150),
 ('with', 2096),
 (')', 1933),
 ('(', 1917),
 ('thanks', 1703),
 ('will', 1594),
 ("'s", 1592),
 ('if', 1555),
 ('do', 1526),
 ('are', 1454),
 ("'m", 1392),
 ('as', 1364),
 ('me', 1360),
 ('at', 1219),
 ('or', 1215),
 ('can', 1158),
 ('--', 1149),
 ('would', 1112),
 ('so', 1081),
 ('but', 1071),
 ('from', 1036),
 ('not', 985),
 ('!', 951),
 ('know', 940),
 ('my', 934),
 ('please', 885),
 ('was', 878),
 ('get', 851),
 ('hi', 848)]

In [118]:
#aggregate total words by author
#ensure it adds up to total words by corpus
totalWordsByAuthor={}
totalWords=0
for author in author_subcorpus_count:
    totalWordsByAuthor[author]=sum(author_subcorpus_count[author][x] for x in author_subcorpus_count[author])
    totalWords+=totalWordsByAuthor[author]

#we compute the mean for the corpus 2 ways
#by corpus - so for example count "the" in the entire corpus/ total words in corpus
#or compute the prob of "the" in each author's corpus and average it
#the 2 results are not that different
frequentWordsCorpusMean={}
frequentWordsCorpusStdDev={}
for word,count in freq_list:
    frequentWordsCorpusMean[word]=(count+0.000001)/totalWords
    frequentWordsCorpusStdDev[word]=0.0


topWordsByAuthor={}
for item in author_subcorpus_count:
    topWordsByAuthor[item]={}
    for word,count in freq_list:
        wc=author_subcorpus_count[item][word]
        wp=(wc+0.000001)/totalWordsByAuthor[item]
        topWordsByAuthor[item][word]=wp
        
frequentWordsMean={}
for word,count in freq_list:
    frequentWordsMean[word]=0.0
    for author in topWordsByAuthor:
        frequentWordsMean[word]+=topWordsByAuthor[author][word]
    frequentWordsMean[word]/=len(topWordsByAuthor)

for word,count in freq_list:
    for author in topWordsByAuthor:
        diff=topWordsByAuthor[author][word]-frequentWordsCorpusMean[word]
        frequentWordsCorpusStdDev[word]+=diff*diff
    frequentWordsCorpusStdDev[word]/=len(topWordsByAuthor)
    frequentWordsCorpusStdDev[word]=math.sqrt(frequentWordsCorpusStdDev[word])
    
#print(frequentWordsCorpusMean)
print(totalWords)

310134


In [119]:
#calculate zscores
#for each author, calculate the zscore for each of the common words
zScoresByAuthor={}
for author in topWordsByAuthor:
    zScoresByAuthor[author]={}
    for word in frequentWordsCorpusMean:
        zScoresByAuthor[author][word]=(topWordsByAuthor[author][word]-frequentWordsCorpusMean[word])/frequentWordsCorpusStdDev[word]

ZeroDivisionError: float division by zero

In [None]:
#this function calculates the zscore for the test text
#it takes the text and counts the probabilities for common words
#and uses the frequentWordsCorpusMean and frequentWordsCorpusStdDev
def calc_z_score(text,frequentWordsCorpusMean,frequentWordsCorpusStdDev):
    word_counts=defaultdict(int)
    totalWords=0
    sentences=sent_tokenize(text.lower())
    for sentence in sentences:
        words=word_tokenize(sentence)
        fdist = nltk.FreqDist(words)
        for word in frequentWordsCorpusMean:
            if word in fdist:
                word_counts[word]+=fdist[word]
            else:
                word_counts[word]+=0
            totalWords+=fdist[word]
    zScores={}
    for word in word_counts:
        word_dist=(word_counts[word]+0.000001)/(totalWords+0.000001)
        zScores[word]=(word_dist-frequentWordsCorpusMean[word])/frequentWordsCorpusStdDev[word]
    return zScores

In [None]:
def find_email_match(text,frequentWordsCorpusMean,frequentWordsCorpusStdDev):
    scores={}
    min_score=1000000
    min_name=''
    zscores=calc_z_score(text,frequentWordsCorpusMean,frequentWordsCorpusStdDev)
    for author in zScoresByAuthor:
        score=0.0
        for word in zScoresByAuthor[author]:
            score+=abs(zscores[word]-zScoresByAuthor[author][word])
        score/=len(zScoresByAuthor[author])
        scores[author]=score
        if score<min_score:
            min_score=score
            min_name=author
    return min_name,min_score,scores

In [None]:
i=1000
text=df_test.iloc[i]['FormattedMessage']
name=df_test.iloc[i]['From']
print(name)
print(text)
author,min_score,scores=find_email_match(text,frequentWordsCorpusMean,frequentWordsCorpusStdDev)
print(author,min_score)
print(scores)

In [None]:
def softmax(x):
    e_x=np.exp(x-np.max(x))
    out=e_x/e_x.sum()
    return out

def cross_entropy_loss(predictions, targets, epsilon=1e-12):
    """
    Computes cross entropy between targets (encoded as one-hot vectors)
    and predictions. 
    Input: predictions (N, k) ndarray
           targets (N, k) ndarray        
    Returns: scalar
    """
    predictions = np.clip(predictions, epsilon, 1. - epsilon)
    N = predictions.shape[0]
    ce = -np.sum(targets*np.log(predictions+1e-9))/N
    return ce

def get_probs(text,encoded_classes,frequentWordsCorpusMean,frequentWordsCorpusStdDev):
    zscores=calc_z_score(text,frequentWordsCorpusMean,frequentWordsCorpusStdDev)
    returnMatrix=[0.0 for author in zScoresByAuthor]
    for author in zScoresByAuthor:
        score=0.0
        for word in zScoresByAuthor[author]:
            score+=abs(zscores[word]-zScoresByAuthor[author][word])
        score/=len(zScoresByAuthor[author])
        for i,item in enumerate(encoded_classes):
            if item==author: returnMatrix[i]=score
    returnMatrix=[-x for x in returnMatrix]
    return softmax(returnMatrix),returnMatrix

In [None]:
from sklearn.preprocessing import LabelBinarizer
enc=LabelBinarizer()
enc.fit(filter_list)
print(enc.classes_)
all_text=df_test['FormattedMessage'].tolist()
from_list=df_test['From'].tolist()
y_values=enc.transform(from_list)
y_pred=[]
for text in all_text:
    prob,blah=get_probs(text,enc.classes_,frequentWordsCorpusMean,frequentWordsCorpusStdDev)
    y_pred.append(prob)
    
y_pred=np.array(y_pred)

In [None]:
i=2
print(from_list[i])
print(all_text[i])
print(enc.inverse_transform(y_values)[i])
print(y_pred[i])
print(enc.classes_[np.argmax(y_pred[i])])

In [None]:
print(cross_entropy_loss(y_values,y_pred))