BECCA Classifier
by Shannon Hamilton, Shrestha Mohanty, Steve Trush<br>
for INFO 256, Fall 2016, taught by Prof. Marti Hearst
<pre>
Please ensure you run this notebook in a folder with the following files:
    emailgraph.py (depends on Stanford NER package - update this file with parameters for your NER installation!)
        - also install py2neo
    emotion.py (source: Clement Michard (c) 2015 https://github.com/clemtoy/WNAffect)
    wnaffect.py (source: Clement Michard (c) 2015 https://github.com/clemtoy/WNAffect)
 Folders:
    hillary-clinton-emails
    wordnet-1.6  (see https://wordnet.princeton.edu/wordnet/download/)
    wn-domains-3.2 (see http://wndomains.fbk.eu/wnaffect.html)

Ensure you have installed the following packages:
nltk\vader_lexicon (run nltk.download())

Install Neo4j (https://neo4j.com/download/community-edition/) and run the graph database server.
</pre>

Much of our strategy for classifying emotional tone <br>
leans on the framework described in the following paper: <br>
<b>Identifying Emotional Expressions, Intensities and Sentence level
Emotion Tags using a Supervised Framework*</b><br>
Dipankar Das and Sivaji Bandyopadhyay
https://pdfs.semanticscholar.org/02e1/cd141356cd3ea072179f9e9319f28d013061.pdf


In [1]:
#imports - the basics
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import string
import re
import math

#sklearn... commented out were experiments
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.base import TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import precision_recall_curve
#from sklearn.linear_model import SGDClassifier
#from sklearn.multiclass import OneVsOneClassifier
#from sklearn.feature_selection import SelectFromModel
#from sklearn.model_selection import GridSearchCV
#from sklearn.linear_model import LogisticRegression
#from sklearn.naive_bayes import MultinomialNB
#from sklearn.feature_selection import RFE
#from sklearn.ensemble import RandomForestClassifier, VotingClassifier

#nltk
import nltk
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer 
from nltk.stem import SnowballStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#pandas
import pandas as pd
from pandas import DataFrame

#WordNet-Affect
from wnaffect import WNAffect

In [2]:
%load_ext autoreload
%autoreload 2
#EmailGraph - written by us!
import emailgraph
from emailgraph import EmailGraph

## Get Emails! 

In [3]:
#read_file is a helper function to get the ',' delimited CSV into a data frame
def read_file(filename):
    #get the file
    df = pd.read_csv(filename, low_memory=False, delimiter = ',', encoding="ISO-8859-1")
    #filter the null data
    filtered_data = df["RawText"].notnull()
    df_narrative = df[filtered_data]
    return df_narrative


In [4]:
#get_all_tagged_emails (consolidates all tagged emails into on frame)
#emails were tagged in 6 bunches to ensure coder reliability
def get_all_tagged_emails():
    print('Getting Email files...')
    df1 = read_file('.\\hillary-clinton-emails\\tagged-mail\\email_out1.csv')
    df2 = read_file('.\\hillary-clinton-emails\\tagged-mail\\email_out2.csv')
    df3 = read_file('.\\hillary-clinton-emails\\tagged-mail\\email_out3.csv')
    df4 = read_file('.\\hillary-clinton-emails\\tagged-mail\\email_out2set2.csv')
    df5 = read_file('.\\hillary-clinton-emails\\tagged-mail\\email_out3set2.csv')
    df6 = read_file('.\\hillary-clinton-emails\\tagged-mail\\email_out1set2.csv')
    
    frames = [df1, df2, df3, df4, df5, df6]
    total_df = pd.concat(frames)
    total_df.reset_index(drop=True, inplace=True)

    #randomize the emails
    random_index = np.random.permutation(total_df.index)
    df_narrative_shuffled = total_df.ix[random_index]
    df_narrative_shuffled.reset_index(drop=True, inplace=True)

    print("Total tagged emails: "+str(df_narrative_shuffled.shape[0]))
    
    #make sure any emotion emails come up as 1
    def normalize(x):
        if x > 0:
            return 1
        return 0
    df_narrative_shuffled['Label'] = df_narrative_shuffled['Label'].apply(normalize)
    
    return df_narrative_shuffled

### Create Train, Dev and Test Sets

In [5]:
#create the data sets (specify the complete frame and percentages of data for train, dev, and test sets)
def create_sets(total_df, train_pct, dev_pct, test_pct):
    #we want to ensure equal amounts of emotional data due to high imbalance
    df_emo = total_df.loc[total_df['Label'] == 1]
    rowe , columne  = df_emo.shape
    df_neu = total_df.loc[total_df['Label'] == 0]
    rown , columnn  = df_neu.shape

    #get number of rows per set
    train_size_emo = round(rowe*train_pct)
    dev_size_emo = round(rowe*dev_pct)
    test_size_emo = round(rowe*test_pct)

    train_size_neu = round(rown*train_pct)
    dev_size_neu = round(rown*dev_pct)
    test_size_neu = round(rown*test_pct)

    #get training set
    df_train_emo = df_emo[0:train_size_emo-1]
    df_train_neu = df_neu[0:train_size_neu-1]
    df_train = pd.concat([df_train_emo,df_train_neu])
    print("Size of training set: "+str(df_train.shape))
    
    #get dev set
    df_dev_emo = df_emo[train_size_emo:(train_size_emo+dev_size_emo)-1].reset_index(drop=True)
    df_dev_neu = df_neu[train_size_neu:(train_size_neu+dev_size_neu)-1].reset_index(drop=True)
    df_dev = pd.concat([df_dev_emo,df_dev_neu])
    print("Size of dev set: "+str(df_dev.shape))

    #get test set
    df_test_emo = df_emo[dev_size_emo+train_size_emo:]
    df_test_neu = df_neu[dev_size_neu+train_size_neu:]
    df_test = pd.concat([df_test_emo,df_test_neu])
    print("Size of test set: "+str(df_test.shape))
    
    return df_train, df_dev, df_test

In [6]:
#create_datasets gets the emails and separates dataframes by percentage
def create_datasets(train_pct,dev_pct,test_pct):
    total_df = get_all_tagged_emails()
    return create_sets(total_df, train_pct, dev_pct, test_pct)


### Tokenize and Tag

In [7]:
# load nltk's English stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')
# load nltk's SnowballStemmer as variabled 'stemmer'
stemmer = SnowballStemmer("english")
legal_words = {"section","fw","re","ops","fyi","doc no","case no","case","usc","foia","u.s.c",\
               "report","attachment","attachments","note","amended", "ebook","subject","unclassified department of state case","doc",\
               "unclassified","original message","project", "copyright", "pls", "pis","state"}

# here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed
def tokenize_and_stem(text):
    tokens = []
    processed = ""
    
    #filter out some of the markup lines of text
    for line in text.split('\n'):
        s = line.lower()
        if s.startswith("unclassified u.s. department of state") or \
            s.startswith("release in"):
                 pass
        else:
            processed = processed + line + '\n'

    #tokenize the sentences        
    sents = [s for s in nltk.sent_tokenize(processed)]
    for s in sents:
        #get word tokens for words that are important (not 'legal words')
        tokens = tokens + [word for word in nltk.word_tokenize(s) if word not in legal_words]
    filtered_tokens = []
 
    # filter out any tokens containing numbers and ensure at least some letters
    for token in tokens:
        if not re.search('[0-9]', token):
            if re.search('[a-zA-Z]', token):
                filtered_tokens.append(token)
    #return stems
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [8]:
#this is from INFO 256 Marti Hearst's POS Tagger exercise
def create_data_sets(sentences):
    size = int(len(sentences) * 0.9)     #This keeps 10% of the sentences as test data.
    train_sents = sentences[:size]
    test_sents = sentences[size:]
    return train_sents, test_sents

#this is from INFO 256 Marti Hearst's POS Tagger exercise
def build_backoff_tagger (train_sents):
    t0 = nltk.DefaultTagger('NN')       #I changed the default to Proper Noun
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    return t2

#train the tagger with some specific sentences (if needed)
def train_tagger_with_more():
    my_sents = []
    tagged_sents = nltk.corpus.nps_chat.tagged_posts() #use the chat corpus for 'informal' speak
    return train_tagger(my_sents + tagged_sents)

#this is from INFO 256 Marti Hearst's POS Tagger exercise
def train_tagger(already_tagged_sents):
    train_sents, test_sents = create_data_sets(already_tagged_sents)
    ngram_tagger = build_backoff_tagger(train_sents)
    print ("%0.3f pos accuracy on test set" % ngram_tagger.evaluate(test_sents))
    return ngram_tagger



In [9]:
chat_tagger = train_tagger_with_more() #train an informal speech tagger

#This is from Clement Michard (c) 2015
#https://github.com/clemtoy/WNAffect
wna = WNAffect('wordnet-1.6/', 'wn-domains-3.2/') #create a wordnet affect dictionary

0.858 pos accuracy on test set


In [10]:
# here I define a tokenizer, do POS tagging, and to look for emotion words
def tokenize_and_emote(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = []
    processed = ""
    for line in text.split('\n'):
        s = line.lower()
        #filters the header information in the emails
        if s.startswith("unclassified u.s. department of state") or \
            s.startswith("release in") or \
            s.startswith("original message") or \
            s.startswith("to:") or \
            s.startswith("from:") or \
            s.startswith("sent:") or  \
            s.startswith("cc:"):
                 pass
        else:
            processed = processed + line + '\n'
    sents = [s for s in nltk.sent_tokenize(processed)]
    for s in sents:
        #POS tag the tokens!
        tokens = tokens + [word for word in chat_tagger.tag(nltk.word_tokenize(s))] 

    filtered_tokens = []
    
    # filter out numbers and then get emotion category from WordNetAffect
    for token in tokens:
        if not re.search('[0-9]', token[0]):
            if re.search('[a-zA-Z]', token[0]):
                filtered_tokens.append(wna.get_emotion(token[0].lower(),token[1]))
    
    emotions = [str(t) for t in filtered_tokens if t is not None]
    #print(stems)
    return emotions

## Custom Transformers!

In [11]:
#remove_articles: looks for the start of news articles and strips any text that follows
def remove_articles(text):
    m = re.search(r"\(Reuters\) \-|\(AP\) \-|\(Associated Press\) \-|http\:\\\\", text)
    if m is not None:
        return text[:m.start(0)]
    else:
        return text

#how_emo - logarithmic transform of emotional weights
def how_emo(x):
    if int(x * 100) > 0:
        return int(math.log((x * 100)+1, 2)) + 1
    else:
        return 0
    
#simply counts the number of linebreaks in an email
def count_lines(text):
    return len(text.split('\n'))

#LengthTransformer: Feature is the log_10 of the number of characters in an email
class LengthTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        #print(X)
        lines = DataFrame(X.apply(lambda x: int(math.log(len(x), 10))))
        #print(lines)
        return lines

    def fit(self, X, y=None, **fit_params):
        return self

#count_puncts: returns the number of expressive punctuation and intensifiers in a text
def count_puncts(text):
    m = re.findall(r"(?:[\!])|(?:very)|(?:so much)|(?:\.\.\.)|(?:thx)|(?:thank)",text.lower())
    #m = re.findall(r"(?:[\!][\!])",text)
    if m is not None and len(m) > 0:
        return len(m) 
    else:
        return 0    

#PuncTransformer: Feature is the number of expressive punctuation and intensifiers per line of text
#The scores are adjusted by a logarithmic function
#Emails are stripped of news article forwards
class PuncTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        lines = DataFrame(X.apply(lambda x: how_emo(count_puncts(remove_articles(x))/\
                                                    count_lines(remove_articles(x)))))
        #print(lines)
        return lines

    def fit(self, X, y=None, **fit_params):
        return self

#https://medium.com/@aneesha/quick-social-media-sentiment-analysis-with-vader-da44951e4116#.okgv4siow
#https://pypi.python.org/pypi/vaderSentiment
#https://github.com/cjhutto/vaderSentiment/issues/5
#http://www.nltk.org/howto/sentiment.html
class SentimentTransformer(TransformerMixin):
    
    #assigns sentiment into 4 buckets based on intensity
    sid = SentimentIntensityAnalyzer()
    def get_sentiment(self, text):
        result = self.sid.polarity_scores(text[0:500]) 
        #print(result)
        if result['neu'] > .9 :
            return 0
        elif result['neu'] > .8 :
            return 1
        elif result['neu'] > .7 :
            return 2
        else:
            return 3
    
    #Feature: a score of 0 to 3 based on the degree of +/- sentiment
    def transform(self, X, **transform_params):
        lines = DataFrame(X.apply(lambda x: self.get_sentiment(remove_articles(x))))
        #print(lines)
        return lines

    def fit(self, X, y=None, **fit_params):
        return self

#EmoTransformer: Feature is the number of 'emotions' in an email per line of text
#The scores are adjusted by a logarithmic function
#Emails are stripped of news article forwards    
class EmoTransformer(TransformerMixin):  
    def transform(self, X, **transform_params):       
        lines = DataFrame(X.apply(lambda x: \
                how_emo(len(tokenize_and_emote(remove_articles(x)))/count_lines(remove_articles(x)))))
        #print('emotions' + str(lines))
        return lines
    
    def fit(self, X, y=None, **fit_params):
        return self

In [12]:
#basic gridsearch based on:
#http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV
def perform_gridsearch(clf, df_train, df_dev):
    parameters = {
                    'vect__max_df': (0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
    #                 'clf__penalty': ('l2', 'elasticnet')
    #                 'vect__max_features': (None, 5000, 10000, 50000),
                   'tfidf__smooth_idf': (True, False),
    #                'tfidf__sublinear_idf': (True, False),
                    'clf__penalty': ('l1', 'l2', 'elasticnet'),
    #                 'clf__dual': (True, False),
    #                 'clf__loss': ('hinge','squared_hinge')
     }

    gs_clf = GridSearchCV(clf, parameters, n_jobs=-1)
    gs_clf = gs_clf.fit(df_train["RawText"], df_train.Label)
    predicted = gs_clf.predict(df_dev["RawText"])
    accuracy_score(df_dev.Label, predicted)
    
    print("Best score: %0.3f" % gs_clf.best_score_)
    print("Best parameters set:")
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
       print("\t%s: %r" % (param_name, best_parameters[param_name]))

## Pipeline!

In [13]:

#Code derived from INFO256 notebook: Classification with Scikit-Learn
#By John Semerdjian, Andrea Gagliano, and Marti Hearst
#Pipeline + FeatureUnion and custom transformers is
#built from http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html

def train_classifier(df_train, df_dev):
    
#Comments are other classifiers used
#     clf1 = LogisticRegression(random_state=1)
#     clf2 = SGDClassifier(loss='log',alpha=1e-4, penalty='l2', n_iter=50, random_state=69)
#     clf3 = MultinomialNB()
    text_clf = Pipeline([
      ('features', FeatureUnion([
        ('ngram_tf_idf', Pipeline([
            #Bag of words/Bigrams, counts/TFIDF as features
            ('vect', CountVectorizer(ngram_range=(1,2), min_df = .2, max_df = .75, tokenizer=tokenize_and_stem, stop_words='english')),
            ('tfidf', TfidfTransformer(use_idf=True,norm='l1'))
        ])),
         ('email_length', LengthTransformer()),
         ('jubilant', PuncTransformer()),
         ('sentiment', SentimentTransformer()),
         ('emotions', EmoTransformer())
      ])),
#         ('feature_selection', SelectFromModel(LinearSVC(penalty="l2"))),
#         ('clf', RandomForestClassifier(class_weight={1:5}))
        ('clf', LinearSVC(C=1, random_state=42, penalty='l2', dual=True, tol=1e-5, class_weight='balanced'))
#         ('clf', SGDClassifier(loss='hinge', alpha=1e-4, penalty='l2', n_iter=50, random_state=69, class_weight={1:5}))
#         ('clf',VotingClassifier(estimators=[('lr', clf1), ('sgdc', clf2), ('lsvc', clf3)], voting='hard'))
    ])
    text_clf.fit(df_train["RawText"], df_train.Label)
    predicted = text_clf.predict(df_dev["RawText"])
    #perform_gridsearch(text_clf, df_train, df_dev)
    
    print(accuracy_score(df_dev.Label, predicted))
    print(pd.crosstab(df_dev.Label, predicted, rownames=['True'], colnames=['Predicted'], margins=True))
    print('Yes!')
    return text_clf

We tried precision/recall adjustments as we were comfortable with more false positives (emails being incorrectly classified as having an emotional tone), but wanted most emotional emails classified correctly.

In [14]:
#http://stats.stackexchange.com/questions/140266/classifier-with-adjustable-precision-vs-recall
def increase_recall(clf, df_dev):
    y_score = clf.decision_function(df_dev["RawText"])
    prcsn,rcl,thrshld=precision_recall_curve(df_dev.Label,y_score)
    min_prcsn=.25 # here is your precision lower bound e.g. 25%
    min_thrshld=min([thrshld[i] for i in range(len(thrshld)) if prcsn[i]>min_prcsn])
    y_pred_adjusted=[1 if y_s>min_thrshld else 0 for y_s in y_score]
    new_preds = np.array(list([1 if y_pred_adjusted[i] < 1 else predicted[i] for i in range(0,len(predicted))]))
    return new_preds

In [15]:
#print_error
#Originally Coded by Avi Dixit for INFO256
#Prints emotional emails if incorrectly categorised by the predictor to be able to fine tune the classifier

def print_errors(df_dev, predicted):
    devs = df_dev[["RawText","Label"]].values.tolist()
    predict = list(predicted)

    categories = ["Neutral","Emotion"]
    ind = 0
    for (z,y) in zip(devs, predict):
        if ind > 10:
            break
        if (str(z[1]) == str(y) or (int(z[1]) == 0)):
            continue
        else:
            ind += 1
            print("Predicted is {}".format(categories[int(y)]))
            print("Actual is {}".format(categories[int(z[1])]))
            print("Sentence:  {} \n". format(z[0]))
    #print("Total incorrect are {}".format(ind))

### Classifying the rest of our data (getting emotions too)

In [16]:
#classify_data, classifies a given dataframe
def classify_data(email_classifier, df):
    predicted = email_classifier.predict(df["RawText"])
    print(accuracy_score(df.Label, predicted))
    print(pd.crosstab(df.Label, predicted, rownames=['True'], colnames=['Predicted'], margins=True))

    #stores predicted labels in a new column
    df["NewLabel"] = predicted
    data_list = df["RawText"].values.tolist()
    
    #get the emotion words too
    df["Emotions"] = [0] * len(data_list)
    df.reset_index(drop=True, inplace=True)
    for index in range(0, len(data_list)):
         df["Emotions"].loc[index] = ";".join([w for w in (set(tokenize_and_emote(data_list[index])))])
    return df


## Store the Emails in a Graph!

In [17]:
#creates a graph and adds a classified data frame
def create_graph(df):
    graph = EmailGraph("neo4j","becca")        #the username , password pair
    graph.delete()                             #wipes the graph clean
    graph.add_new_emails(df.shape[0], df)      #adds the new emails
    return graph

## Putting it all together (this function runs the show)

In [18]:
# Uncomment the create_graph part once you're ready with Neo4j!
def final_algorithm():
    train_df, dev_df, test_df = create_datasets(.8,.1,.1)
    email_classifier = train_classifier(train_df, dev_df)
    result_df = classify_data(email_classifier, test_df)
    #create_graph(result_df)
    return
final_algorithm()

Getting Email files...
Total tagged emails: 999
Size of training set: (797, 12)
Size of dev set: (98, 12)
Size of test set: (100, 12)
0.785714285714
Predicted   0   1  All
True                  
0          59  17   76
1           4  18   22
All        63  35   98
Yes!
0.79
Predicted   0   1  All
True                  
0          63  15   78
1           6  16   22
All        69  31  100


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


### Here are some sample Neo4J queries...