# Test Project: Import emails exported from outlook via csv into DOTCE, classify

## Step 1: Go to correct directory and activate the DOTCE virtual env

%cd c:\Users\embicks\dotce
%pwd
! activate dotce
! conda info --envs

## Step 2: Load various machine learning libraries to extract features from text document corpus and build classification models

In [48]:
import pandas as pd
import os
import numpy as np
import sys
sys.path.append('C:\\Users\\embicks\\AppData\\Local\\Continuum\\Anaconda3\\envs\\dotce\\lib\\site-packages\\pyLDAvis')
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression as LR
from sklearn.naive_bayes import MultinomialNB

## Step 3: Read csv file into df and split into training and test sets

In [49]:
#Make sure we're in the right directory
dirname = r'C:/Users/alasseter/Documents/Projects/CTxE/Federal Register/data/'
df = pd.read_csv(dirname + 'fedreg10k.csv')

In [50]:
df.shape

(10000, 17)

In [51]:
df.head(2)

Unnamed: 0,document_number,publication_date,agency_id,agency,subagency,title,abstract,type,excerpts,raw_name,slug,url,html_url,pdf_url,public_inspection_pdf_url,json_url,parent_id
0,2018-10583,2018-05-22,492,Transportation Department,Federal Aviation Administration,Airworthiness Directives; DG Flugzeugbau GmbH ...,We are superseding Airworthiness Directive (AD...,Rule,We are superseding Airworthiness Directive (AD...,DEPARTMENT OF TRANSPORTATION,transportation-department,https://www.federalregister.gov/agencies/trans...,https://www.federalregister.gov/documents/2018...,https://www.gpo.gov/fdsys/pkg/FR-2018-05-22/pd...,https://s3.amazonaws.com/public-inspection.fed...,https://www.federalregister.gov/api/v1/agencie...,
1,2018-10902,2018-05-22,77,Commodity Futures Trading Commission,,Foreign Futures and Options Transactions,The Commodity Futures Trading Commission (Comm...,Rule,The Commodity Futures Trading Commission (Comm...,COMMODITY FUTURES TRADING COMMISSION,commodity-futures-trading-commission,https://www.federalregister.gov/agencies/commo...,https://www.federalregister.gov/documents/2018...,https://www.gpo.gov/fdsys/pkg/FR-2018-05-22/pd...,https://s3.amazonaws.com/public-inspection.fed...,https://www.federalregister.gov/api/v1/agencie...,


In [52]:
# Remove rows with empty data
print(df['abstract'].isnull().sum())
df=df.dropna(subset=['abstract'], how='all')
df.shape

2977


(7023, 17)

In [53]:
# What are the top agencies in the dataset?
df['agency'].value_counts().head()

Transportation Department               934
Commerce Department                     852
Health and Human Services Department    723
Homeland Security Department            602
Environmental Protection Agency         482
Name: agency, dtype: int64

In [54]:
# Create the target variable
df['target'] = (df['agency'] == 'Transportation Department')
df['target'].value_counts()

False    6089
True      934
Name: target, dtype: int64

In [55]:
train_df['abstract'][0]

'We are superseding Airworthiness Directive (AD) 2017-11-03 for DG Flugzeugbau GmbH Model DG-500MB gliders that are equipped with a Solo 2625 02 engine modified with a fuel injection system following the instructions of Solo Kleinmoteren GmbH Technische Mitteilung 4600-3 and identified as Solo 2625 02i. This AD results from mandatory continuing airworthiness information (MCAI) issued by an aviation authority of another country to identify and correct an unsafe condition on an aviation product. The MCAI describes the unsafe condition as failure of the connecting rod bearing resulting from too much load on the rod bearings from the engine control unit. This AD adds a model to the applicability. We are issuing this AD to require actions to address the unsafe condition on these products.'

## Step 4: Let's make sure we can tokenize these properly and remove stop words
<p>from <a href="http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html">CS Duke.edu</a></p>
<p>Make sure you've installed the english punctuation and stop words list 
following <a href="http://www.nltk.org/data.html">these instructions.</a>
</p>

#### Preprocessing

In [57]:
# Establish the column that we'll use as the X set
df['abstract'].head()

0    We are superseding Airworthiness Directive (AD...
1    The Commodity Futures Trading Commission (Comm...
2    We are extending the expiration date of Endocr...
3    The Coast Guard has issued a temporary deviati...
4    The Coast Guard has issued a temporary deviati...
Name: abstract, dtype: object

In [58]:
# strip punctuaction
from utils.email_utils import stripPunctuation
df['abstract'] = df['abstract'].apply(stripPunctuation)
df['abstract'].head()

0    We are superseding Airworthiness Directive AD ...
1    The Commodity Futures Trading Commission Commi...
2    We are extending the expiration date of Endocr...
3    The Coast Guard has issued a temporary deviati...
4    The Coast Guard has issued a temporary deviati...
Name: abstract, dtype: object

In [59]:
# convert to lower case
from utils.email_utils import preprocess
df['abstract'] = df['abstract'].apply(preprocess)
df['abstract'].head()

0    we are superseding airworthiness directive ad ...
1    the commodity futures trading commission commi...
2    we are extending the expiration date of endocr...
3    the coast guard has issued a temporary deviati...
4    the coast guard has issued a temporary deviati...
Name: abstract, dtype: object

In [60]:
# examine the tokenizer (note: this is just exploratory and does not modify the data)
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import WhitespaceTokenizer
df['abstract'].apply(myTokenizer.tokenize).head()

0    [we, are, superseding, airworthiness, directiv...
1    [the, commodity, futures, trading, commission,...
2    [we, are, extending, the, expiration, date, of...
3    [the, coast, guard, has, issued, a, temporary,...
4    [the, coast, guard, has, issued, a, temporary,...
Name: abstract, dtype: object

In [61]:
# Count the tokens (note: this is just exploratory and does not modify the data)
df['abstract'].apply(countTokens).head()

0    124
1    158
2     70
3    101
4     63
Name: abstract, dtype: int64

## Step 5: convert train and test dfs to term X document representation matrices (_X)

In [66]:
# Establish our X and y
X = df['abstract']
y = df['target']

In [67]:
# Let's set aside 33% of our data for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [70]:
##Instantiate a TFidf vectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, encoding='utf-8', 
                             max_df=0.5, tokenizer=myTokenizer.tokenize)

In [74]:
#This step can take a while
tvec_train = vectorizer.fit_transform(X_train)

In [73]:
##Save feature names in a separate list
feature_names = vectorizer.get_feature_names()

In [75]:
#Create another matrix of tfidf scores for the documents in the test set
tvec_test = vectorizer.transform(X_test)

## Step 5: Instantiate and Train a Logistic Regression Classifier

In [80]:
#Create a logistic regression model
#From same tutorial https://github.com/zygmuntz/classifying-text/blob/master/bow_predict.py
model_lr = LR()
model_lr.fit(tvec_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Step 5B: Try a Naive Bayes Classifier

In [81]:
#Try a naive bayes classifier instead
model_nb = MultinomialNB(alpha=0.05)
model_nb.fit(tvec_train, y_train)

MultinomialNB(alpha=0.05, class_prior=None, fit_prior=True)

## Step 6: Test classifier on test_X matrix

In [83]:
#Predict probability of class membership
p = model_lr.predict_proba(tvec_test)[:,1]
model_lr.score(tvec_test, y_test)

0.9641932700603969

### Test NB Classifier results

In [85]:
p2 = model_nb.predict(tvec_test)
model_nb.score(tvec_test, y_test)

0.9723899913718723

In [88]:
print(len(y_test))
len(p2)

2318


2318

In [90]:
probabilities = model_nb.predict_proba(tvec_test)

In [91]:
# Convert each component to a pandas dateframe
df_probs=pd.DataFrame(probabilities).reset_index(drop=True)
df_Xtest=pd.DataFrame(X_test).reset_index(drop=True)
df_ytest=pd.DataFrame(y_test).reset_index(drop=True)
# Reset a new index because we removed all the training data but never reset the index, so it has gaps
# and drop=True gets rid of the old index

final=pd.concat([df_Xtest, df_ytest, df_probs], axis=1)

In [96]:
# Let's look at some that had high probability of being assigned to the target agency
final[(final[0]<0.01)].head(5)

Unnamed: 0,abstract,target,0,1
3,we propose to adopt a new airworthiness direct...,True,8.934026e-07,0.999999
23,fmcsa announces its decision to renew exemptio...,True,4.959207e-07,1.0
24,in accordance with the procedures governing th...,True,0.0009016302,0.999098
34,we are superseding airworthiness directive ad ...,True,0.001053485,0.998947
43,this action proposes to amend class e airspace...,True,0.0003109738,0.999689


In [97]:
# Let's look at some that had high probability of NOT being assigned to the target agency
final[(final[0]>0.99)].head(5)

Unnamed: 0,abstract,target,0,1
2,epa has received applications to register new ...,False,0.999923,7.7e-05
4,the federal emergency management agency as par...,False,0.999977,2.3e-05
5,this document directs that a referendum be con...,False,0.997972,0.002028
6,we the us fish and wildlife service service ha...,False,0.999979,2.1e-05
7,the national institute for occupational safety...,False,0.99998,2e-05


In [92]:
# Let's look at some that had intermediate probability
final[(final[0]!=0.0) & (final[0]!=1.0)].head(10)

Unnamed: 0,abstract,target,0,1
0,nmfs closes the gulf of mexico angling categor...,False,0.9878835,0.012117
1,the us department of transportation federal hi...,True,0.9225769,0.077423
2,epa has received applications to register new ...,False,0.999923,7.7e-05
3,we propose to adopt a new airworthiness direct...,True,8.934026e-07,0.999999
4,the federal emergency management agency as par...,False,0.9999765,2.3e-05
5,this document directs that a referendum be con...,False,0.997972,0.002028
6,we the us fish and wildlife service service ha...,False,0.9999788,2.1e-05
7,the national institute for occupational safety...,False,0.9999801,2e-05
8,the commission is noticing a recent postal ser...,False,0.9999955,5e-06
9,this action provides public notice and solicit...,False,0.984674,0.015326


## Step 7: Evaluate Results
<p>Nice summary of different formulas for accuracy, precision, recall, etc 
<a href="http://www.damienfrancois.be/blog/files/modelperfcheatsheet.pdf">here</a></p>

In [99]:
#Write out results
final.head()

Unnamed: 0,abstract,target,0,1
0,nmfs closes the gulf of mexico angling categor...,False,0.9878835,0.012117
1,the us department of transportation federal hi...,True,0.9225769,0.077423
2,epa has received applications to register new ...,False,0.999923,7.7e-05
3,we propose to adopt a new airworthiness direct...,True,8.934026e-07,0.999999
4,the federal emergency management agency as par...,False,0.9999765,2.3e-05


In [104]:
THRESHOLD = .90
final['prediction'] = (final[1] >= THRESHOLD)
## When did the classifier get it right?
final.loc[final['prediction']==True].head(10)

Unnamed: 0,abstract,target,0,1,prediction
3,we propose to adopt a new airworthiness direct...,True,8.934026e-07,0.999999,True
23,fmcsa announces its decision to renew exemptio...,True,4.959207e-07,1.0,True
24,in accordance with the procedures governing th...,True,0.0009016302,0.999098,True
34,we are superseding airworthiness directive ad ...,True,0.001053485,0.998947,True
43,this action proposes to amend class e airspace...,True,0.0003109738,0.999689,True
65,this action proposes to modify class e airspac...,True,9.80608e-07,0.999999,True
68,this notice contains a summary of a petition s...,True,0.0006065552,0.999393,True
81,the secretary of transportation as represented...,True,6.227822e-06,0.999994,True
83,this action proposes to amend class d airspace...,True,9.613364e-08,1.0,True
95,the moving ahead for progress in the 21st cent...,True,0.02811111,0.971889,True


In [105]:
#Now let's see how well this model did, true positives, false positives, etc
def accuracy(tp, tn, fp, fn):
    return ((tp + tn)/(tp + tn + fp + fn))

def error_rate(tp, tn, fp, fn):
    return ((fp + fn)/ (tp + tn + fp + fn))

true_positives = len(final.loc[(final['target'] == True) & (final['prediction'] == True)])
false_positives = len(final.loc[(final['target'] == False) & (final['prediction'] == True)])
true_negatives = len(final.loc[(final['target'] == False) & (final['prediction'] == False)])
false_negatives = len(final.loc[(final['target'] == True) & (final['prediction'] == False)])
print("Results\nTrue Positives\tTrue_Negatives\tFalse_Positives\False_Negatives\n")
print("\t".join(map(str, [true_positives, true_negatives, false_positives, false_negatives])))

print("Classifier Accuracy: {}\n".format(accuracy(true_positives, true_negatives, 
                                                  false_positives, false_negatives)))
print("Classifier Error Rate: {}\n".format(error_rate(true_positives, true_negatives,
                                                      false_positives, false_negatives)))


Results
True Positives	True_Negatives	False_Positives\False_Negatives

232	1999	0	87
Classifier Accuracy: 0.9624676445211389

Classifier Error Rate: 0.037532355478861086



### Evaluate NB using metrics module

In [106]:
from sklearn import metrics
print(metrics.classification_report(y_test, p2))

             precision    recall  f1-score   support

      False       0.97      1.00      0.98      1999
       True       1.00      0.80      0.89       319

avg / total       0.97      0.97      0.97      2318



### Which features are most informative?
<p>Stolen/lifted from <a href="https://stackoverflow.com/questions/11116697/how-to-get-most-informative-features-for-scikit-learn-classifiers">stack overflow</a></p>

In [107]:
#What are the most informative features in this test?
def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print ("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

show_most_informative_features(vectorizer, model_nb, 25)

	-11.4533	0              		-4.9702	ad             
	-11.4533	00             		-5.4264	airworthiness  
	-11.4533	0003           		-5.4748	by             
	-11.4533	0007           		-5.4801	faa            
	-11.4533	001            		-5.5052	these          
	-11.4533	0011           		-5.5136	are            
	-11.4533	002            		-5.5170	we             
	-11.4533	0028           		-5.5568	airspace       
	-11.4533	00362          		-5.5658	model          
	-11.4533	0101           		-5.5672	safety         
	-11.4533	010106         		-5.5733	motor          
	-11.4533	01022018       		-5.5942	condition      
	-11.4533	011            		-5.6086	marad          
	-11.4533	01152018       		-5.6160	airplanes      
	-11.4533	01162018       		-5.6181	certain        
	-11.4533	0117           		-5.6206	unsafe         
	-11.4533	01182018       		-5.6330	airport        
	-11.4533	012            		-5.6955	from           
	-11.4533	014            		-5.7843	new            
	-11.4533	014313         		-5.8

## Step 9: More EDA -- A better way of compiling the vocabulary
<a href="http://nlpforhackers.io/tf-idf/">Source http://nlpforhackers.io/tf-idf/</a>

In [109]:
# build the vocabulary in one pass
vocabulary = set()
for text in df['abstract']:
    words = myTokenizer.tokenize(text)
    vocabulary.update(words)
 
vocabulary = list(vocabulary)
word_index = {w: idx for idx, w in enumerate(vocabulary)}
 
VOCABULARY_SIZE = len(vocabulary)
DOCUMENTS_COUNT = len(train_df.index)
 
print("Vocabulary size: {}\nDocuments Count: {}\n".format(
    VOCABULARY_SIZE, DOCUMENTS_COUNT))   

Vocabulary size: 20096
Documents Count: 5618

