In [41]:
import pickle
import numpy
# Seed NumPy's global random number generator so that repeated runs draw the same random numbers.
numpy.random.seed(42)

### The words (features) and authors (labels), already largely processed. These files should have been created from the previous (Lesson 10) mini-project.

In [42]:
# Relative paths of the pickled word data (features) and email authors (labels).
words_file = "../text_learning/your_word_data.pkl" 
authors_file = "../text_learning/your_email_authors.pkl"

In [43]:
# Load the pickled features (word_data) and labels (authors).
# `with` closes each file handle as soon as the load finishes instead of
# leaving it open until the interpreter happens to collect it.
# NOTE(review): the "r" mode is kept unchanged on purpose -- it presumably
# mirrors the mode the files were written in; confirm before switching to "rb".
# SECURITY: pickle.load can execute arbitrary code, so only load files you
# generated yourself.
with open(words_file, "r") as words_handle:
    word_data = pickle.load(words_handle)
with open(authors_file, "r") as authors_handle:
    authors = pickle.load(authors_handle)


### test_size is the fraction of events assigned to the test set (the remainder go into training)
### feature matrices changed to dense representations for compatibility with classifier functions in versions 0.15.2 and earlier

In [44]:
# sklearn.cross_validation was superseded by sklearn.model_selection and later
# removed; prefer the new module and fall back to the old one so the cell runs
# on both old and new scikit-learn releases.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out a 0.1 fraction of the events for testing; random_state pins the shuffle.
features_train, features_test, labels_train, labels_test = train_test_split(
    word_data, authors, test_size=0.1, random_state=42)

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer: sublinear term-frequency scaling, a document-frequency
# cap of 0.5, and the built-in English stop-word list.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.5,
    sublinear_tf=True,
)

In [46]:
# Fit the vectorizer on the training documents and vectorize them in one step.
# features_train is rebound to the result, so re-running this cell would feed
# the already-vectorized matrix back into fit_transform.
features_train = vectorizer.fit_transform(features_train)
# Vectorize the test documents with the already-fitted vectorizer (no refit)
# and convert the result to a dense array.
features_test  = vectorizer.transform(features_test).toarray()

### a classic way to overfit is to use a small number of data points and a large number of features;
### train on only 150 events to put ourselves in this regime

In [47]:
# Deliberately keep only the first N_TRAIN_EVENTS training events so that the
# number of features dwarfs the number of data points (overfitting regime).
N_TRAIN_EVENTS = 150
features_train = features_train[:N_TRAIN_EVENTS]
# Densify only while the matrix still offers .toarray(); after the first run
# features_train is already an ndarray, so re-running the cell is a harmless
# no-op instead of an AttributeError.
if hasattr(features_train, "toarray"):
    features_train = features_train.toarray()
labels_train = labels_train[:N_TRAIN_EVENTS]

## How many training points are there, according to the starter code?

In [48]:
# Display the number of training labels currently held in labels_train.
len(labels_train)

150

## Get a decision tree up and running

In [49]:
from sklearn import tree

In [50]:
# Build a decision tree with default hyper-parameters and fit it in one
# expression (fit returns the fitted estimator), then predict the test set.
clf = tree.DecisionTreeClassifier().fit(features_train, labels_train)
pred_40 = clf.predict(features_test)

In [14]:
from sklearn.metrics import accuracy_score

In [15]:
# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the true
# labels first. The score itself is unchanged because accuracy is symmetric.
# NOTE: despite the variable name, the classifier is built with default
# hyper-parameters (no min_samples_split=40 is set anywhere in this notebook).
acc_min_samples_split_40 = accuracy_score(labels_test, pred_40)

In [16]:
# Parenthesised form prints the same text on Python 2 and also parses on Python 3.
print(acc_min_samples_split_40)

0.959044368601


## What’s the importance of the most important feature? 
## What is the number of this feature?

In [20]:
# Find the largest feature importance in the fitted tree and display the
# index/indices at which it occurs.
importances = clf.feature_importances_
max_importance = importances.max()
numpy.where(importances == max_importance)

(array([33614], dtype=int64),)

In [22]:
# Look the top importance up by its computed position instead of an index
# copied by hand from an earlier output; such an index goes stale as soon as
# the vocabulary (and therefore the feature numbering) changes.
top_importances = clf.feature_importances_
top_importances[numpy.argmax(top_importances)]

0.76470588235294124

## What is the word that’s causing most of the discrimination of the decision tree? 

In [27]:
# Map the most important feature back to its word. The index is computed from
# the fitted tree rather than hard-coded, so the cell stays correct after the
# word data is regenerated.
name_list = vectorizer.get_feature_names()
name_list[numpy.argmax(clf.feature_importances_)]

u'sshacklensf'

#### Go back to text_learning/vectorize_text.py, and remove this word from the emails using the same method you used to remove “sara”, “chris”, etc. Rerun vectorize_text.py, and once that finishes, rerun find_signature.py

## Any other outliers pop up? What word is it?

In [38]:
# After regenerating the word data: find the largest feature importance again
# and display the index/indices at which it occurs.
importances = clf.feature_importances_
max_importance = importances.max()
numpy.where(importances == max_importance)

(array([14343], dtype=int64),)

In [39]:
# Display the top importance via its computed position rather than a
# hand-copied index, which is only valid for one particular vocabulary.
top_importances = clf.feature_importances_
top_importances[numpy.argmax(top_importances)]

0.66666666666666674

In [40]:
# Map the currently most important feature back to its word, using an index
# computed from the fitted tree instead of a hard-coded one.
name_list = vectorizer.get_feature_names()
name_list[numpy.argmax(clf.feature_importances_)]

u'cgermannsf'

#### Update vectorize_text.py one more time, and rerun. 
### Then run find_signature.py again. Any other important features (importance>0.2) arise? How many? 

In [51]:
# Count the features whose importance exceeds the 0.2 threshold.
a = sum(1 for importance in clf.feature_importances_ if importance > 0.2)

In [52]:
# Parenthesised form prints the same text on Python 2 and also parses on Python 3.
print(a)


1


## What’s the accuracy of the decision tree now? 

In [53]:
# Recompute the test accuracy for the current predictions. True labels go
# first, matching accuracy_score(y_true, y_pred); the value is unaffected
# because accuracy is symmetric in its two arguments.
acc_min_samples_split_40 = accuracy_score(labels_test, pred_40)

In [54]:
# Parenthesised form prints the same text on Python 2 and also parses on Python 3.
print(acc_min_samples_split_40)

0.816837315131
