In [192]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from sklearn import model_selection
from sklearn import linear_model
from sklearn import ensemble
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

%matplotlib inline

In [347]:
# Here I am creating so helper functions to help with parsing out our text
def clean_soup(soup):
    for tag in soup.find_all(['script', 'style','meta']):
        tag.decompose()   
    return soup.get_text()


def find_between( s, first, last ):
    try:
        start = s.index( first ) + len( first )
        end = s.index( last, start )
        return s[start:end]
    except ValueError:
        return ""


In [None]:
import os

# This is the hard coded directory in which the raw html lives on my computer
rootdir = '/home/roland/Workspace/Data/Procrastinating_HTML/'
# Here we are creating a data frame to store the classified information in
data = pd.DataFrame(columns = ['activity', 'text', 'timestamp'])

# These strings were placed into the HTML by my browser extension to hold onto store the url
URL_STRING = "__URL__: "
HTML_START = "<"

for subdir, dirs, files in os.walk(rootdir):
    i = 0
    for file in files:
        f = open(os.path.join(subdir, file))
        raw = f.read()
        url = ''
        # Parsing out the URL string if it exists
        if URL_STRING in raw:
            url = find_between(raw, URL_STRING, HTML_START)
            raw = raw.replace(url, '')
        
        # Below are characters I am manually parsing out. There must exist a more efficient way to do this
        # but for now it runs fast enough.
        raw = raw.replace('\n', ' ')
        raw = raw.replace('\t', ' ')
        raw = raw.replace('\\n', ' ')
        raw = raw.replace('\\t', ' ')
        raw = raw.replace(URL_STRING, '')

        soup = BeautifulSoup(raw)
        data.set_value(i, 'text', clean_soup(soup))
        t = str(file).split('_')
        data.set_value(i, 'activity', t[0])
        timestamp = t[1].split('.')[0]
        data.set_value(i, 'timestamp', timestamp)
        data.set_value(i, 'url', url)
        i+=1
        f.close()




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [4]:
data=data[data.text != ""]

In [5]:
data.timestamp = data.timestamp.str.replace("\(1\)","")
data.timestamp = data.timestamp.str.replace("T","-")

In [10]:
X = data[['text','url']]

X_train, X_test, y_train, y_test = model_selection.train_test_split(data, data.activity,test_size=0.33, random_state=43)

## Model building

#### Building a grid searched pipeline

I am using a grid search over a pipeline to find the best model to use. After much trial and error I settled on a logistic regression model. First off it simply performaned the best. But additionally it is by far the most interpretible model. Also, Lasso Regularization is a great way to reduce the number of features generated by the countvectorization process, and the use of ngrams. At one point I had a dataframe with over 2,000,000 features. Lasso Regularization reduced that to a few hundred.

In [322]:
from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer, TfidfTransformer

# Let's create some stop words. I chose these values after doing a little bit of EDA.
stop = stopwords.words('english')
stop = stop + ['https', 'www', 'com', 'http']
cvt = CountVectorizer(stop_words=stop, ngram_range=[1,4])

# Here we are initializing the values we want to grid search over.
param_grid = dict(vect = [CountVectorizer()],
                  vect__ngram_range=[[1,3],[1,4]], # Trying different ngram ranges
                  vect__stop_words = [stop],
                tfidf = [TfidfTransformer()],
                tfidf__norm = [None],
                clf=[LogisticRegression()],
                clf__C=[.04,.1,.06, .07, .05], # Trying different coefficients for alpha
                clf__penalty=['l1'])
            

pipeline = Pipeline([
    ('vect', cvt),
    ('tfidf', TfidfTransformer(norm=None)),
    ('clf', LogisticRegression(penalty='l1'))
]) 

grid_search = GridSearchCV(pipeline, param_grid=param_grid)

grid_search.fit(X_train.text, y_train)
grid_search.best_estimator_.score(X_test.text, y_test)

0.97493036211699169

## Analyzing our results

Our accuracy score is looking great. 97.5% is great, but we should compare it to our baseline distribution before we get to excited.

In [346]:
# Caluclating our baseline
(y_train == 'work').sum()/float(len(y_train))

0.41483516483516486

So we have massively improved over random chance. This is a good start. We should look at some additional metrics as well to see if we have anything to be concerned about.

In [324]:
# Finding our best pipeline and pulling out the useful components
pipeline = grid_search.best_estimator_
lm =  pipeline.named_steps['clf']
vect = pipeline.named_steps['vect']

In [327]:
# This bit of code is pulling out my features that have coefficients greater than zero
# Lasso regularization reduces the coef 0 of the features (in our case unique ngrams)
import math
features =(vect.get_feature_names())
feature_dict = {}
for (i, f) in enumerate(features):
    if np.abs(lm.coef_[0][i]) > 0:
        feature_dict[f] = lm.coef_[0][i]

In [334]:
feature_df = pd.DataFrame.from_dict(feature_dict, orient='index')
feature_df.columns = ['coef']

Let's take a quick look oat our number of features and our total documents. We really do not want a model utilizing more features than we have documents. Our reguralization should have accounted for this, but it's not a bad idea to double check.

In [335]:
# How many documents do I have in my training set
print("Number of docs: " + str(len(X_train)))

# How many features do I have after reguralization
print("Number of features: " + str(len(feature_df)))

Number of docs: 728
Number of features: 168


In [344]:
# We can raise or logisitic regression coef to the e to calculate the odds ratio
feature_df['odds_ratio'] = feature_df['coef'].apply(np.exp)

Now let's look at what words are most associated with procrastination and productivity. We can sort our dataframe by odds ratio. The smaller odds ratio means words that are less related productivity, and a higher ratio means more related.

In [345]:
feature_df.sort_values('odds_ratio').head(10)

Unnamed: 0,coef,odds_ratio
game,-0.137337,0.871676
likes,-0.082925,0.92042
reddit,-0.078881,0.92415
photo,-0.063171,0.938783
attack,-0.057691,0.943942
src,-0.056463,0.945101
thwas,-0.055762,0.945764
video,-0.042215,0.958664
us,-0.041966,0.958902
5e,-0.038311,0.962414


In [343]:
feature_df.sort_values('odds_ratio', ascending=False).head(10)

Unnamed: 0,coef,odds_ratio
github,0.170815,1.186271
using,0.157641,1.170745
data,0.115566,1.122509
file,0.07185,1.074494
code,0.068924,1.071355
import,0.063022,1.06505
instagram,0.053534,1.054993
friction,0.052766,1.054183
stack,0.051788,1.053153
kurzgesagt,0.039332,1.040116


In [332]:
result_x =vect.transform(X_test.text)

In [318]:
X_test['prob_work'] = lm.predict_proba(result_x)[:,1]*100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [319]:
X_test['prob_work'] = X_test.prob_work.apply(int)
X_test['predict']  = lm.predict(result_x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [320]:
X_test[X_test.activity != X_test.predict]

Unnamed: 0,activity,text,timestamp,url,prob_work,predict
818,procr,AnyDice AnyDice Dice Probability C...,2017-7-27-11-0-51,,57,work
147,work,K-means Clustering : pystatsjump to contentMy ...,2017-7-25-19-59-3,,42,procr
495,procr,dnd 5e - Multiclass Warlock/Wizard: Can I use...,2017-7-24-10-22-3,,75,work
997,procr,[OC] snapchat heat map aligning with path of E...,2017-8-21-15-13-0,https://www.reddit.com/r/dataisbeautiful/comme...,63,work
389,procr,Python (genus) - Wikipedia ...,2017-8-5-3-23-47,,72,work
505,work,Dashboard ...,2017-8-22-14-40-40,https://git.generalassemb.ly/orgs/DSI-DC-5/das...,48,procr
427,work,Balances ...,2017-8-22-14-9-28,https://ebranch.nasafcu.com/HBNet/App/Account/...,48,procr
855,work,Python-written open source tool to transform e...,2017-7-25-19-52-37,,37,procr
860,procr,Dead-Baby Jokes about | contac...,2017-8-18-12-34-50,http://www.skrause.org/humor/deadbaby.shtml,50,work
806,work,(1) Intro to vectors & scalars | One-di...,2017-7-20-18-20-21,,34,procr


The results from my initial run through have been extremely successful. I have a f7unctioning model with above a 95% accuracy. My base line is about 60% so this is a large improvement. I have used logistic regression with regularization to get my intial results, but I have some concerns about that approach.

I currently have 105000 features wiht non-zero coefficients. This is vastly more than the number of documents I have in my training set (~700). I have discussed several approaches to solving this issue. One, is that my model is surviving a train test split. It is accurate, despite having the potential to be over fitted. I think the reguralization is doing it's part. Nonetheless, I am going to try to reduce the number of features and see how much my accuracy is actually impacted.

Secondly I want to try different modeling methods. It has been suggested that I use and SVM. I will also try a random forest, given that they handle large numbers of features well.

Finally, I would like to do some more exploration of what features are correlated with what URLs. I have some artificat features that don't correlate with real words. I want to know if they only show up on certain websites, or if they exist accross multiple websites.

All in all, my model is in good shape. I have a strong predictability. I will spend some time doing additional exploration, but overall I think I am in good shape. I need to make some additional visualizations for my presentation (I have several in mind).