In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from sklearn import model_selection
from sklearn import linear_model
from sklearn import ensemble
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

%matplotlib inline

In [64]:
# This is where I parse out text
def clean_soup(soup):
    for tag in soup.find_all(['script', 'style','meta']):
        tag.decompose()   
    return soup.get_text()


def find_between( s, first, last ):
    try:
        start = s.index( first ) + len( first )
        end = s.index( last, start )
        return s[start:end]
    except ValueError:
        return ""


In [65]:
import os

rootdir = '/home/roland/Workspace/Data/Procrastinating_HTML/'
data = pd.DataFrame(columns = ['activity', 'text', 'timestamp'])

URL_STRING = "__URL__: "
HTML_START = "<"
for subdir, dirs, files in os.walk(rootdir):
    i = 0
    for file in files:
        f = open(os.path.join(subdir, file))
        raw = f.read()
        url = ''
        
        if URL_STRING in raw:
            url = find_between(raw, URL_STRING, HTML_START)
            raw = raw.replace(url, '')
        
        # Below are character I am manually parsing out. There must exist a more efficient way to do this
        # but for now it runs fast enough.
        raw = raw.replace('\n', ' ')
        raw = raw.replace('\t', ' ')
        raw = raw.replace('\\n', ' ')
        raw = raw.replace('\\t', ' ')
        raw = raw.replace(URL_STRING, '')

        soup = BeautifulSoup(raw)
        data.set_value(i, 'text', clean_soup(soup))
        t = str(file).split('_')
        data.set_value(i, 'activity', t[0])
        timestamp = t[1].split('.')[0]
        data.set_value(i, 'timestamp', timestamp)
        data.set_value(i, 'url', url)
        i+=1
        f.close()




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [66]:
data=data[data.text != ""]

In [67]:
data.timestamp = data.timestamp.str.replace("\(1\)","")
data.timestamp = data.timestamp.str.replace("T","-")

In [68]:
X = data[['text','url']]

X_train, X_test, y_train, y_test = model_selection.train_test_split(data, data.activity,test_size=0.33, random_state=43)

In [69]:
def evaluate_model(model, X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred)
    
    print(cm)
    print(cr)
    
    return accuracy

### Below is model building

In [70]:
# replacing any empty urls with empty strings
data.url = data.url.str.replace('NA', "")

In [71]:
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer

#instantiate our model, countvectorizing works th best
cvec = CountVectorizer(stop_words='english')

#fit the count vectorizer to the data. This 'teaches' the count vectorizer the dictionary.
#cvec.fit(data.text)

In [72]:
# Vectorizing my text
result_train = pd.DataFrame(data=cvec.fit_transform(X_train.text).todense(), columns = cvec.get_feature_names())
result_test = pd.DataFrame(data=cvec.transform(X_test.text).todense(), columns= cvec.get_feature_names())


#### Making my features here. 
I have my vectorized text which works really well, but there is additional informaiton that I can engineer that might be useful for prediction. Below is where I am going to work thourgh that process.

In [73]:
lm = linear_model.LogisticRegression(fit_intercept=True, penalty='l1')
# fitting my training data  and scoring with my test data
lm.fit(result_train,y_train)
lm.score(result_test, y_test)
evaluate_model(lm, X_train=result_train, X_test=result_test)

[[192   7]
 [ 10 128]]
             precision    recall  f1-score   support

      procr       0.95      0.96      0.96       199
       work       0.95      0.93      0.94       138

avg / total       0.95      0.95      0.95       337



0.94955489614243327

In [74]:
len(list(filter(lambda a : a > 0, lm.coef_[0])))

55

In [75]:
features = result_train.columns[lm.coef_[0] > .03]

In [76]:
lm.fit(result_train[features],y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [77]:

predict = lm.predict(result_test)
actual = data.activity


ValueError: X has 107457 features per sample; expecting 46

In [None]:

coef = zip(result_test.columns, np.exp((lm.coef_[0])))

sorted(coef,key = lambda a: a[1], reverse=False)

In [61]:
# baseline
(y_train == 'procr').sum()/float(len(y_train))

0.58625730994152048

In [None]:
X_test['prob_work'] = lm.predict_proba(result_test)[:,1]*100

In [None]:
X_test['prob_work'] = X_test.prob_work.apply(int)
X_test['predict']  = lm.predict(result_test)

In [None]:
X_test[X_test.activity != X_test.predict]

### Analyzing Count vectorized results

In [None]:
temp = result_train[list(X_train.activity =='procr')]

In [None]:
X_train['python_count']  = list(result_train['python'])

## Executive Summary

The results from my initial run through have been extremely successful. I have a f7unctioning model with above a 95% accuracy. My base line is about 60% so this is a large improvement. I have used logistic regression with regularization to get my intial results, but I have some concerns about that approach.

I currently have 105000 features wiht non-zero coefficients. This is vastly more than the number of documents I have in my training set (~700). I have discussed several approaches to solving this issue. One, is that my model is surviving a train test split. It is accurate, despite having the potential to be over fitted. I think the reguralization is doing it's part. Nonetheless, I am going to try to reduce the number of features and see how much my accuracy is actually impacted.

Secondly I want to try different modeling methods. It has been suggested that I use and SVM. I will also try a random forest, given that they handle large numbers of features well.

Finally, I would like to do some more exploration of what features are correlated with what URLs. I have some artificat features that don't correlate with real words. I want to know if they only show up on certain websites, or if they exist accross multiple websites.

All in all, my model is in good shape. I have a strong predictability. I will spend some time doing additional exploration, but overall I think I am in good shape. I need to make some additional visualizations for my presentation (I have several in mind).