In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from sklearn import model_selection
from sklearn import linear_model
from sklearn import ensemble
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

%matplotlib inline

In [2]:
# This is where I parse out text
def clean_soup(soup):
    for tag in soup.find_all(['script', 'style','meta']):
        tag.decompose()   
    return soup.get_text()


def find_between( s, first, last ):
    try:
        start = s.index( first ) + len( first )
        end = s.index( last, start )
        return s[start:end]
    except ValueError:
        return ""


In [None]:
import os

rootdir = '/home/roland/Workspace/Data/Procrastinating_HTML/'
data = pd.DataFrame(columns = ['activity', 'text', 'timestamp'])

URL_STRING = "__URL__: "
HTML_START = "<"
for subdir, dirs, files in os.walk(rootdir):
    i = 0
    for file in files:
        f = open(os.path.join(subdir, file))
        raw = f.read()
        url = ''
        
        if URL_STRING in raw:
            url = find_between(raw, URL_STRING, HTML_START)
            raw = raw.replace(url, '')
        
        # Below are character I am manually parsing out. There must exist a more efficient way to do this
        # but for now it runs fast enough.
        raw = raw.replace('\n', ' ')
        raw = raw.replace('\t', ' ')
        raw = raw.replace('\\n', ' ')
        raw = raw.replace('\\t', ' ')
        raw = raw.replace(URL_STRING, '')

        soup = BeautifulSoup(raw)
        data.set_value(i, 'text', clean_soup(soup))
        t = str(file).split('_')
        data.set_value(i, 'activity', t[0])
        timestamp = t[1].split('.')[0]
        data.set_value(i, 'timestamp', timestamp)
        data.set_value(i, 'url', url)
        i+=1
        f.close()




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [None]:
data=data[data.text != ""]

In [None]:
data.timestamp = data.timestamp.str.replace("\(1\)","")
data.timestamp = data.timestamp.str.replace("T","-")

In [None]:
X = data[['text','url']]

X_train, X_test, y_train, y_test = model_selection.train_test_split(data, data.activity,test_size=0.33, random_state=43)

In [None]:
def evaluate_model(model, X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred)
    
    print(cm)
    print(cr)
    
    return accuracy

### Working with lemmatizing

In [None]:
# I am not using any of these libraries yet
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')

### Below is model building

In [None]:
# replacing any empty urls with empty strings
data.url = data.url.str.replace('NA', "")

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer

#instantiate our model, countvectorizing works th best
cvec = CountVectorizer(stop_words='english')

#fit the count vectorizer to the data. This 'teaches' the count vectorizer the dictionary.
#cvec.fit(data.text)

In [None]:
# # Vectorizing my text
# result_train = pd.DataFrame(data=cvec.transform(X_train.text).todense(), columns=cvec.vocabulary_)
# result_test = pd.DataFrame(data=cvec.transform(X_test.text).todense(), columns=cvec.vocabulary_)

# #NOTE TO SELF

# # Try using fuzzy matching on URLs. Try using the partial matching thing. Then label encode the ones that match well.

In [None]:
# Vectorizing my text
result_train = pd.DataFrame(data=cvec.fit_transform(X_train.text).todense(), columns = cvec.get_feature_names())
result_test = pd.DataFrame(data=cvec.transform(X_test.text).todense(), columns= cvec.get_feature_names())

#NOTE TO SELF

# Try using fuzzy matching on URLs. Try using the partial matching thing. Then label encode the ones that match well.

In [None]:
set(result_train.columns) - set(result_test.columns)

len(result_train.columns)

#### Making my features here. 
I have my vectorized text which works really well, but there is additional informaiton that I can engineer that might be useful for prediction. Below is where I am going to work thourgh that process.

In [None]:
lm = linear_model.LogisticRegression()
rf = ensemble.RandomForestClassifier()
# fitting my training data  and scoring with my test data
lm.fit(result_train,y_train)
lm.score(result_test, y_test)
evaluate_model(lm, X_train=result_train, X_test=result_test)

In [None]:

predict = lm.predict(result_test)
actual = data.activity


In [None]:

coef = zip(result_test.columns, np.exp((lm.coef_[0])))

sorted(coef,key = lambda a: a[1], reverse=False)

In [None]:
# baseline
(y_train == 'procr').sum()/float(len(y_train))

In [None]:
X_test['prob_work'] = lm.predict_proba(result_test)[:,1]*100

In [None]:
X_test['prob_work'] = X_test.prob_work.apply(int)
X_test['predict']  = lm.predict(result_test)

In [None]:
X_test[X_test.activity != X_test.predict]

### Analyzing Count vectorized results

In [None]:
temp = result_train[list(X_train.activity =='procr')]

In [None]:
X_train['python_count']  = list(result_train['python'])

In [412]:
temp = X_train[X_train.python_count > 0].sort_values('python_count', ascending = False)
list(temp[temp.activity == 'procr'].iloc[0:1].text)

[" AnyDice            AnyDice Dice Probability Calculator   Articles      Stay Connected  Twitter,     Facebook,     Google+  Currently Playing     made by Jasper Flick     output 3d4   View       Data       ...calculating...   output 1 (92.00 / 9.42)#%240.00\xa0250.00\xa0260.00\xa0270.00\xa0280.00\xa0290.00\xa0300.00\xa0310.00\xa0320.00\xa0330.00\xa0340.00\xa0350.00\xa0360.00\xa0370.00\xa0380.00\xa0390.00\xa0400.00\xa0410.00\xa0420.00\xa0430.00\xa0440.00\xa0450.00\xa0460.00\xa0470.00\xa0480.00\xa0490.00\xa0500.00\xa0510.00\xa0520.00\xa0530.00\xa0540.00\xa0550.00\xa0560.00\xa0570.00\xa0580.00\xa0590.01\xa0600.01\xa0610.02\xa0620.02\xa0630.03\xa0640.05\xa0650.07\xa0660.09\xa0670.12\xa0680.16\xa0690.21\xa0700.28\xa0710.35\xa0720.45\xa0730.56\xa0740.69\xa0750.84\xa0761.01\xa0771.21\xa0781.42\xa0791.65\xa0801.90\xa0812.16\xa0822.42\xa0832.69\xa0842.96\xa0853.21\xa0863.45\xa0873.67\xa0883.85\xa0894.00\xa0904.12\xa0914.18\xa0924.21\xa0934.18\xa0944.12\xa0954.00\xa0963.85\xa0973.67\xa0983.45\

In [455]:
cvec.fit((temp[temp.activity == 'procr'].iloc[0:8].text))
new_temp = pd.DataFrame(data=cvec.transform(temp[temp.activity == 'procr'].iloc[0:8].text).todense(), columns = cvec.get_feature_names())

In [457]:
new_temp['anydice']

0    4
1    0
2    0
3    0
4    0
5    0
6    0
7    0
Name: anydice, dtype: int64

In [454]:
list(temp[temp.activity == 'procr'].iloc[0:1].text)

[" AnyDice            AnyDice Dice Probability Calculator   Articles      Stay Connected  Twitter,     Facebook,     Google+  Currently Playing     made by Jasper Flick     output 3d4   View       Data       ...calculating...   output 1 (92.00 / 9.42)#%240.00\xa0250.00\xa0260.00\xa0270.00\xa0280.00\xa0290.00\xa0300.00\xa0310.00\xa0320.00\xa0330.00\xa0340.00\xa0350.00\xa0360.00\xa0370.00\xa0380.00\xa0390.00\xa0400.00\xa0410.00\xa0420.00\xa0430.00\xa0440.00\xa0450.00\xa0460.00\xa0470.00\xa0480.00\xa0490.00\xa0500.00\xa0510.00\xa0520.00\xa0530.00\xa0540.00\xa0550.00\xa0560.00\xa0570.00\xa0580.00\xa0590.01\xa0600.01\xa0610.02\xa0620.02\xa0630.03\xa0640.05\xa0650.07\xa0660.09\xa0670.12\xa0680.16\xa0690.21\xa0700.28\xa0710.35\xa0720.45\xa0730.56\xa0740.69\xa0750.84\xa0761.01\xa0771.21\xa0781.42\xa0791.65\xa0801.90\xa0812.16\xa0822.42\xa0832.69\xa0842.96\xa0853.21\xa0863.45\xa0873.67\xa0883.85\xa0894.00\xa0904.12\xa0914.18\xa0924.21\xa0934.18\xa0944.12\xa0954.00\xa0963.85\xa0973.67\xa0983.45\