In [258]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from sklearn import model_selection
from sklearn import linear_model
from sklearn import ensemble
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

%matplotlib inline

In [259]:
# This is where I parse out text
def clean_soup(soup):
    for tag in soup.find_all(['script', 'style','meta']):
        tag.decompose()   
    return soup.get_text()


def find_between( s, first, last ):
    try:
        start = s.index( first ) + len( first )
        end = s.index( last, start )
        return s[start:end]
    except ValueError:
        return ""


In [260]:
import os

rootdir = '/home/roland/Workspace/Data/Procrastinating_HTML/'
data = pd.DataFrame(columns = ['activity', 'text', 'timestamp'])

URL_STRING = "__URL__: "
HTML_START = "<"
for subdir, dirs, files in os.walk(rootdir):
    i = 0
    for file in files:
        f = open(os.path.join(subdir, file))
        raw = f.read()
        url = ''
        
        if URL_STRING in raw:
            url = find_between(raw, URL_STRING, HTML_START)
            raw = raw.replace(url, '')
        
        # Below are character I am manually parsing out. There must exist a more efficient way to do this
        # but for now it runs fast enough.
        raw = raw.replace('\n', ' ')
        raw = raw.replace('\t', ' ')
        raw = raw.replace('\\n', ' ')
        raw = raw.replace('\\t', ' ')
        raw = raw.replace(URL_STRING, '')

        soup = BeautifulSoup(raw)
        data.set_value(i, 'text', clean_soup(soup))
        t = str(file).split('_')
        data.set_value(i, 'activity', t[0])
        timestamp = t[1].split('.')[0]
        data.set_value(i, 'timestamp', timestamp)
        data.set_value(i, 'url', url)
        i+=1
        f.close()




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [261]:
data=data[data.text != ""]

In [262]:
data.timestamp = data.timestamp.str.replace("\(1\)","")
data.timestamp = data.timestamp.str.replace("T","-")

In [306]:
X = data[['text','url']]

X_train, X_test, y_train, y_test = model_selection.train_test_split(data, data.activity,test_size=0.33, random_state=43)

In [307]:
def evaluate_model(model, X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred)
    
    print(cm)
    print(cr)
    
    return accuracy

### Working with lemmatizing

In [308]:
# I am not using any of these libraries yet
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')

### Below is model building

In [309]:
# replacing any empty urls with empty strings
data.url = data.url.str.replace('NA', "")

In [310]:
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer

#instantiate our model, countvectorizing works th best
cvec = CountVectorizer(stop_words='english')

#fit the count vectorizer to the data. This 'teaches' the count vectorizer the dictionary.
#cvec.fit(data.text)

In [311]:
# # Vectorizing my text
# result_train = pd.DataFrame(data=cvec.transform(X_train.text).todense(), columns=cvec.vocabulary_)
# result_test = pd.DataFrame(data=cvec.transform(X_test.text).todense(), columns=cvec.vocabulary_)

# #NOTE TO SELF

# # Try using fuzzy matching on URLs. Try using the partial matching thing. Then label encode the ones that match well.

In [333]:
# Vectorizing my text
result_train = pd.DataFrame(data=cvec.fit_transform(X_train.text).todense(), columns = cvec.vocabulary_)
result_test = pd.DataFrame(data=cvec.transform(X_test.text).todense(), columns= cvec.vocabulary_)

#NOTE TO SELF

# Try using fuzzy matching on URLs. Try using the partial matching thing. Then label encode the ones that match well.

In [334]:
set(result_train.columns) - set(result_test.columns)

set()

#### Making my features here. 
I have my vectorized text which works really well, but there is additional informaiton that I can engineer that might be useful for prediction. Below is where I am going to work thourgh that process.

In [335]:
lm = linear_model.LogisticRegression()
rf = ensemble.RandomForestClassifier()
# fitting my training data  and scoring with my test data
lm.fit(result_train,y_train)
lm.score(result_test, y_test)
evaluate_model(lm, X_train=result_train, X_test=result_test)

[[175   6]
 [  8 136]]
             precision    recall  f1-score   support

      procr       0.96      0.97      0.96       181
       work       0.96      0.94      0.95       144

avg / total       0.96      0.96      0.96       325



0.95692307692307688

In [336]:

predict = lm.predict(result_test)
actual = data.activity


In [337]:

coef = zip(result_test.columns, np.exp((lm.coef_[0])))

sorted(coef,key = lambda a: a[1], reverse=True)

[('123456789101112', 1.5030782244278027),
 ('1805', 1.2236868646314289),
 ('spacey', 1.1930074416362264),
 ('csoulr666', 1.168849680627585),
 ('61st', 1.1680991787213784),
 ('background2instagram', 1.1609333818514178),
 ('a3b0966', 1.1507351778717072),
 ('corraaaal', 1.1307716488366686),
 ('ucb', 1.1226528736040287),
 ('dashboard290', 1.121188690551755),
 ('usr', 1.1087638929433088),
 ('murlocs', 1.1048573721045738),
 ('braff', 1.1045683179905139),
 ('tangotiger', 1.1016686272892013),
 ('die4ever', 1.101661134620783),
 ('startups', 1.1010777942707792),
 ('bzzzzt', 1.1005131274527098),
 ('callsabstract', 1.1000895079882986),
 ('scripttext', 1.0963482960152977),
 ('caprikel', 1.0943962275311907),
 ('commentssharesavehidereport21101112game', 1.0936214569062455),
 ('prisontattoos', 1.093039202962242),
 ('lamprey', 1.092929885126964),
 ('points5509', 1.0924658452006402),
 ('hernandez', 1.0912919862497945),
 ('0375829', 1.0894421819203137),
 ('dmozspider', 1.0873318168549091),
 ('brokencompa

In [338]:
# baseline
(y_train == 'procr').sum()/float(len(y_train))

0.60030395136778114

In [339]:
sum(y_train == 'work')

263

### Analyzing Count vectorized results

In [340]:
temp = result_train[list(X_train.activity =='procr')]

In [341]:
X_train['python_count']  = list(result_train['python'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [360]:
temp = X_train[X_train.python_count > 0].sort_values('python_count', ascending = False)
list(temp[temp.activity == 'procr'].text)

733     AnyDice            AnyDice Dice Probability C...
950                                                  ...
Name: text, dtype: object

In [361]:
new_temp = pd.DataFrame(data=cvec.fit_transform(temp[temp.activity == 'procr'].iloc[0:2].text).todense(), columns=cvec.vocabulary_)

In [367]:
cvec.vocabulary_

{'anydice': 334,
 'dice': 515,
 'probability': 898,
 'calculator': 408,
 'articles': 343,
 'stay': 1028,
 'connected': 458,
 'twitter': 1111,
 'facebook': 589,
 'google': 653,
 'currently': 482,
 'playing': 881,
 'jasper': 717,
 'flick': 609,
 'output': 855,
 '3d4': 174,
 'view': 1140,
 'data': 497,
 'calculating': 407,
 '92': 287,
 '00': 0,
 '42': 180,
 '240': 126,
 '250': 129,
 '260': 133,
 '270': 135,
 '280': 139,
 '290': 141,
 '300': 143,
 '310': 145,
 '320': 150,
 '330': 153,
 '340': 156,
 '350': 158,
 '360': 162,
 '370': 164,
 '380': 166,
 '390': 171,
 '400': 176,
 '410': 178,
 '420': 181,
 '430': 184,
 '440': 186,
 '450': 188,
 '460': 190,
 '470': 194,
 '480': 196,
 '490': 198,
 '500': 202,
 '510': 205,
 '520': 206,
 '530': 210,
 '540': 214,
 '550': 218,
 '560': 220,
 '570': 222,
 '580': 225,
 '590': 227,
 '01': 3,
 '600': 228,
 '610': 230,
 '02': 6,
 '620': 231,
 '630': 234,
 '03': 7,
 '640': 235,
 '05': 9,
 '650': 237,
 '07': 14,
 '660': 238,
 '09': 16,
 '670': 242,
 '12': 45,

In [355]:
list(temp[temp.activity == 'procr'].iloc[0:1].text)

[" AnyDice            AnyDice Dice Probability Calculator   Articles      Stay Connected  Twitter,     Facebook,     Google+  Currently Playing     made by Jasper Flick     output 3d4   View       Data       ...calculating...   output 1 (92.00 / 9.42)#%240.00\xa0250.00\xa0260.00\xa0270.00\xa0280.00\xa0290.00\xa0300.00\xa0310.00\xa0320.00\xa0330.00\xa0340.00\xa0350.00\xa0360.00\xa0370.00\xa0380.00\xa0390.00\xa0400.00\xa0410.00\xa0420.00\xa0430.00\xa0440.00\xa0450.00\xa0460.00\xa0470.00\xa0480.00\xa0490.00\xa0500.00\xa0510.00\xa0520.00\xa0530.00\xa0540.00\xa0550.00\xa0560.00\xa0570.00\xa0580.00\xa0590.01\xa0600.01\xa0610.02\xa0620.02\xa0630.03\xa0640.05\xa0650.07\xa0660.09\xa0670.12\xa0680.16\xa0690.21\xa0700.28\xa0710.35\xa0720.45\xa0730.56\xa0740.69\xa0750.84\xa0761.01\xa0771.21\xa0781.42\xa0791.65\xa0801.90\xa0812.16\xa0822.42\xa0832.69\xa0842.96\xa0853.21\xa0863.45\xa0873.67\xa0883.85\xa0894.00\xa0904.12\xa0914.18\xa0924.21\xa0934.18\xa0944.12\xa0954.00\xa0963.85\xa0973.67\xa0983.45\