## StumbleUpon Exercise

In [2]:
# Unicode Handling
from __future__ import unicode_literals
import pandas as pd
import json

data = pd.read_csv("../data/stumbleupon.tsv", sep='\t', encoding="utf-8")
data.head(2)

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,...,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,4042,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,...,1,1,24,0,5424,170,8,0.152941,0.07913,0
1,http://www.popsci.com/technology/article/2012-...,8471,"{""title"":""The Fully Electronic Futuristic Star...",recreation,0.574147,3.677966,0.508021,0.28877,0.213904,0.144385,...,1,1,40,0,4973,187,9,0.181818,0.125448,1


## Predicting "Greenness" Of Content

This dataset comes from [stumbleupon](https://www.stumbleupon.com/), a web page recommender.  

A description of the columns is below

FieldName|Type|Description
---------|----|-----------
url|string|Url of the webpage to be classified
title|string|Title of the article
body|string|Body text of article
urlid|integer| StumbleUpon's unique identifier for each url
boilerplate|json|Boilerplate text
alchemy_category|string|Alchemy category (per the publicly available Alchemy API found at www.alchemyapi.com)
alchemy_category_score|double|Alchemy category score (per the publicly available Alchemy API found at www.alchemyapi.com)
avglinksize| double|Average number of words in each link
commonlinkratio_1|double|# of links sharing at least 1 word with 1 other links / # of links
commonlinkratio_2|double|# of links sharing at least 1 word with 2 other links / # of links
commonlinkratio_3|double|# of links sharing at least 1 word with 3 other links / # of links
commonlinkratio_4|double|# of links sharing at least 1 word with 4 other links / # of links
compression_ratio|double|Compression achieved on this page via gzip (measure of redundancy)
embed_ratio|double|Count of number of <embed> usage
frameBased|integer (0 or 1)|A page is frame-based (1) if it has no body markup but have a frameset markup
frameTagRatio|double|Ratio of iframe markups over total number of markups
hasDomainLink|integer (0 or 1)|True (1) if it contains an <a> with an url with domain
html_ratio|double|Ratio of tags vs text in the page
image_ratio|double|Ratio of <img> tags vs text in the page
is_news|integer (0 or 1) | True (1) if StumbleUpon's news classifier determines that this webpage is news
lengthyLinkDomain| integer (0 or 1)|True (1) if at least 3 <a> 's text contains more than 30 alphanumeric characters
linkwordscore|double|Percentage of words on the page that are in hyperlink's text
news_front_page| integer (0 or 1)|True (1) if StumbleUpon's news classifier determines that this webpage is front-page news
non_markup_alphanum_characters|integer| Page's text's number of alphanumeric characters
numberOfLinks|integer Number of <a>|markups
numwords_in_url| double|Number of words in url
parametrizedLinkRatio|double|A link is parametrized if it's url contains parameters or has an attached onClick event
spelling_errors_ratio|double|Ratio of words not found in wiki (considered to be a spelling mistake)
label|integer (0 or 1)|User-determined label. Either evergreen (1) or non-evergreen (0); available for train.tsv only

### Data Cleaning

In [9]:
d = {'title': 'IBM'}
#d['titles']
d.get('titles', 'no text')

'no text'

In [4]:
data['title'] = data.boilerplate.map(lambda x: json.loads(x).get('title', ''))
data['body'] = data.boilerplate.map(lambda x: json.loads(x).get('body', ''))
data.head(2)

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,...,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label,title,body
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,4042,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,...,24,0,5424,170,8,0.152941,0.07913,0,IBM Sees Holographic Calls Air Breathing Batte...,A sign stands outside the International Busine...
1,http://www.popsci.com/technology/article/2012-...,8471,"{""title"":""The Fully Electronic Futuristic Star...",recreation,0.574147,3.677966,0.508021,0.28877,0.213904,0.144385,...,40,0,4973,187,9,0.181818,0.125448,1,The Fully Electronic Futuristic Starting Gun T...,And that can be carried on a plane without the...


 ### Count Vectorizer

In [11]:
# fill in missing data
data['title'] = data['title'].fillna('')


from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(
    max_features = 1000, 
    ngram_range=(1, 2), 
    stop_words='english',
    binary=True
)

# Use `fit` to learn the vocabulary of the titles
vectorizer.fit(data['title'])

# Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X = vectorizer.transform(data['title'])

In [20]:
X

<7395x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 25164 stored elements in Compressed Sparse Row format>

### Build a Logistic Regression model to predict evergreeness of a website using the title features

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics

# set up features and response
X = data['title']
y = data['label']

# create a train test split
X_train, X_test, y_train, y_test = train_test_split(X,y)

# instantiate a vectorizer
vectorizer = CountVectorizer()

# Use `fit` to learn the vocabulary of the titles
# Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X_train_vect = vectorizer.fit_transform(X_train)

# instantiate a model
logreg = LogisticRegression()
logreg.fit(X_train_vect, y_train)

# create predictions
X_test_vect = vectorizer.transform(X_test)
pred = logreg.predict(X_test_vect)

# evaluate the predictions
metrics.roc_auc_score(y_test, pred)

0.731184648971619

In [17]:
pred

array([0, 1, 0, ..., 1, 1, 0], dtype=int64)

### Exercise: Build a Naive Bayes model to predict evergreeness of a website using the title

In [23]:
from sklearn.naive_bayes import MultinomialNB

In [27]:
# set up features and response
X = data['title']
y = data['label']

# create a train test split
X_train, X_test, y_train, y_test = train_test_split(X,y)

# instantiate a vectorizer
vectorizer = CountVectorizer()

# Use `fit` to learn the vocabulary of the titles
# Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X_train_vect = vectorizer.fit_transform(X_train)

# instantiate a model
nb = MultinomialNB()
nb.fit(X_train_vect, y_train)

# create predictions
X_test_vect = vectorizer.transform(X_test)
pred = nb.predict(X_test_vect)

# evaluate the predictions
metrics.roc_auc_score(y_test, pred)

0.7572265977203694

In [26]:
X_test_vect

<1849x4530 sparse matrix of type '<class 'numpy.int64'>'
	with 12685 stored elements in Compressed Sparse Row format>

In [22]:
X_train_vect

<5546x8918 sparse matrix of type '<class 'numpy.int64'>'
	with 38137 stored elements in Compressed Sparse Row format>

 ### Exercise: Build a Naive Bayes model to predict evergreeness of a website using the body features

In [31]:
%%time

# set up features and response
data['body'] = data['body'].fillna('')
X = data['body']
y = data['label']

# create a train test split
X_train, X_test, y_train, y_test = train_test_split(X,y)

# instantiate a vectorizer
vectorizer = CountVectorizer()

# Use `fit` to learn the vocabulary of the titles
# Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X_train_vect = vectorizer.fit_transform(X_train)

# instantiate a model
nb = MultinomialNB()
nb.fit(X_train_vect, y_train)

# create predictions
X_test_vect = vectorizer.transform(X_test)
pred = nb.predict(X_test_vect)

# evaluate the predictions
print(metrics.roc_auc_score(y_test, pred))

0.7825246206270302
Wall time: 6.45 s


57

 ### Exercise: Use `TfIdfVectorizer` instead of `CountVectorizer` - is this an improvement?

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [35]:
# set up features and response
X = data['title']
y = data['label']

# create a train test split
X_train, X_test, y_train, y_test = train_test_split(X,y)

# instantiate a vectorizer
vectorizer = TfidfVectorizer(min_df=2)

# Use `fit` to learn the vocabulary of the titles
# Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X_train_vect = vectorizer.fit_transform(X_train)

# instantiate a model
nb = MultinomialNB()
nb.fit(X_train_vect, y_train)

# create predictions
X_test_vect = vectorizer.transform(X_test)
pred = nb.predict(X_test_vect)

# evaluate the predictions
metrics.roc_auc_score(y_test, pred)

0.7531882006647512

### Feature Engineering

##### Create a feature for the title containing 'recipe'. Is the % of evegreen websites higher or lower on pages that have recipe in the the title?

In [17]:
# Option 1: Create a function to check for this
def has_recipe(text_in):
    try:
        if 'recipe' in str(text_in).lower():
            return 1
        else:
            return 0
    except: 
        return 0
        
data['recipe'] = data['title'].map(has_recipe)


# Option 2: lambda functions
data['recipe'] = data['title'].map(lambda t: 1 if 'recipe' in str(t).lower() else 0)


# Option 3: string functions
data['recipe'] = data['title'].str.contains('recipe')

data.head(2)

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,...,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label,title,body,recipe
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,4042,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,...,0,5424,170,8,0.152941,0.07913,0,IBM Sees Holographic Calls Air Breathing Batte...,A sign stands outside the International Busine...,False
1,http://www.popsci.com/technology/article/2012-...,8471,"{""title"":""The Fully Electronic Futuristic Star...",recreation,0.574147,3.677966,0.508021,0.28877,0.213904,0.144385,...,0,4973,187,9,0.181818,0.125448,1,The Fully Electronic Futuristic Starting Gun T...,And that can be carried on a plane without the...,False


In [None]:
## TODO

### Improve the model

1. Try different models (e.g. KNeighborsClassifier)
2. Tune the model hyperparameters
3. Try new features

In [None]:
## TODO