Skip to content

Commit

Permalink
use edit distance feature
Browse files Browse the repository at this point in the history
  • Loading branch information
tpeng committed Jan 17, 2014
1 parent bdc96fd commit 371ebaf
Show file tree
Hide file tree
Showing 13 changed files with 2,317 additions and 72 deletions.
39 changes: 0 additions & 39 deletions README.md

This file was deleted.

41 changes: 41 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
Webpager
========

A simple library to classify whether an anchor on an HTML page is a pagination link or not.

Installation
========

Clone the repository, then install package requirements
(package requires lxml, scikit-learn)::

$ pip install -r requirements.txt

then install package itself::

$ python setup.py install

Usage
========
Get an HTML page from somewhere::

>>> from urllib import urlopen
>>> url = 'http://www.tripadvisor.com/Restaurant_Review-g294217-d3639657-Reviews-Trattoria_Caffe_Monteverdi-Hong_Kong.html'
>>> html = urlopen(url).read()

Load the web pager and classify the page's anchors::

>>> from urlparse import urljoin
>>> from webpager import WebPager
>>> webpager = WebPager()
>>> for anchor, label in webpager.paginate(html, url):
...     if label:
...         print urljoin(url, anchor.get('href'))

http://www.tripadvisor.com/Restaurant_Review-g294217-d3639657-Reviews-or10-Trattoria_Caffe_Monteverdi-Hong_Kong.html#REVIEWS
http://www.tripadvisor.com/Restaurant_Review-g294217-d3639657-Reviews-or40-Trattoria_Caffe_Monteverdi-Hong_Kong.html#REVIEWS
http://www.tripadvisor.com/Restaurant_Review-g294217-d3639657-Reviews-or10-Trattoria_Caffe_Monteverdi-Hong_Kong.html#REVIEWS


Training
========
See ``train.py`` for more details.
18 changes: 18 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,20 @@
from setuptools import setup, find_packages
from distutils.extension import Extension
try:
from Cython.Distutils import build_ext
except ImportError:
use_cython = False
else:
use_cython = True

cmdclass = {}
ext_modules = []

if use_cython:
ext_modules.append(Extension("webpager.levenshtein_cython", ['webpager/levenshtein_cython.pyx']))
cmdclass.update({'build_ext': build_ext})
else:
ext_modules.append(Extension("webpager.levenshtein_cython", ['webpager/levenshtein_cython.c']))

setup(name='webpager',
version='0.1',
Expand All @@ -11,4 +27,6 @@
package_data = {
'webpager.models': ['*.pkl'],
},
cmdclass=cmdclass,
ext_modules=ext_modules
)
24 changes: 15 additions & 9 deletions train.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from os.path import join
import posixpath

from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score
Expand All @@ -15,20 +16,26 @@ def tagged_data(folder):
with open(path, 'r') as f:
yield path, f.read()

def get_original_urls(fname):
    """Read one URL per line from *fname* and return a mapping from the
    1-based line number (as a string, matching the corpus file basenames
    such as ``1.html``) to the stripped URL.
    """
    # Use a context manager so the file handle is closed deterministically;
    # the original `open(fname)` inside the expression was never closed.
    with open(fname) as f:
        return {str(i): line.strip() for i, line in enumerate(f, 1)}

if __name__ == '__main__':

htmlFeatGen = HtmlFeaturesExtractor()
anchorFeatGen = FeatureUnion(AnchorTransformers)

anchors = []
base_urls = get_original_urls('corpus/list.txt')
documents = []
labels = []

for _, html in tagged_data('corpus/annotated'):
anchor, label = htmlFeatGen.fit_transform(html, encoding='utf8')
anchors.extend(anchor)
labels.extend(label)
for path, html in tagged_data('corpus/annotated'):
_id = posixpath.split(path)[1].split('.')[0]
anchors_, labels_ = htmlFeatGen.fit_transform(html, baseurl=base_urls[_id], encoding='utf8')
documents.extend([(a, base_urls[_id]) for a in anchors_])
labels.extend(labels_)

train_anchors, test_anchors, train_labels, test_labels = train_test_split(anchors, labels, test_size=0.25, random_state=42)
train_documents = anchorFeatGen.fit_transform(train_anchors)
train_anchors, test_anchors, train_labels, test_labels = train_test_split(documents, labels, \
test_size=0.25, random_state=42)
train_documents = anchorFeatGen.fit_transform(train_anchors, y=train_labels)
test_documents = anchorFeatGen.transform(test_anchors)

clf = LogisticRegression(tol=1e-8, penalty='l2', C=7, class_weight='auto')
Expand All @@ -51,6 +58,5 @@ def show_most_informative_features(vectorizer, clf, n=20):

show_most_informative_features(anchorFeatGen, clf)

joblib.dump(htmlFeatGen, join('webpager', 'models', 'htmlFeatGen.joblib.pkl'), compress=9)
joblib.dump(anchorFeatGen, join('webpager', 'models', 'anchorFeatGen.joblib.pkl'), compress=9)
joblib.dump(clf, join('webpager', 'models', 'clf.joblib.pkl'), compress=9)
17 changes: 7 additions & 10 deletions webpager/__init__.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,14 @@
from .models import get_models

from .features import HtmlFeaturesExtractor

class WebPager(object):
def __init__(self):
self.htmlFeatGen, self.anchorFeatGen, self.clf = get_models()
self.anchorFeatGen, self.clf = get_models()
self.htmlFeatGen = HtmlFeaturesExtractor()

def paginate(self, html):
anchors, _ = self.htmlFeatGen.fit_transform(html)
documents = self.anchorFeatGen.transform(anchors)
def paginate(self, html, base_url):
anchors, _ = self.htmlFeatGen.fit_transform(html, base_url)
documents = self.anchorFeatGen.transform([(anchor, base_url) for anchor in anchors])
labels = self.clf.predict(documents)

pages = []
for anchor, label in zip(anchors, labels):
pages.append((anchor, label))

return pages
return zip(anchors, labels)
41 changes: 32 additions & 9 deletions webpager/features.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import lxml
from lxml.html import tostring
from lxml.html.clean import Cleaner

import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from .functions import parent_tag, block_length, number_pattern
from .functions import parent_tag, block_length, number_pattern, url_edit_distance
from .preprocess import Tagset


Expand All @@ -23,10 +26,12 @@
def tokenize(text):
    """Whitespace-tokenize *text*; runs of whitespace collapse to one split."""
    tokens = text.split()
    return tokens

def get_text(x):
    """Return the anchor element's text from an ``(anchor, base_url)`` pair."""
    return x[0].text

def get_attr_text(x):
    """Concatenate the anchor's ``class`` and ``id`` attribute values
    (empty string for a missing attribute) from an ``(anchor, base_url)`` pair.
    """
    anchor = x[0]
    return ''.join(anchor.get(name, '') for name in ('class', 'id'))

default_funcs = (parent_tag, block_length, number_pattern)
Expand All @@ -52,18 +57,21 @@ def clean_html(cls, html, encoding=None):
def _parse_html(self, html, encoding=None):
return self.clean_html(html, encoding)

def fit_transform(self, X, y=None, encoding=None):
def fit_transform(self, X, baseurl, y=None, encoding=None):
"""
Convert the HTML data :param:X to list of the features.
:param:y is ignored.
"""
html = self.tagset.encode_tags(X)
doc = self.clean_html(html, encoding)
doc.make_links_absolute(baseurl)

anchors = []
labels = []
for anchor in doc.iter('a'):
tokens = self.tokenize(anchor.text or '')
no_tag_tokens = [token for token in tokens if not (self.tagset.start_tag_or_none(token) or self.tagset.end_tag_or_none(token))]
no_tag_tokens = [token for token in tokens if not \
(self.tagset.start_tag_or_none(token) or self.tagset.end_tag_or_none(token))]
anchor.text = u" " .join(no_tag_tokens)
anchors.append(anchor)
labels.append(1 if len(tokens) != len(no_tag_tokens) else 0)
Expand Down Expand Up @@ -97,7 +105,7 @@ class AnchorTextTransformer(BaseEstimator, TransformerMixin):
"""
Extract the text features for anchors.
"""
def __init__(self, get_text = lambda x: x.text):
def __init__(self, get_text):
self._get_text = get_text
self._vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 5), min_df=1, binary=True)

Expand All @@ -112,6 +120,21 @@ def transform(self, X):
texts = [self._get_text(x) for x in X]
return self._vectorizer.transform(texts)

AnchorTransformers = [('anchor_text', AnchorTextTransformer(get_anchor_text)),
('anchor_class_id', AnchorTextTransformer(get_anchor_attr_text)),
('anchor_misc', AnchorContextTransformer(default_funcs))]
class AnchorEditDistanceTransformer(BaseEstimator, TransformerMixin):
    """Produce a single numeric feature per sample: the normalized edit
    distance between an anchor's href and the page's base URL (via
    ``url_edit_distance``). Stateless — fitting learns nothing.
    """

    def get_feature_names(self):
        return np.array(['edit_distance'])

    def fit_transform(self, X, y=None):
        # X is an iterable of (anchor, base_url) pairs; emit an (n, 1) column.
        column = np.array([url_edit_distance(pair) for pair in X])
        return column.reshape(-1, 1)

    def transform(self, X):
        # No fitted state, so transforming is identical to fit_transform.
        return self.fit_transform(X)

AnchorTransformers = [('anchor_text', AnchorTextTransformer(get_text)),
('anchor_class_id', AnchorTextTransformer(get_attr_text)),
('anchor_misc', AnchorContextTransformer(default_funcs)),
('anchor_edit_distance', AnchorEditDistanceTransformer())]
16 changes: 13 additions & 3 deletions webpager/functions.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from .levenshtein_cython import levenshtein_distance

def block_length(anchor):
def block_length(x):
anchor, _ = x
tokens = anchor.text.split()
if len(tokens) == 1:
bl = '1'
Expand All @@ -13,10 +15,12 @@ def block_length(anchor):
bl = 'large'
return {'block_length': bl}

def parent_tag(x):
    """Feature dict holding the tag name of the anchor's parent element."""
    element = x[0]
    return {'parent_tag': element.getparent().tag}

def number_pattern(anchor):
def number_pattern(x):
anchor, _ = x
tokens = anchor.text.split()
digits = [1 for token in tokens if token.isdigit()]
if len(digits) == len(tokens):
Expand All @@ -27,3 +31,9 @@ def number_pattern(anchor):
np = 'no'
return {'number_pattern': np}

def url_edit_distance(x):
    """Normalized Levenshtein distance between the anchor's ``href`` and the
    page's base URL, given an ``(anchor, base_url)`` pair.

    Returns a float in [0, 1]. Returns 0.0 when both strings are empty —
    the original divided by ``max(len(baseurl), len(href))``, which raises
    ``ZeroDivisionError`` for an anchor with no href on an empty base URL.
    """
    anchor, baseurl = x
    href = anchor.get('href', '')
    longest = max(len(baseurl), len(href))
    if not longest:
        # Two empty strings are identical: distance 0.
        return 0.0
    d = levenshtein_distance(baseurl, href)
    # Normalize by the longer string so the feature is scale-independent.
    return float(d) / longest
Loading

0 comments on commit 371ebaf

Please sign in to comment.