In [1]:
from __future__ import division, print_function
from sklearn.externals.six.moves import html_parser
from glob import glob
import collections
import nltk
import os
import re

In [2]:
class ReutersParser(html_parser.HTMLParser):
    """ Utility class to parse a SGML file and yield documents one at 
        a time. 
    """
    def __init__(self, encoding='latin-1'):
        html_parser.HTMLParser.__init__(self)
        self._reset()
        self.encoding = encoding

    def handle_starttag(self, tag, attrs):
        method = 'start_' + tag
        getattr(self, method, lambda x: None)(attrs)

    def handle_endtag(self, tag):
        method = 'end_' + tag
        getattr(self, method, lambda: None)()

    def _reset(self):
        self.in_title = 0
        self.in_body = 0
        self.in_topics = 0
        self.in_topic_d = 0
        self.title = ""
        self.body = ""
        self.topics = []
        self.topic_d = ""

    def parse(self, fd):
        self.docs = []
        for chunk in fd:
            self.feed(chunk.decode(self.encoding))
            for doc in self.docs:
                yield doc
            self.docs = []
        self.close()

    def handle_data(self, data):
        if self.in_body:
            self.body += data
        elif self.in_title:
            self.title += data
        elif self.in_topic_d:
            self.topic_d += data

    def start_reuters(self, attributes):
        pass

    def end_reuters(self):
        self.body = re.sub(r'\s+', r' ', self.body)
        self.docs.append({'title': self.title,
                          'body': self.body,
                          'topics': self.topics})
        self._reset()

    def start_title(self, attributes):
        self.in_title = 1

    def end_title(self):
        self.in_title = 0

    def start_body(self, attributes):
        self.in_body = 1

    def end_body(self):
        self.in_body = 0

    def start_topics(self, attributes):
        self.in_topics = 1

    def end_topics(self):
        self.in_topics = 0

    def start_d(self, attributes):
        self.in_topic_d = 1

    def end_d(self):
        self.in_topic_d = 0
        self.topics.append(self.topic_d)
        self.topic_d = ""


In [3]:
def stream_reuters_documents(reuters_dir):
    """ Iterate over documents of the Reuters dataset.

    The Reuters archive will automatically be downloaded and uncompressed if
    the `data_path` directory does not exist.

    Documents are represented as dictionaries with 'body' (str),
    'title' (str), 'topics' (list(str)) keys.

    """
    parser = ReutersParser()
    for filename in glob(os.path.join(reuters_dir, "*.sgm")):
        for doc in parser.parse(open(filename, 'rb')):
            yield doc

In [4]:
def maybe_build_vocab(reuters_dir, vocab_file):
    vocab = collections.defaultdict(int)
    if os.path.exists(vocab_file):
        fvoc = open(vocab_file, "rb")
        for line in fvoc:
            word, idx = line.strip().split("\t")
            vocab[word] = int(idx)
        fvoc.close()
    else:
        counter = collections.Counter()
        num_docs_read = 0
        for doc in stream_reuters_documents(reuters_dir):
            if num_docs_read % 100 == 0:
                print("building vocab from {:d} docs".format(num_docs_read))
            topics = doc["topics"]
            if len(topics) == 0:
                continue
            title = doc["title"]
            body = doc["body"]
            title_body = ". ".join([title, body]).lower()
            for sent in nltk.sent_tokenize(title_body):
                for word in nltk.word_tokenize(sent):
                    counter[word] += 1
            for i, c in enumerate(counter.most_common(VOCAB_SIZE)):
                vocab[c[0]] = i + 1
            num_docs_read += 1
        print("vocab built from {:d} docs, complete"
            .format(num_docs_read))
        fvoc = open(vocab_file, "wb")
        for k in vocab.keys():
            fvoc.write("{:s}\t{:d}\n".format(k, vocab[k]))
        fvoc.close()
    return vocab

In [5]:
DATA_DIR = "/home/canwill/workspace/documentsimilarity/data/"
REUTERS_DIR = os.path.join(DATA_DIR, "reuters21578")
VOCAB_FILE = os.path.join(DATA_DIR, "vocab.txt")
VOCAB_SIZE = 5000
print(VOCAB_FILE)
print(REUTERS_DIR)

/home/canwill/workspace/documentsimilarity/data/vocab.txt
/home/canwill/workspace/documentsimilarity/data/reuters21578


In [6]:
vocab = maybe_build_vocab(REUTERS_DIR, VOCAB_FILE)
vocab

building vocab from 0 docs
building vocab from 100 docs
building vocab from 200 docs
building vocab from 300 docs
building vocab from 400 docs
building vocab from 500 docs
building vocab from 600 docs
building vocab from 700 docs
building vocab from 800 docs
building vocab from 800 docs
building vocab from 900 docs
building vocab from 1000 docs
building vocab from 1100 docs
building vocab from 1200 docs
building vocab from 1300 docs
building vocab from 1400 docs
building vocab from 1500 docs
building vocab from 1600 docs
building vocab from 1700 docs
building vocab from 1800 docs
building vocab from 1900 docs
building vocab from 2000 docs
building vocab from 2100 docs
building vocab from 2200 docs
building vocab from 2300 docs
building vocab from 2400 docs
building vocab from 2500 docs
building vocab from 2600 docs
building vocab from 2700 docs
building vocab from 2800 docs
building vocab from 2800 docs
building vocab from 2800 docs
building vocab from 2900 docs
building vocab from 300

defaultdict(int,
            {u'hallwood': 5000,
             u'barrages': 4987,
             u'multiplexor': 4998,
             u'four': 194,
             u'woods': 5000,
             u'travaux': 5000,
             u'francesco': 5000,
             u'amgd': 4997,
             u'cyprus': 5000,
             u'11,921,000': 4999,
             u'papandreou': 4999,
             u'looking': 1375,
             u'midafternoon': 4988,
             u'sl-1': 4989,
             u'granting': 5000,
             u'eligible': 2943,
             u'electricity': 2722,
             u'161.6': 4999,
             u'2,779,000': 4992,
             u'153.3': 5000,
             u'packers': 4993,
             u'hermann': 4993,
             u'lord': 5000,
             u'meadows': 4994,
             u'sinking': 2908,
             u'hormone': 5000,
             u'co-operation': 5000,
             u'45.0': 5000,
             u'45.7': 4695,
             u'45.6': 4996,
             u'45.5': 4997,
             u'heublei

In [7]:
ftext = open(os.path.join(DATA_DIR, "text.tsv"), "wb")
ftags = open(os.path.join(DATA_DIR, "tags.tsv"), "wb")
num_read = 0
for doc in stream_reuters_documents(REUTERS_DIR):
    # skip docs without specified topic
    topics = doc["topics"]
    if len(topics) == 0:
        continue
    title = doc["title"]
    body = doc["body"]
    num_read += 1
    # concatenate title and body and convert to list of word indexes
    title_body = ". ".join([title, body]).lower()
    title_body = re.sub("\n", "", title_body)
    title_body = title_body.encode("utf8").decode("ascii", "ignore")
    ftext.write("{:d}\t{:s}\n".format(num_read, title_body))
    ftags.write("{:d}\t{:s}\n".format(num_read, ",".join(topics)))

ftext.close()
ftags.close()

#### Source: src/tag-sims.py

In [None]:
from __future__ import division, print_function
from sklearn.feature_extraction.text import CountVectorizer
import os
import re

import dsutils

DATA_DIR = "/home/canwill/workspace/documentsimilarity/data/"
VECTORS_FILE = os.path.join(DATA_DIR, "tag-vecs.mtx")

tags = []
ftags = open(os.path.join(DATA_DIR, "tags.tsv"), "rb")
for line in ftags:
    docid, taglist = line.strip().split("\t")
    taglist = re.sub(",", " ", taglist)
    tags.append(taglist)
ftags.close()

cvec = CountVectorizer()
X = cvec.fit_transform(tags)

dsutils.save_vectors(X, VECTORS_FILE, is_sparse=True)

#### Source: src/wordcount-sims.py

In [26]:
from __future__ import division, print_function
from sklearn.feature_extraction.text import CountVectorizer
import os

import dsutils

DATA_DIR = "/home/canwill/workspace/documentsimilarity/data/"
MAX_FEATURES = 500
VECTORS_FILE = os.path.join(DATA_DIR, 
    "wordcount-{:d}-vecs.mtx".format(MAX_FEATURES))

texts = []
ftext = open(os.path.join(DATA_DIR, "text.tsv"), "rb")
for line in ftext:
    docid, text = line.strip().split("\t")
    texts.append(text)
ftext.close()

cvec = CountVectorizer(max_features=MAX_FEATURES,
                       stop_words="english", 
                       binary=True)
X = cvec.fit_transform(texts)

dsutils.save_vectors(X, VECTORS_FILE, is_sparse=True)
X

<19716x500 sparse matrix of type '<type 'numpy.int64'>'
	with 522487 stored elements in Compressed Sparse Row format>

#### Source: src/calc-pearson.py

In [None]:
from __future__ import division, print_function
from scipy import stats
import os
import time

import dsutils

DATA_DIR = "/home/canwill/workspace/documentsimilarity/data/"

VECTORIZER = "wordcount"
#VECTORIZER = "tfidf"
#VECTORIZER = "lsa"
#VECTORIZER = "glove"
#VECTORIZER = "w2v"

X_IS_SPARSE = True
Y_IS_SPARSE = True
#Y_IS_SPARSE = False

NUM_FEATURES = 10

XFILE = os.path.join(DATA_DIR, "tag-vecs.mtx")
YFILE = os.path.join(DATA_DIR, "{:s}-{:d}-vecs.{:s}"
    .format(VECTORIZER, NUM_FEATURES, 
            "mtx" if Y_IS_SPARSE else "csv"))

X = dsutils.load_vectors(XFILE, is_sparse=X_IS_SPARSE)
Y = dsutils.load_vectors(YFILE, is_sparse=Y_IS_SPARSE)

XD = dsutils.compute_cosine_sims(X, is_sparse=X_IS_SPARSE)
YD = dsutils.compute_cosine_sims(Y, is_sparse=Y_IS_SPARSE)

XDT = dsutils.get_upper_triangle(XD, is_sparse=X_IS_SPARSE)
YDT = dsutils.get_upper_triangle(YD, is_sparse=Y_IS_SPARSE)

corr, _ = stats.pearsonr(XDT, YDT)
print("Pearson correlation: {:.3f}".format(corr))

#### Source: src/tfidf-sims.py

In [28]:
from __future__ import division, print_function
from sklearn.feature_extraction.text import TfidfVectorizer
import os

import dsutils

DATA_DIR = "/home/canwill/workspace/documentsimilarity/data/"
MAX_FEATURES = 50
VECTORS_FILE = os.path.join(DATA_DIR, 
    "tfidf-{:d}-vecs.mtx".format(MAX_FEATURES))

texts = []
ftext = open(os.path.join(DATA_DIR, "text.tsv"), "rb")
for line in ftext:
    docid, text = line.strip().split("\t")
    texts.append(text)
ftext.close()

tvec = TfidfVectorizer(max_features=MAX_FEATURES,
                       min_df=0.1, sublinear_tf=True,
                       stop_words="english",
                       binary=True)
X = tvec.fit_transform(texts)

dsutils.save_vectors(X, VECTORS_FILE, is_sparse=True)
X.todense()

matrix([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.26718496,  0.        ,  0.34797382, ...,  0.        ,
          0.30896004,  0.23438765],
        ..., 
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.410122  , ...,  0.        ,
          0.36414036,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.45430823,  0.34465376]])