In [2]:
from pattern.vector import Document, Model, TFIDF, LEMMA, HIERARCHICAL

In [3]:
# The corpus is a file of 2999 presidential documents scraped from the
# American Presidency Project: http://www.presidency.ucsb.edu/
# Whitespace inside the docs was condensed with a regex beforehand, and
# the docs are separated from one another by newline characters.

# Read through a context manager so the handle is closed even when
# read() raises (the manual open/close pair leaked it in that case).
with open("presidocs.txt", "r") as fileObj:
    fileText = fileObj.read()

# One string per document; the count printed below should be 2999.
docStrings = fileText.split("\n")
print(len(docStrings))

2999


In [4]:
# Document() excludes stopwords by default; pass 'stopwords=True' as a
# named argument to keep them.
# Passing 'stemmer=LEMMA' or 'stemmer=STEMMER' stems the words, which can
# reduce dimensionality for clustering, at the price of a much slower
# Document conversion.

docs = []
for docString in docStrings:
    docs.append(Document(docString, stemmer=LEMMA))

In [6]:
# Show the top keywords of a few sample documents, one blank line apart.
for docIndex in (0, 5, 1500):
    # "%s \n" reproduces the "<keywords> <newline>" layout of `print x, '\n'`.
    print("%s \n" % (docs[docIndex].keywords(),))

[(0.03846153846153855, u'africa'), (0.021367521367521413, u'african'), (0.01709401709401713, u'agoa'), (0.01709401709401713, u'person'), (0.01709401709401713, u'united'), (0.014957264957264989, u'america'), (0.014957264957264989, u'help'), (0.012820512820512848, u'nation'), (0.012820512820512848, u'thank'), (0.012820512820512848, u'trade')] 

[(0.06230529595015576, u'nato'), (0.03115264797507788, u'threat'), (0.024922118380062305, u'disarm'), (0.021806853582554516, u'president'), (0.018691588785046728, u'alliance'), (0.018691588785046728, u'freedom'), (0.01557632398753894, u'prague'), (0.01557632398753894, u'q'), (0.012461059190031152, u'21st'), (0.012461059190031152, u'century')] 

[(0.022684310018903506, u'trade'), (0.018903591682419587, u'china'), (0.018903591682419587, u'issue'), (0.015122873345935671, u'senate'), (0.015122873345935671, u'world'), (0.013232514177693711, u'economy'), (0.013232514177693711, u'future'), (0.013232514177693711, u'person'), (0.013232514177693711, u'senat

In [4]:
# Build the model over every Document, using TFIDF as the term weight.
m = Model(weight=TFIDF, documents=docs)

In [6]:
# Cosine similarity for a few hand-picked pairs of documents
for left, right in ((0, 1), (5, 6), (1000, 2000)):
    print(m.similarity(docs[left], docs[right]))

0.0260771401337
0.22533937789
0.0


In [1]:
# K-means clustering: Model.cluster() called with its default arguments.
print(m.cluster())

In [None]:
# Oh no, that's taking forever... so let's reduce the model's dimensionality.
# numpy has to be installed (pip install numpy) for this call to work.

m.reduce()

In [None]:
# That also took forever... machine learning can be time consuming.
# Usually this kind of work is spread across multiple cores; doing it
# on a laptop is less than ideal. But if you're patient, it works!

# For the patient, there is hierarchical clustering. It can take a REALLY
# long time, even after the model's dimensionality has been reduced.
# (This model originally had 33,000 dimensions!)
# NOTE(review): the original note said hierarchical clustering, unlike
# k-means, is "guaranteed to produce an optimal solution" -- confirm
# against the Pattern documentation what guarantee is actually meant.

# HIERARCHICAL must be imported from pattern.vector alongside Model;
# without that import this cell fails with a NameError.
print(m.cluster(method=HIERARCHICAL, k=10))