In [1]:
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint

In [2]:
import string
import collections

In [12]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/csb5t/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
def process_text(text):
    #text = text.translate(None, string.punctuation)
    tokens = word_tokenize(text)
 
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(t) for t in tokens]
 
    return stemmed_tokens

In [4]:
def cluster_texts(texts, clusters=10):
    """ tfidf and cluster texts using kmeans """
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 stop_words=stopwords.words('english'),
                                 max_df=0.8,
                                 min_df=0.1,
                                 lowercase=True)
 
    tfidf_model = vectorizer.fit_transform(texts)
    km_model = KMeans(n_clusters=clusters)
    km_model.fit(tfidf_model)
 
    clustering = collections.defaultdict(list)
 
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
 
    return clustering

In [6]:
import glob
import os

In [9]:
text_array = []
for fn in glob.glob("plain/*.txt"):
    with open (os.path.join(fn), 'r', encoding='latin-1') as f:
        text_array.append(f.read())

In [10]:
len(text_array)

1417

In [15]:
text_array[:5]

 'IN A CORNISH TOWNSHIP WITH OLD VOGUE FOLKIN A CORNISH TOWNSHIP WITH OLD VOGUE FOLK BY DOLLY PENTREATH ILLUSTRATED BY PERCY R. CRAFT % o it & o n T. FISHER UNWIN Paternoster Square 1893LIST OF ILLUSTRATIONS. â\x80\x94â\x99¦ l 1 AGE 1. Polvogue ..... Frontispiece. 2. Lord Respry ...... 14 3. Mar Teazer ...... 27 4. Woolly Woollaton . . . 32 5. Squire Johnnie Pencoose .... 36 6. â\x80\x98A Fine English Girl carries the Palm â\x80\x99 . . 43 7. Billy Pearce was there and heard every word . 56 8. â\x80\x98 â\x80\x99Tis a Savage Country\' â\x80\x99 .... 67 9. â\x80\x98Viâ\x80\x99let, She were more plagued like than the Christian â\x80\x99 . . . . . .76 10. â\x80\x98Moo-Sick, Beau-ti-ful Moo-Sickâ\x80\x99 ... 99 11. â\x80\x98 â\x80\x99Tis the Fashion to Smuggle â\x80\x99 . . .107 12. Miss Fanny had made a Lovely Neck of Corn . 111 13. â\x80\x98It is better to be Plain and Straightforward in Words,â\x80\x99 said Madame Pencoose . . . 120 14. Betty Neptune ...... 126 15. â\x80\x98Whatâ\x80\x9

In [13]:
pprint(dict(cluster_texts(text_array)))

  sorted(inconsistent))


{0: [6,
     12,
     15,
     26,
     27,
     64,
     66,
     68,
     76,
     99,
     105,
     112,
     144,
     146,
     147,
     148,
     149,
     154,
     159,
     160,
     166,
     172,
     176,
     187,
     197,
     199,
     223,
     225,
     227,
     232,
     234,
     237,
     239,
     245,
     251,
     263,
     280,
     283,
     296,
     321,
     333,
     345,
     346,
     370,
     374,
     378,
     413,
     415,
     429,
     430,
     431,
     434,
     446,
     455,
     468,
     521,
     527,
     537,
     568,
     569,
     575,
     615,
     616,
     618,
     625,
     627,
     638,
     663,
     676,
     687,
     697,
     701,
     708,
     717,
     726,
     735,
     742,
     744,
     755,
     784,
     789,
     808,
     825,
     841,
     852,
     853,
     858,
     865,
     867,
     875,
     879,
     880,
     882,
     883,
     886,
     888,
     898,
     903,
     911,
     937,
     939,
 

In [14]:
# Given a filename, extract just the book id
def get_book_id(fn):
    return os.path.split(fn)[1].split('_')[0].split('.')[0]

In [16]:
book_ids = {get_book_id(fn) for fn in glob.glob(os.path.join('plain', '*.txt'))}

In [17]:
list(book_ids)[:5]

['36105213320950',
 '36105213320620',
 '36105213329217',
 '36105213332914',
 '36105213324994']

In [None]:
# instead of directly using glob in the loop to create the texts, use the book_ids list created from the set so that we 
# can match index in id list and text list