In [1]:
import os

# Directory holding the corpus as ".txt" files.
data_dir = "/dev/data/gut/"

# os.listdir() returns entries in arbitrary order; sorting makes the
# position-based filtering and the fragment order in later cells repeatable.
files = sorted(os.listdir(data_dir))
# Keep only ".txt" files whose name contains "__"; a later cell splits the
# label on "__" and takes the part before it as the author.
files = [x for x in files if x.endswith(".txt") and "__" in x]

In [2]:
# Read every selected file once; texts[k] and labels[k] describe the same file.
texts, labels = [], []

for file_name in files:
    book_path = os.path.join(data_dir, file_name)
    with open(book_path, encoding="ISO-8859-1") as handle:
        raw_text = handle.read()
    # Skip the first 1000 characters of every file
    # (presumably front matter such as a header -- TODO confirm).
    texts.append(raw_text[1000:])
    # Label = file name without its last four characters (the ".txt" suffix).
    labels.append(file_name[:-4])

In [3]:
# (length, position) pairs in ascending order (ties broken by position);
# keep the 36 smallest, i.e. the 36 shortest texts.
shortest = sorted(
    (len(body), position) for position, body in enumerate(texts)
)[:36]

In [4]:
# Positions of the shortest texts, held in a set for fast membership tests.
shortest_is = {position for _length, position in shortest}

In [5]:
# Drop the entries at the positions collected in shortest_is. The same filter
# is applied to both lists so texts[k] and labels[k] keep describing one file.
# NOTE(review): not idempotent -- re-running this cell removes further entries.
texts = [body for position, body in enumerate(texts) if position not in shortest_is]
labels = [label for position, label in enumerate(labels) if position not in shortest_is]

In [6]:
def get_chunks(l, n):
    """Lazily split a sliceable sequence into consecutive pieces of size ``n``.

    Values of ``n`` below 1 are treated as 1. The final piece is shorter when
    ``len(l)`` is not a multiple of the step. Returns a generator of slices.
    """
    step = max(1, n)
    starts = range(0, len(l), step)
    return (l[start:start + step] for start in starts)

In [9]:
# Split every text into 1000-character fragments; each fragment inherits the
# label of the text it was cut from. The last fragment of a text can be
# shorter than 1000 characters.
text_fragments = []
label_fragments = []

for text, label in zip(texts, labels):
    for chunk in get_chunks(text, 1000):
        text_fragments.append(chunk)
        label_fragments.append(label)
        

In [10]:
# Total number of fragments produced across all texts.
len(text_fragments)

1185868

In [11]:
import numpy as np
from random import Random, shuffle  # shuffle: original import of this cell, kept

# Fixed seed so the permutation is identical on every run.
SHUFFLE_SEED = 0

indices = list(range(len(text_fragments)))
# A dedicated Random instance keeps the seeding local instead of touching the
# module-level generator that a bare shuffle() call would use.
Random(SHUFFLE_SEED).shuffle(indices)

In [12]:
# dtype=object stores references to the existing Python strings. The default
# would infer a fixed-width unicode dtype (4 bytes per character, padded to the
# longest fragment) -- several GB for ~1.2M fragments of 1000 characters --
# and the fancy-indexing step that follows would copy that buffer again.
text_fragments = np.array(text_fragments, dtype=object)
# Labels are short, so the default fixed-width string dtype is kept.
label_fragments = np.array(label_fragments)

In [13]:
# Reorder fragments and labels with the same permutation so pairs stay aligned.
text_fragments, label_fragments = text_fragments[indices], label_fragments[indices]

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
%%time
# Character-level TF-IDF over 1- to 3-grams; n-grams that occur in fewer than
# 10 fragments are left out of the vocabulary.
# NOTE(review): the vectorizer is fitted on all fragments before the train/test
# split made in a later cell, so vocabulary and IDF weights also see test data.
vectorizer = TfidfVectorizer(
    analyzer='char',
    min_df=10,
    ngram_range=(1,3),
)

vectors = vectorizer.fit_transform(text_fragments)

CPU times: user 33min 7s, sys: 41.2 s, total: 33min 48s
Wall time: 33min 48s


In [18]:
# (number of fragments, number of character n-gram features)
vectors.shape

(1185868, 42854)

In [24]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Multinomial naive Bayes with a uniform class prior (fit_prior=False) and
# light additive smoothing.
nb = MultinomialNB(fit_prior=False, alpha=0.001)
# svm = SVC(kernel='linear', probability=True)

# Fixed seed: without random_state the split differs on every run, so the
# accuracies reported in the following cells could not be reproduced.
SPLIT_SEED = 0
# Half of the fragments are held out for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    vectors, label_fragments, test_size=0.5, random_state=SPLIT_SEED
)

In [25]:
%%time
# Train the classifier on the per-file labels (one class per source file).
nb.fit(X_train, y_train)

CPU times: user 23min 52s, sys: 10.9 s, total: 24min 3s
Wall time: 24min 3s


MultinomialNB(alpha=0.001, class_prior=None, fit_prior=False)

In [26]:
%%time
from sklearn.metrics import accuracy_score

# Predict a label for every held-out fragment, then print the share of
# fragments whose predicted label equals the true one.
preds = nb.predict(X_test)

print(accuracy_score(y_true=y_test, y_pred=preds))

0.644078430314
CPU times: user 21min 47s, sys: 14.3 s, total: 22min 1s
Wall time: 22min 1s


In [None]:
# First ten predicted labels, for eyeballing against y_test.
preds[:10]

In [None]:
# First ten true labels of the held-out set.
y_test[:10]

In [27]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone joblib package and fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted TF-IDF vectorizer; joblib.dump returns the written paths.
joblib.dump(vectorizer,"/dev/data/vec.pickle")

['/dev/data/vec.pickle']

In [58]:
# Persist the classifier currently bound to `nb`.
# NOTE(review): this cell's execution count (58) is later than that of the cell
# which rebinds `nb` to the author-level model (40), although it sits above
# that cell. Run top-to-bottom, `nb` here would still be the model fitted on
# per-file labels -- move this cell below the author-level fit before relying
# on the "nb_author" file name.
joblib.dump(nb,"/dev/data/nb_author.pickle")

['/dev/data/nb_author.pickle']

In [29]:
# First ten training labels, to inspect the label format.
y_train[:10]

array(['James Fenimore Cooper___The Chainbearer',
       'George Alfred Henty___Held Fast For England',
       'Mark Twain___Tom Sawyer Abroad',
       'James Fenimore Cooper___The Deerslayer',
       "Robert Louis Stevenson___A Child's Garden of Verses, Verse 130",
       'Rudyard Kipling___Letters of Travel (1892-1913)',
       'Edgar Rice Burroughs___The People that Time Forgot',
       'George Alfred Henty___Through Russian Snows',
       'Bret Harte___Clarence', 'Andrew Lang___Adventures among Books'], 
      dtype='<U176')

In [30]:
# Author = the part of each training label before the first "__".
y_train_authors = [label.split("__", 1)[0] for label in y_train]

In [31]:
# Author = the part of each held-out label before the first "__".
y_test_authors = [label.split("__", 1)[0] for label in y_test]

In [40]:
%%time
# Second model: same features, but the target is the author only.
# Rebinding `nb` replaces the classifier that was trained on per-file labels.
nb = MultinomialNB(fit_prior=False, alpha=1e-5)
nb.fit(X_train, y_train_authors)

CPU times: user 1min 18s, sys: 668 ms, total: 1min 19s
Wall time: 1min 19s


In [41]:
%%time
from sklearn.metrics import accuracy_score

# Author-level predictions for the held-out fragments and their accuracy.
preds = nb.predict(X_test)
print(accuracy_score(y_true=y_test_authors, y_pred=preds))

0.60590217461
CPU times: user 1min 18s, sys: 856 ms, total: 1min 19s
Wall time: 1min 19s


In [42]:
# Class-probability estimates for the first 100 held-out fragments
# (one row per fragment, one column per class).
probs = nb.predict_proba(X_test[:100])

In [43]:
# Display the probability matrix.
probs

array([[  4.97274144e-03,   4.90674204e-04,   8.62198136e-06, ...,
          2.45726009e-04,   1.19921565e-02,   2.81849550e-03],
       [  7.36012931e-03,   1.65195559e-04,   8.00881342e-06, ...,
          2.31234700e-04,   1.01864427e-03,   2.11504229e-03],
       [  1.73113690e-12,   4.75216414e-14,   1.53592070e-15, ...,
          3.54175881e-13,   1.05191271e-15,   8.65668152e-11],
       ..., 
       [  1.90269805e-08,   4.50320873e-10,   5.93732120e-12, ...,
          2.75332980e-11,   1.45413891e-10,   9.98293174e-10],
       [  5.15162188e-04,   1.07883640e-03,   2.39489975e-08, ...,
          9.38622484e-04,   9.19465945e-03,   2.29554923e-03],
       [  6.44355436e-04,   3.41505664e-03,   9.04901957e-08, ...,
          2.25966051e-03,   1.61112192e-02,   1.85214379e-02]])

In [44]:
# Length of a single row: one probability per class.
probs[0].shape

(142,)

In [46]:
# Number of distinct authors among the held-out labels, for comparison with
# the number of probability columns.
len(set(y_test_authors))

142

0.14869105294563928