# Scikit-Learn using Bag of Words

We turn text into numerical feature vectors. First, we assign a fixed integer id to each word occurring in any document of the training set (for instance by building a dictionary from words to integer indices) and for each document #i, count the number of occurrences of each word w and store it in X[i, j] as the value of feature #j where j is the index of word w in the dictionary. Second, we tokenize the data and make a dictionary of feature vectors. Third we find occurences of words and change that to frequency because occurences aren't helpful for longer or shorter documents.

In [None]:
import pandas as pd
# I have it so all files are in same directory as this file
# read the training data
df = pd.read_csv('train.csv')
# df.head()

In [27]:
authors = dict([(auth, idx) for idx, auth in enumerate(df['author'].unique())])
print(authors)
df['author_id'] = df['author'].apply(lambda x: authors[x])
# df.head()

{'EAP': 0, 'HPL': 1, 'MWS': 2}


In [28]:
# split training set into training data set and test data set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.text, df.author_id, test_size=0.13, random_state=42)
# X_train.head()

In [29]:
# CountVectorizer does text preprocessing, tokenizing, and filtering of stopwords. It is able to build a dictionary of
# features and transform documents to feature vectors
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(17033, 23791)

In [30]:
count_vect.vocabulary_.get(u'algorithm')

In [6]:
# calculate term frequency and downscale weights for terms that appear over many documents
# tfidf means term frequency, inverse document frequency
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(17033, 23791)

In [7]:
# training on the naive bayes classifier with the MultinomialNB which is best for word counts
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [8]:
# to try to predict the outcome on a new document we need to extract the features using almost the same feature 
# extracting chain as before. The difference is that we call transform instead of fit_transform on the transformers, 
# since they have already been fit to the training set
docs_new = ['I am not sure to what limit his knowledge may extend.', 'For I am Iranon, who was a Prince in Aira.']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, df.author))

'I am not sure to what limit his knowledge may extend.' => 0        EAP
1        HPL
2        EAP
3        MWS
4        HPL
5        MWS
6        EAP
7        EAP
8        EAP
9        MWS
10       MWS
11       EAP
12       HPL
13       HPL
14       EAP
15       MWS
16       EAP
17       MWS
18       EAP
19       HPL
20       EAP
21       HPL
22       EAP
23       EAP
24       EAP
25       EAP
26       EAP
27       EAP
28       HPL
29       HPL
        ... 
19549    MWS
19550    EAP
19551    EAP
19552    EAP
19553    EAP
19554    HPL
19555    EAP
19556    EAP
19557    EAP
19558    EAP
19559    HPL
19560    EAP
19561    HPL
19562    EAP
19563    MWS
19564    EAP
19565    EAP
19566    MWS
19567    EAP
19568    EAP
19569    MWS
19570    MWS
19571    HPL
19572    EAP
19573    MWS
19574    EAP
19575    EAP
19576    EAP
19577    EAP
19578    HPL
Name: author, Length: 19579, dtype: object
'For I am Iranon, who was a Prince in Aira.' => 0        EAP
1        HPL
2        EAP
3        MWS
4    

In [9]:
# building a pipeline, vectorizer > transformer > classifier, acts as a compound classifier
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])

In [10]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [11]:
# check the test set for accuracy of the model
import numpy as np
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test) 

0.8142183817753339

In [12]:
# using a linear support vector machine
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-4, random_state=42,
                                           max_iter=5, tol=None)),
])
text_clf.fit(X_train, y_train)  

predicted = text_clf.predict(X_test)
np.mean(predicted == y_test) 

0.82560879811468968

In [13]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted,
    target_names=df.author))

             precision    recall  f1-score   support

        EAP       0.80      0.86      0.83      1005
        HPL       0.83      0.81      0.82       690
        EAP       0.86      0.80      0.83       851

avg / total       0.83      0.83      0.83      2546



  .format(len(labels), len(target_names))


In [14]:
metrics.confusion_matrix(y_test, predicted)

array([[867,  69,  69],
       [ 95, 558,  37],
       [125,  49, 677]])

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.calibration import CalibratedClassifierCV

models = [('MultiNB', MultinomialNB(alpha=0.03)),
          ('Calibrated MultiNB', CalibratedClassifierCV(
              MultinomialNB(alpha=0.03), method='isotonic')),
          ('Calibrated BernoulliNB', CalibratedClassifierCV(
              BernoulliNB(alpha=0.03), method='isotonic')),
          ('Calibrated Huber', CalibratedClassifierCV(
              SGDClassifier(loss='modified_huber', alpha=1e-4,
                            max_iter=10000, tol=1e-4), method='sigmoid')),
          ('Logit', LogisticRegression(C=30))]

train = pd.read_csv('train.csv')
vectorizer=TfidfVectorizer(token_pattern=r'\w{1,}', sublinear_tf=True, ngram_range=(1,2))
clf = VotingClassifier(models, voting='soft', weights=[3,3,3,1,1])
X_train = vectorizer.fit_transform(train.text.values)
authors = ['MWS','EAP','HPL']
y_train = train.author.apply(authors.index).values
clf.fit(X_train, y_train)

test = pd.read_csv('test.csv', index_col=0)
X_test = vectorizer.transform(test.text.values)
results = clf.predict_proba(X_test)
pd.DataFrame(results, index=test.index, columns=authors).to_csv('scikit_results.csv')

# Using Keras for categorization using feature extraction

This model is a bit different in that it deletes rare words to prevent overfitting, it doesn't remove stopwords because an author may use them in unique ways, the same goes for stemming and lowercase, it cuts long sentences and separates punctuation as some punctuation may be unique from author to author. 

In [1]:
import numpy as np

import pandas as pd

from collections import defaultdict

import keras
import keras.backend as K
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split

np.random.seed(7)

Using TensorFlow backend.


In [2]:
df = pd.read_csv('train.csv')
a2c = {'EAP': 0, 'HPL' : 1, 'MWS' : 2}
y = np.array([a2c[a] for a in df.author])
y = to_categorical(y)

In [3]:
# checks character distribution per author
counter = {name : defaultdict(int) for name in set(df.author)}
for (text, author) in zip(df.text, df.author):
    text = text.replace(' ', '')
    for c in text:
        counter[author][c] += 1

chars = set()
for v in counter.values():
    chars |= v.keys()
    
names = [author for author in counter.keys()]

print('c ', end='')
for n in names:
    print(n, end='   ')
print()
for c in chars:    
    print(c, end=' ')
    for n in names:
        print(counter[n][c], end=' ')
    print()

c EAP   HPL   MWS   
B 835 533 395 
z 634 529 400 
H 864 741 669 
u 26311 19519 21025 
I 4846 3480 4917 
x 1951 1061 1267 
ê 28 2 0 
Υ 0 1 0 
ἶ 0 2 0 
ü 1 5 0 
M 1065 645 415 
O 414 503 282 
m 22792 17622 20471 
ï 0 7 0 
ô 8 0 0 
? 510 169 419 
" 2987 513 1469 
. 8406 5908 5761 
ñ 0 7 0 
α 0 2 0 
P 442 320 365 
b 13245 10636 9611 
p 17422 10965 12361 
d 36862 33366 35315 
â 6 0 0 
T 2217 1583 1230 
w 17507 15554 16062 
v 9624 6529 7948 
D 491 334 227 
V 156 67 57 
k 4277 5204 3707 
g 16088 14951 12601 
é 47 15 0 
à 10 0 0 
K 86 176 35 
Ο 0 3 0 
Æ 1 4 0 
e 114885 88259 97515 
ö 16 3 0 
G 313 318 246 
ç 1 0 0 
ä 1 6 0 
h 51580 42770 43738 
; 1354 1143 2662 
q 1030 779 677 
E 435 281 445 
Ν 0 1 0 
F 383 269 232 
C 395 439 308 
i 60952 44250 46080 
N 411 345 204 
æ 36 10 0 
Π 0 1 0 
î 1 0 0 
X 17 5 4 
W 739 732 681 
s 53841 43915 45962 
Z 23 51 2 
J 164 210 66 
L 458 249 307 
ë 0 12 0 
Å 0 1 0 
a 68525 56815 55274 
, 17594 8581 12045 
o 67145 50996 53386 
: 176 47 339 
r 51221 40590 44042 

Some authors use unique characters and authors use punctuation differently like EAP uses commas more and MWS uses a lot
less apostrophes so she probably uses a lot less contractions.

In [4]:
# function that separates punctuation from words
def preprocess(text):
    text = text.replace("' ", " ' ")
    signs = set(',.:;"?!')
    prods = set(text) & signs
    if not prods:
        return text

    for sign in prods:
        text = text.replace(sign, ' {} '.format(sign) )
    return text

In [5]:
def create_docs(df, n_gram_max=2):
    def add_ngram(q, n_gram_max):
            ngrams = []
            for n in range(2, n_gram_max+1):
                for w_index in range(len(q)-n+1):
                    ngrams.append('--'.join(q[w_index:w_index+n]))
            return q + ngrams
        
    docs = []
    for doc in df.text:
        doc = preprocess(doc).split()
        docs.append(' '.join(add_ngram(doc, n_gram_max)))
    
    return docs

In [6]:
# removes lower frequency words
# cuts documents that have more than 256 words
min_count = 2

docs = create_docs(df)
tokenizer = Tokenizer(lower=False, filters='')
tokenizer.fit_on_texts(docs)
num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])

tokenizer = Tokenizer(num_words=num_words, lower=False, filters='')
tokenizer.fit_on_texts(docs)
docs = tokenizer.texts_to_sequences(docs)

maxlen = 256

docs = pad_sequences(sequences=docs, maxlen=maxlen)

In [7]:
# dimension of a word vector is 
input_dim = np.max(docs) + 1
embedding_dims = 20

In [8]:
# function that creates model with Adam optimizer
def create_model(embedding_dims=20, optimizer='adam'):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

In [13]:
# train model
epochs = 25
x_train, x_test, y_train, y_test = train_test_split(docs, y, test_size=0.2)

model = create_model()
hist = model.fit(x_train, y_train,
                 batch_size=16,
                 validation_data=(x_test, y_test),
                 epochs=epochs,
                 callbacks=[EarlyStopping(patience=2, monitor='val_loss')])

Train on 6713 samples, validate on 1679 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [11]:
# uses model to predict the author of the test data set
test_df = pd.read_csv('test.csv')
docs = create_docs(test_df)
docs = tokenizer.texts_to_sequences(docs)
docs = pad_sequences(sequences=docs, maxlen=maxlen)
y = model.predict_proba(docs)

result = pd.read_csv('sample_submission.csv')
for a, i in a2c.items():
    result[a] = y[:, i]



In [12]:
result.to_csv('keras_result.csv', index=False)

# Keras Convolutional Network for Spooky Author ID1

Trying to get Keras to work with both my GTX 1070s

In [2]:
# Definitions

from __future__ import print_function

import os
import sys
import numpy as np

# tensorflow settings to activate gpu
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.optimizers import RMSprop

from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())


BASE_DIR = '../data'
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')
TEXT_DATA_DIR = os.path.join(BASE_DIR, 'SpookyData')
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.2

import tensorflow as tf
# Creates a graph.
a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
c = tf.matmul(a, b)
# Creates a session with log_device_placement set to True.
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
# Runs the op.
print(sess.run(c))

[name: "/cpu:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 2111008048821374629
, name: "/gpu:0"
device_type: "GPU"
memory_limit: 71761920
locality {
  bus_id: 1
}
incarnation: 2593259007318933880
physical_device_desc: "device: 0, name: GeForce GTX 1070, pci bus id: 0000:01:00.0"
, name: "/gpu:1"
device_type: "GPU"
memory_limit: 7984057549
locality {
  bus_id: 1
}
incarnation: 5962234433982548780
physical_device_desc: "device: 1, name: GeForce GTX 1070, pci bus id: 0000:02:00.0"
]
[[ 22.  28.]
 [ 49.  64.]]


In [3]:
import pandas as pd

# read the training data
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
# get a list of classifications and generate numeric 
#  values for each class.  put the numeric class back 
#  on to the data frame.
authors = dict([(auth, idx) for idx, auth in enumerate(df['author'].unique())])
print(authors)
df['author_id'] = df['author'].apply(lambda x: authors[x])

df.head()

{'EAP': 0, 'HPL': 1, 'MWS': 2}


Unnamed: 0,id,text,author,author_id
0,id26305,"This process, however, afforded me no means of...",EAP,0
1,id17569,It never once occurred to me that the fumbling...,HPL,1
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,0
3,id27763,How lovely is spring As we looked from Windsor...,MWS,2
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,1


In [5]:
# Drop stop words
from nltk.corpus import stopwords
stops = set(stopwords.words('english'))

# now we will use the text and author_id fields to train a classifier.
#  We have to: 
#  1. Get the sentences, 
sents = df['text'].tolist()
labels = df['author_id'].tolist()
#  2. Tokenize each sentence, 
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(sents)
sequences = tokenizer.texts_to_sequences(sents)
print(len(sequences))
print(sequences[0])
##    Get a vector of unique terms here
print('Found %s unique tokens before stopwords removal.' % len(tokenizer.word_index))
print([w for w in tokenizer.word_index.items()][:5])
word_index = dict([(w,i) for w,i in tokenizer.word_index.items() if w not in stops])
print('Found %s unique tokens after stopwords removal.' % len(word_index))


data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]
y_val[:5]

19579
[26, 2945, 143, 1372, 22, 36, 294, 2, 7451, 1, 2440, 2, 10, 4556, 16, 6, 79, 179, 48, 4245, 3, 295, 4, 1, 249, 1943, 6, 326, 74, 134, 123, 891, 2, 1, 313, 39, 1438, 4928, 98, 1, 430]
Found 25943 unique tokens before stopwords removal.
[('the', 1), ('of', 2), ('and', 3), ('to', 4), ('a', 5)]
Found 25808 unique tokens after stopwords removal.
Shape of data tensor: (19579, 1000)
Shape of label tensor: (19579, 3)


array([[ 0.,  0.,  1.],
       [ 0.,  0.,  1.],
       [ 0.,  1.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  1.]])

In [6]:
#  3. Load embeddings
embeddings_index = {}
f = open('glove.6B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [7]:
#  4. Create the Embedding matrix for the training set
num_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
unk = []
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        unk.append(word)
print(len(unk))

2092


In [9]:
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed

from keras.utils.training_utils import multi_gpu_model

embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

print('Training model.')

# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
#x = MaxPooling1D()(x)
#x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(.5)(x)
preds = Dense(len(authors), activation='softmax')(x)
rms = RMSprop(lr=0.003)
model = Model(sequence_input, preds)

model = multi_gpu_model(model, gpus=2)

model.compile(loss='mean_squared_logarithmic_error',
              optimizer=rms, #'rmsprop',
              metrics=['acc'])
#model.compile(loss='categorical_crossentropy',
#              optimizer=rms, #'rmsprop',
#              metrics=['acc'])

Training model.


ValueError: `multi_gpu_model` is only available with the TensorFlow backend.

In [None]:
model.fit(x_train, y_train,
          batch_size=100,
          epochs=50,
          validation_data=(x_val, y_val))