In [0]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [0]:
pets = pd.read_csv("https://raw.githubusercontent.com/rtrvale/datasets/master/pets_seattle_toronto.csv")

In [19]:
pets['city'].value_counts()

Seattle    49191
Toronto    19377
Name: city, dtype: int64

In [20]:
pets.head()

Unnamed: 0.1,Unnamed: 0,name,species,year,city,count
0,0,,dog,2000,Seattle,1.0
1,1,FANCY,dog,2000,Seattle,1.0
2,2,SKIP,dog,2000,Seattle,1.0
3,3,KANGA,dog,2000,Seattle,1.0
4,4,OSCAR,dog,2000,Seattle,1.0


In [0]:
# train using the Seattle pet name data
seattle_names = pets[pets['city'] == 'Seattle'].name

In [0]:
seattle_species = pets[pets['city'] == 'Seattle'].species

In [0]:
# remove a NaN value - note that it has to be done in this order
seattle_species = seattle_species[pd.notna(seattle_names)]
seattle_names = seattle_names[pd.notna(seattle_names)]

seattle_species = seattle_species.reset_index()['species']
seattle_names = seattle_names.reset_index()['name']

In [0]:
# build an encoder
import tensorflow_datasets as tfds
encoder = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    seattle_names, target_vocab_size=1000)

In [25]:
encoder.encode('REX')

[24, 840]

In [26]:
encoder.decode(encoder.encode(seattle_names[10]))

'ZOEY'

In [27]:
encoder.vocab_size

1008

In [0]:
input_data = []
for i in range(len(seattle_names)):
  input_data += [encoder.encode(seattle_names[i])]

In [0]:
# padding the input data

np.max([len(x) for x in input_data])
# 22

for x in input_data:
  if len(x) < 22:
    x += [0]*(22 - len(x))

In [0]:
input_data = np.array(input_data).reshape((len(input_data), 22))

In [0]:
# making the y-variable

seattle_species.head()
y = (seattle_species == "cat")

In [32]:
y.value_counts()

False    33078
True     15824
Name: species, dtype: int64

In [0]:
dog_sample = np.random.choice(np.where(~y)[0], 15824)
cat_sample = np.where(y)[0]

In [0]:
yb = pd.concat([y[dog_sample], y[cat_sample]])
y_raw = yb

In [0]:
X = input_data[yb.index]
X_raw = [encoder.decode(X[i]) for i in range(X.shape[0])]

In [36]:
yb.value_counts()

True     15824
False    15824
Name: species, dtype: int64

In [0]:
yb = tf.keras.utils.to_categorical(yb)
yb = yb.reshape((len(yb), 2))

In [38]:
X.shape

(31648, 22)

In [39]:
yb.shape

(31648, 2)

In [0]:
shuff = np.random.permutation(np.arange(yb.shape[0]))

In [0]:
X = X[shuff, :]
yb = yb[shuff, :]

In [0]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(encoder.vocab_size, 64, input_length=22),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100)),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(2, activation='softmax')
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [43]:
model.fit(X, yb, epochs=5, validation_split = 0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fd66f639278>

In [44]:
# how to make a prediction
enc = encoder.encode("FIDO")
enc = enc + [0]*(22-len(enc))
model.predict(np.array(enc).reshape((1,22)))

array([[0.02130414, 0.97869587]], dtype=float32)

In [0]:
pred_cat = [x[0]>0.5 for x in model.predict(X[:1000])]
cat = [x[0]>0.5 for x in yb[:1000]]

In [46]:
pd.crosstab(pd.Series(pred_cat), pd.Series(cat))

col_0,False,True
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
False,307,201
True,161,331


In [47]:
tab = pd.crosstab(pd.Series(pred_cat), pd.Series(cat))
(tab[0][0] + tab[1][1])/tab.sum().sum()

0.638

In [48]:
!pip install python-Levenshtein

Collecting python-Levenshtein
[?25l  Downloading https://files.pythonhosted.org/packages/42/a9/d1785c85ebf9b7dfacd08938dd028209c34a0ea3b1bcdb895208bd40a67d/python-Levenshtein-0.12.0.tar.gz (48kB)
[K     |██████▊                         | 10kB 33.1MB/s eta 0:00:01[K     |█████████████▌                  | 20kB 3.1MB/s eta 0:00:01[K     |████████████████████▏           | 30kB 3.7MB/s eta 0:00:01[K     |███████████████████████████     | 40kB 3.0MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 2.7MB/s 
Building wheels for collected packages: python-Levenshtein
  Building wheel for python-Levenshtein (setup.py) ... [?25l[?25hdone
  Created wheel for python-Levenshtein: filename=python_Levenshtein-0.12.0-cp36-cp36m-linux_x86_64.whl size=144790 sha256=b17340d0501e67bfbe3bad8622820357ec4824edab746eb3cffd7d5ffc8f4aa8
  Stored in directory: /root/.cache/pip/wheels/de/c2/93/660fd5f7559049268ad2dc6d81c4e39e9e36518766eaf7e342
Successfully built python-Levenshtein
Installin

In [0]:
from Levenshtein import distance

In [0]:
def nb_classifier(strings, cat):
  # set up dictionary to contain counts
  nb_dict = {}

  for i in range(len(strings)):
    # split string into tokens
    words = strings[i].split(" ")
  
    for word in words:
      word = word.strip()
      # if word does not occur, add it
      if word not in nb_dict:
        nb_dict[word] = [0, 0]
      # cat[i] = 0 if ith name is a dog, else 1
      # nb_dict[word] = [cat count, dog count], a list of length 2
      nb_dict[word][cat[i]] += 1
      
  return nb_dict

def classify(name, nb_dict):
  words = name.split(" ")

  # initialize outputs
  cat_prob = 1
  dog_prob = 1

  # get total numbers of cats and dogs
  total_cats = np.array([nb_dict[k][1] for k in nb_dict.keys()]).sum()
  total_dogs = np.array([nb_dict[k][0] for k in nb_dict.keys()]).sum()

  for word in words:
    
    cats = 0
    dogs = 0
    # convert word to upper case with no spaces
    word = word.strip().upper()
    # keep a record of which words are the closest
    min_dist = distance(word, list(nb_dict.keys())[0])
    for k in nb_dict.keys():
      dist = distance(word, k)
      # if k was closer than current closest word, use k instead
      if dist < min_dist:
        cats = nb_dict[k][1]
        dogs = nb_dict[k][0]
        min_dist = dist
        # if exact match, no need to search further
        if dist == 0:
          break
      # if k was as close as the current closest word, add the
      # counts of cats and dogs from word k to current totals
      elif dist == min_dist:
        cats += nb_dict[k][1]
        dogs += nb_dict[k][0]
    # calculate naive Bayes probabilities by multiplying
    cat_prob *= (cats + 1)/(total_cats + 1)
    dog_prob *= (dogs + 1)/(total_dogs + 1)
  return (cat_prob, dog_prob)


In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.1)

In [0]:
nb1 = nb_classifier(list(X_train), list(y_train))

In [0]:
pred = []
for i in range(len(list(X_test))):
  pred += [classify(list(X_test)[i], nb1)]

In [54]:
predClass = pd.Series([x[0] > x[1] for x in pred])
predClass.index = y_test.index # re-indexing is necessary for pandas
tab = pd.crosstab(predClass, pd.Series(y_test))
(tab[0][0] + tab[1][1])/tab.sum().sum()

0.6189573459715639

In [55]:
tab

species,False,True
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
False,1119,726
True,480,840


In [56]:
classify('REX', nb1)

(0.0004480860325182435, 0.0011731734341393469)

In [57]:
classify('CAT', nb1)

(0.002944565356548457, 6.517630189663039e-05)

In [0]:
def rnn_classifier(strings, cat):
  encoder = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    strings, target_vocab_size=1000)
  
  # encode the strings
  encoded = [encoder.encode(x) for x in strings]
  max_encode_length = np.max([len(x) for x in encoded])
  for i in range(len(encoded)):
    encoded[i] += [0]*(max_encode_length - len(encoded[i]))

  X = np.array(encoded).reshape((len(encoded), max_encode_length))
  y = tf.keras.utils.to_categorical(cat)
  y = y.reshape((len(cat), 2))
 
  model = tf.keras.Sequential([
    tf.keras.layers.Embedding(encoder.vocab_size, 64, input_length=max_encode_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100)),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(2, activation='softmax')
  ])

  model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])
  
  model.fit(X, y, epochs=3)
  return {'model':model, 'encoder':encoder, 'M':max_encode_length}

def classify_rnn(name, rnn):
  encoded = rnn['encoder'].encode(name)
  encoded += [0]*(rnn['M'] - len(encoded))
  pred = rnn['model'].predict(np.array(encoded).reshape((1, len(encoded))))
  return pred[0][1]

In [59]:
rnn = rnn_classifier(X_train, y_train)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [60]:
rnn

{'M': 21,
 'encoder': <SubwordTextEncoder vocab_size=1010>,
 'model': <tensorflow.python.keras.engine.sequential.Sequential at 0x7fd648244ef0>}

In [65]:
classify('REX', nb1)

(0.0004480860325182435, 0.0011731734341393469)

In [66]:
classify_rnn('REX', rnn)

0.0090955645

In [63]:
pred = []
for i in range(len(X_test)):
  pred += [classify_rnn(list(X_test)[i], rnn)]

predClass = pd.Series([x > 0.5 for x in pred])
predClass.index = y_test.index
tab = pd.crosstab(predClass, pd.Series(y_test))
(tab[0][0] + tab[1][1])/tab.sum().sum()

0.608214849921011

In [64]:
tab

species,False,True
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
False,1034,675
True,565,891
