## Stack Overflow classifier

[Github Repo](https://github.com/GoogleCloudPlatform/ai-platform-text-classifier-shap/blob/master/stackoverflow-classifier.ipynb)

In [83]:
#!pip install tensorflow==1.13.1

In [1]:
import tensorflow as tf 
import pandas as pd
import numpy as np 

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle

import warnings
# Silence every warning category from this point on.
# NOTE(review): the filter is installed after the imports above, so warnings
# raised while those modules are being imported are not affected by it —
# confirm whether it should move to the top of the cell.
warnings.filterwarnings('ignore')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
!gsutil cp 'gs://cloudml-demo-lcm/SO_ml_tags_avocado_188k_v2.csv' ./

Copying gs://cloudml-demo-lcm/SO_ml_tags_avocado_188k_v2.csv...
- [1 files][276.7 MiB/276.7 MiB]                                                
Operation completed over 1 objects/276.7 MiB.                                    


In [4]:
!ls

SO_ml_tags_avocado_188k_v2.csv	text-classifier.ipynb  tutorials


In [13]:
# Read the questions, replacing the file's own header row with our column
# names, then discard every row that contains a missing value.
data = pd.read_csv(
    'SO_ml_tags_avocado_188k_v2.csv',
    names=['tags', 'original_tags', 'text'],
    header=0,
).dropna()

In [14]:
# Only 'tags' and 'text' are used by the rest of this notebook, so drop
# 'original_tags'. errors='ignore' keeps the cell re-runnable: without it a
# second execution raises KeyError because the column is already gone.
data = data.drop(columns=['original_tags'], errors='ignore')

In [15]:
# Randomize the row order so the positional train/test split done later is not
# biased by whatever ordering the source table had. The fixed random_state
# makes the shuffle repeatable.
data = shuffle(data, random_state = 22)

In [16]:
data.head()

Unnamed: 0,tags,text
182914,"tensorflow,keras",avocado image captioning model not compiling b...
48361,pandas,return excel file from avocado with flask in f...
181447,"tensorflow,keras",validating with generator (avocado) i'm trying...
66307,pandas,avocado multiindex dataframe selecting data gi...
11283,pandas,get rightmost non-zero value position for each...


In [17]:
data.iloc[0].text

'avocado image captioning model not compiling because of concatenate layer when mask_zero=true in a previous layer i am new to avocado and i am trying to implement a model for an image captioning project.   i am trying to reproduce the model from image captioning pre-inject architecture (the picture is taken from this paper: where to put the image in an image captioning generator) (but with a minor difference: generating a word at each time step instead of only generating a single word at the end), in which the inputs for the lstm at the first time step are the embedded cnn features. the lstm should support variable input length and in order to do this i padded all the sequences with zeros so that all of them have maxlen time steps.  the code for the model i have right now is the following:    def get_model(model_name, batch_size, maxlen, voc_size, embed_size,          cnn_feats_size, dropout_rate):      # create input layer for the cnn features     cnn_feats_input = input(shape=(cnn_f

## Feature Engineering

In [18]:
# Turn each comma-separated tag string into a list of individual tags
# (the multi-hot encoding itself happens in the next step).
tags_split = [tag_string.split(',') for tag_string in data['tags'].values]
print(tags_split[0])

['tensorflow', 'keras']


In [76]:
# One hot encoding
tag_encoder = MultiLabelBinarizer()
tags_encoded = tag_encoder.fit_transform(tags_split)
num_tags = len(tags_encoded[0])
print(tag_encoder.classes_)

['keras' 'matplotlib' 'pandas' 'scikitlearn' 'tensorflow']


In [21]:
#label vector of the first row
tags_encoded[0]

array([1, 0, 0, 0, 1])

## Modeling

In [24]:
# Use the first 80% of the rows for training and the remainder for testing;
# report both sizes.
train_size = int(0.8 * len(data))
print("Train size: %d" % train_size)
print("Test size: %d" % (len(data) - train_size))

Train size: 150559
Test size: 37640


In [25]:
# Split the encoded labels positionally at train_size: the first train_size
# rows are training labels, the rest are test labels.
train_tags = tags_encoded[:train_size]
test_tags = tags_encoded[train_size:]

In [26]:
train_tags

array([[1, 0, 0, 0, 1],
       [0, 0, 1, 0, 0],
       [1, 0, 0, 0, 1],
       ...,
       [0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1]])



### **Creating a class to import in the future**
[Keras preprocessing text method](https://keras.io/preprocessing/text/)

In [58]:
%%writefile preprocess.py

# Pre-processing data: create our tokenizer class
from tensorflow.keras.preprocessing import text

class TextPreprocessor(object):
  """Turns raw texts into fixed-width matrices using a Keras ``Tokenizer``.

  The instance holds two pieces of state: the vocabulary size requested at
  construction time and, once ``create_tokenizer`` has run, the fitted
  tokenizer. ``transform_text`` may only be called after fitting.
  """

  def __init__(self, vocab_size):
    """Remember the vocabulary size; no tokenizer exists until it is fitted.

    Args:
      vocab_size: value passed to ``Tokenizer`` as ``num_words``.
    """
    self._vocab_size = vocab_size
    self._tokenizer = None

  def create_tokenizer(self, text_list):
    """Fit a fresh tokenizer on ``text_list`` and store it on the instance.

    Any tokenizer fitted by an earlier call is replaced.

    Args:
      text_list: the texts handed to ``Tokenizer.fit_on_texts``.
    """
    tokenizer = text.Tokenizer(num_words=self._vocab_size)
    tokenizer.fit_on_texts(text_list)
    self._tokenizer = tokenizer

  def transform_text(self, text_list):
    """Vectorize ``text_list`` with the previously fitted tokenizer.

    Args:
      text_list: the texts handed to ``Tokenizer.texts_to_matrix``.

    Returns:
      Whatever ``Tokenizer.texts_to_matrix`` returns for ``text_list``.

    Raises:
      RuntimeError: if ``create_tokenizer`` has not been called yet.
    """
    # Fail with an actionable message instead of the opaque
    # "'NoneType' object has no attribute 'texts_to_matrix'".
    if self._tokenizer is None:
      raise RuntimeError(
          'Tokenizer has not been fitted; call create_tokenizer() before '
          'transform_text().')
    return self._tokenizer.texts_to_matrix(text_list)

Overwriting preprocess.py


In [59]:
# Import the preprocessor class from the preprocess.py module written by the
# %%writefile cell above (that cell must have run first).
from preprocess import TextPreprocessor

# Vocabulary size handed to the preprocessor; it is also used as the width of
# the model's input layer.
VOCAB_SIZE = 400 # This is a hyperparameter, try out different values for your dataset

In [65]:
# Split the question text positionally at train_size — the same boundary used
# for the encoded tag labels, so texts and labels stay aligned row for row.
train_qs = data['text'].values[:train_size]
test_qs = data['text'].values[train_size:]

In [66]:
# Instantiate the preprocessor. Its tokenizer is still None at this point; it
# is created by the create_tokenizer call that follows.
processor = TextPreprocessor(VOCAB_SIZE)

In [67]:
# Fit the tokenizer on the training questions only, so the test questions do
# not influence the vocabulary.
processor.create_tokenizer(train_qs)

In [68]:
# Convert every question into a bag-of-words vector, using the tokenizer that
# was fitted on the training questions for both the train and test sets.
body_train = processor.transform_text(train_qs)
body_test = processor.transform_text(test_qs)

In [69]:
# Sanity check: print the length of one feature vector (expected to match
# VOCAB_SIZE) followed by the first training vector itself.
print(len(body_train[0]))
print(body_train[0])

400
[0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0.
 0. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0.
 1. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0.
 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 0. 1.
 0. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 1. 1. 1.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0.
 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 1. 0.
 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0

## Build and train our model

In [70]:
# Persist the fitted preprocessor (including its tokenizer) to disk,
# presumably so the same vocabulary can be reused at prediction time.
# Loading this pickle later requires preprocess.TextPreprocessor to be
# importable, because pickle stores a reference to the class, not its code.
import pickle

with open('./processor_state.pkl', 'wb') as f:
  pickle.dump(processor, f)

In [74]:
# defining the neural net 

def create_model(vocab_size, num_tags):
    """Build and compile the feed-forward tag classifier.

    Args:
        vocab_size: length of each bag-of-words input vector.
        num_tags: number of output units, one per tag.

    Returns:
        A compiled ``tf.keras`` ``Sequential`` model with two hidden layers
        (50 and 25 units) and a ``num_tags``-unit sigmoid output layer.
    """
    model = tf.keras.models.Sequential()

    # Size the input layer from the argument, not from the notebook-level
    # VOCAB_SIZE constant, so the function honours whatever the caller passes.
    model.add(tf.keras.layers.Dense(50, input_shape=(vocab_size,), activation='relu'))
    # Second hidden layer with 25 units.
    model.add(tf.keras.layers.Dense(25, activation='relu'))
    # Sigmoid output: every tag gets its own independent score, since one
    # question can carry several tags at once.
    model.add(tf.keras.layers.Dense(num_tags, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [77]:
# Build the network for the current vocabulary size and tag count, then print
# its layer-by-layer summary.
model = create_model(VOCAB_SIZE, num_tags)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 50)                20050     
_________________________________________________________________
dense_1 (Dense)              (None, 25)                1275      
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 130       
Total params: 21,455
Trainable params: 21,455
Non-trainable params: 0
_________________________________________________________________


In [80]:
# Train the classifier.
#
#   body_train        bag-of-words matrix built from the training questions
#   train_tags        multi-hot tag labels for the same rows
#   epochs            number of full passes over the training data
#   batch_size        number of samples processed per weight update
#   validation_split  fraction of the training data held out for validation
model.fit(
    body_train,
    train_tags,
    epochs=3,
    batch_size=128,
    validation_split=0.1,
)


Train on 135503 samples, validate on 15056 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7ffb32c7fe10>

In [81]:
# Evaluate once on the held-out test set and print the result. The model was
# compiled with metrics=['accuracy'], so the printed list is [loss, accuracy].
# (The evaluation used to be run and printed twice, which only repeated a full
# pass over the test data.)
print('Eval loss/accuracy:{}'.format(
  model.evaluate(body_test, test_tags, batch_size=128)))

Eval loss/accuracy:[0.10022197399648672, 0.96041423]
Eval loss/accuracy:[0.10022197399648672, 0.96041423]


In [82]:
# Export the trained model to a single file in the working directory.
# NOTE(review): the .h5 extension presumably selects Keras' HDF5 format —
# confirm against the installed TensorFlow version.
model.save('keras_saved_model.h5')

## Test our model (locally)