# Multi-label Patent Classification with Neural Networks

In [None]:
# import packages 
import tensorflow as tf
import pandas as pd  
import numpy as np 
import gzip
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation, Dropout, Input, Embedding, Reshape, Flatten, Conv1D, Conv2D, MaxPool2D, GlobalMaxPool1D, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras import utils
from keras import optimizers, models
from sklearn import model_selection, preprocessing, metrics
from keras.utils import to_categorical
import matplotlib.pyplot as plt
import zipfile
import keras_metrics
import re
import nltk
from textblob import Word


%matplotlib inline

### Loading the Data

Dataset: 

https://www.google.com/googlebooks/uspto-patents-grants-text.html#2015

The parsing and preprocessing of the patent files can be found here: 

https://github.com/cpapadimitriou/W266-Final-Project/blob/master/preparation/parse_xml.py

In [None]:
# load data 

# features 
current_dir = %pwd
abstract = pd.read_table(current_dir+'/out_zipped/docNumberToAbsText.txt.gz',compression='gzip', header=None)
abstract = abstract[0].str.split('|', expand=True).rename(columns={0:'doc_num',1:'abstract'})

claim = pd.read_table(current_dir+'/out_zipped/docNumberToClaimText.txt.gz',compression='gzip',  header=None)
claim = claim[0].str.split('|', expand=True).rename(columns={0:'doc_num',1:'claim'})

desc = pd.read_table(current_dir+'/out_zipped/docNumberToDescText.txt.gz',compression='gzip', header=None)
desc = desc[0].str.split('|', expand=True).rename(columns={0:'doc_num',1:'desc'})

title = pd.read_table(current_dir+'/out_zipped/docNumberToInvTitle.txt.gz',compression='gzip', header=None)
title = title[0].str.split('|', expand=True).rename(columns={0:'doc_num',1:'title'})

# file_name = pd.read_table(current_dir+'/out_zipped/fileNameToDocNumber.txt.gz',compression='gzip', header=None)
# file_name = file_name[0].str.split('|', expand=True).rename(columns={0:'file_name',1:'doc_num'})

In [None]:
# labels
# label_names = pd.read_table(current_dir+'/out_zipped/docNumberToLabelSubClass.txt.gz',compression='gzip', header=None)
# label_names = label_names[0].str.split('|', expand=True).rename(columns={0:'doc_num'})

# Each raw line is split on '|': the first field is the document number, the
# remaining fields become the (integer-named) label columns.
labels = (
    pd.read_table(current_dir+'/out_zipped/docNumberToLabelSubClassCode.txt.gz',
                  compression='gzip', header=None)[0]
    .str.split('|', expand=True)
    .rename(columns={0:'doc_num'})
)

### Data Preparation

In [None]:
# joining the datasets
# Align the four text sections on the document number. pd.concat's default
# outer join keeps documents that lack a section (the gap becomes NaN).
X = pd.concat(
    [section.set_index('doc_num') for section in (title, abstract, claim, desc)],
    axis=1,
).sort_index()  # join='inner'

Y = labels.set_index('doc_num').sort_index()  # .set_index(X.index) # fixing the index mismatch
# Name the label columns label1..labelN from the actual column count instead of
# assuming there are exactly 8 of them (a different count raised a length
# mismatch error).
Y.columns = ["label{}".format(i) for i in range(1, Y.shape[1] + 1)]  # renaming columns

print(X.shape)
print(Y.shape)

In [None]:
# data cleaning
# Drops every document that is missing any text section and keeps X_clean / Y_clean
# row-aligned on the document number.
assert Y['label1'].isnull().sum() == 0 # every document must have at least a first label

print("{} documents with null title".format(X['title'].isnull().sum()))
print("{} documents with null claims".format(X['claim'].isnull().sum()))
print("{} documents with null abstract".format(X['abstract'].isnull().sum()))
print("{} documents with null description".format(X['desc'].isnull().sum()))

# remove documents where ANY section (title, abstract, claim or description) is null
X_clean = X.dropna(how='any')
null_idx = X[~X.index.isin(X_clean.index)].index # storing the removed indices (i.e. document numbers)
assert X.shape[0] - null_idx.shape[0] == X_clean.shape[0] # making sure the row counts match

# removing the documents with null sections from the labels as well
# (label-based lookup: every document number in X_clean must also exist in Y)
Y_clean = Y.loc[X_clean.index]

# some checks: same number of rows and identical index order in X_clean and Y_clean
assert X_clean.shape[0] == Y_clean.shape[0]
assert ((Y_clean.index == X_clean.index)*1).sum() == X_clean.shape[0]

# lower-casing everything (text sections and label codes alike)
X_clean = X_clean.apply(lambda x: x.str.lower())
Y_clean = Y_clean.apply(lambda x: x.str.lower())

In [None]:
# Creating dataset
data = pd.DataFrame()

# Build the full document text by chaining the sections, separated by one
# space, in the order title -> claim -> abstract -> description.
full_text = X_clean['title']
for section_name in ('claim', 'abstract', 'desc'):
    full_text = full_text + " " + X_clean[section_name]
data['full_text'] = full_text

# first label only
data['label1'] = Y_clean['label1']

# all labels: one list per document, with the falsy placeholders (None / empty
# string) of the unused label slots dropped
data['labels'] = Y_clean.values.tolist()
data['labels'] = data['labels'].apply(lambda row: [code for code in row if code])

#### Note: we can potentially remove documents with fewer than 600 words

In [None]:
# number of whitespace-separated tokens in each document
data["doc_lenghts"] = data["full_text"].str.split().apply(len)

# shape (rows, columns) of the subset of documents shorter than 600 tokens
data.loc[data["doc_lenghts"] < 600].shape

### Encoding Labels with MultiLabelBinarizer

In [None]:
# Turn the per-document label lists into a 0/1 indicator matrix:
# one row per patent document, one column per distinct label.
mlb = preprocessing.MultiLabelBinarizer()
labels = mlb.fit_transform(data['labels'])

# Sanity check: the total number of labels listed before the conversion must
# equal the total number of ones in the indicator matrix.
n_labels_listed = data['labels'].map(len).sum()
n_labels_encoded = labels.sum()
assert n_labels_listed == n_labels_encoded

In [None]:
# labels_df.iloc[:,[538]].sum()   #470

In [None]:
labels_df = pd.DataFrame(labels, columns=mlb.classes_)  #.apply(lambda x: x.sum(),axis=1)
labels_df.head()

In [None]:
# how often each label (i.e. class) occurs in the data, most frequent first
# we observe that the classes are a little unbalanced
pl = labels_df.sum(axis=0).sort_values(ascending=False)  #.plot.bar()
pl

### Advanced text processing and TF_IDF 
Take the `data['full_text']` and for each patent return an observation with: 
- 600 most frequent words based on word frequency/tf_idf algorithm (make sure you do checks to ensure it is working properly)
- remove punctuation 
- lowercasing (happened earlier no need to do it here)
- remove stop words 
- remove common / rare words
- lemmatization or stemming
- removing common words
- remove digits (TO DO) 

In [None]:
data['full_text'].head()

### 1. Advanced Processing

In [None]:
# removing punctuation: drop every character that is neither a word character
# nor whitespace.
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave all punctuation in place.
data['full_text_proc'] = data['full_text'].str.replace(r'[^\w\s]', '', regex=True)
data['full_text_proc'].head()

In [None]:
# lower - casing 
#data['full_text_proc'] = data['full_text_proc'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [None]:
#[201808012200 PT]
# lower-case every document
data['full_text_proc'] = data['full_text_proc'].map(str.lower)

In [None]:
#[201808012200 PT]
import nltk
nltk.download('stopwords')
stop = nltk.corpus.stopwords.words('english')
# One alternation of all stop words, anchored on word boundaries so that only
# whole words match.
stop_alternation = '|'.join(stop)
pattern_stop = r'\b(?:' + stop_alternation + r')\b'


In [None]:
#[201808012200 PT]
# Remove the stop words, then collapse the whitespace runs they leave behind.
# regex=True must be explicit: pandas >= 2.0 defaults to literal matching, in
# which case neither pattern would ever match.
data['full_text_proc_no_stop'] = (
    data['full_text_proc']
    .str.replace(pattern_stop, '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
)

#data['full_text_proc_no_stop'] = data['full_text_proc'].replace(to_replace=pattern_stop, value="",regex=True)
#data['full_text_proc_no_stop'] = data['full_text_proc_no_stop'].str.replace(r'\s+', ' ')

In [None]:
#[201808012200 PT]
data['full_text_proc'] = data['full_text_proc_no_stop']


In [None]:
#[201808012200 PT] Handled in above cell already
# removing stop words 
# import nltk
# nltk.download('stopwords')
# stop = nltk.corpus.stopwords.words('english')
# data['full_text_proc'] = data['full_text_proc'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
# data['full_text_proc'].head()

In [None]:
# Common word removal 
# we can remove common words as their presence will not be of any use for our classification problem 
freq_words = pd.Series(' '.join(data['full_text_proc']).split()).value_counts()[:10] # chose the number here 
freq_words

In [None]:
#[201808012200 PT]
freq_words = list(freq_words.index)
pattern_freq_words = r'\b(?:{})\b'.format('|'.join(freq_words))


In [None]:
#[201808012200 PT]
# Remove the most frequent words, then collapse the leftover whitespace runs.
data['full_text_proc'] = data['full_text_proc'].replace(to_replace=pattern_freq_words, value="", regex=True)
# regex=True must be explicit: pandas >= 2.0 would otherwise look for the
# literal text '\s+' and never collapse anything.
data['full_text_proc'] = data['full_text_proc'].str.replace(r'\s+', ' ', regex=True)

In [None]:
#[201808012200 PT] Handled in above cell already
# Decide if we want to remove these
# freq_words = list(freq_words.index)
# data['full_text_proc'] = data['full_text_proc'].apply(lambda x: " ".join(x for x in x.split() if x not in freq_words))
# data['full_text_proc'].head()

In [None]:
# Rare word removal 
# Similarly, just as we removed the most common words, this time let’s remove rarely occurring words from the text. 
# Because they’re so rare, the association between them and other words is dominated by noise. 
rare_words = pd.Series(' '.join(data['full_text_proc']).split()).value_counts()[-10:]
rare_words

In [None]:
#[201808012200 PT]
rare_words = list(rare_words.index)
pattern_rare_words = r'\b(?:{})\b'.format('|'.join(rare_words))


In [None]:
#[201808012200 PT]
# Remove the selected rare words, then collapse the leftover whitespace runs.
data['full_text_proc'] = data['full_text_proc'].replace(to_replace=pattern_rare_words, value="", regex=True)
# regex=True must be explicit: pandas >= 2.0 would otherwise look for the
# literal text '\s+' and never collapse anything.
data['full_text_proc'] = data['full_text_proc'].str.replace(r'\s+', ' ', regex=True)

In [None]:
#[201808012200 PT] Handled in above cell already
# rare_words = list(rare_words.index)
# data['full_text_proc'] = data['full_text_proc'].apply(lambda x: " ".join(x for x in x.split() if x not in rare_words))

In [None]:
data['full_text_proc'].head()

In [None]:
# Spelling correction (this will take a while to run - let's think about whether we need it. Maybe for the foreign language words?)
# Maybe there is another library to remove foreign language text? 

# from textblob import TextBlob
# data['full_text_proc'] = data['full_text_proc'][:5].apply(lambda x: str(TextBlob(x).correct()))

In [None]:
# Stemming: removal of suffixes, like “ing”, “ly”, “s”, etc.
# from nltk.stem import PorterStemmer
# st = PorterStemmer()
# data['full_text_proc'] = data['full_text_proc'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

In [None]:
# Lemmatization: converts the word into its root word, rather than just stripping the suffixes.
# use this instead of stemming
import nltk
nltk.download('wordnet')

from textblob import Word
# Each document is split on whitespace, every token is lemmatized on its own and
# the tokens are re-joined with single spaces.
# NOTE(review): lemmatize() is called without a part-of-speech argument, so
# presumably every token is treated with TextBlob's default POS - confirm that
# this is the intended behavior for verbs/adjectives.
data['full_text_proc'] = data['full_text_proc'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
data['full_text_proc'].head()

In [None]:
# Remove Digits
# Raw-string pattern: "\d" inside a plain string literal is an invalid escape
# sequence (a warning today, an error in a future Python release).
# regex=True is explicit because pandas >= 2.0 defaults to literal matching.
data['full_text_proc'] = data['full_text_proc'].str.replace(r"\d+", "", regex=True)

In [None]:
#[201808012200 PT]
#data.to_pickle('./saved_df_zipped/data_after_lemmatization.pkl.gz', compression='gzip')


In [None]:
#[201808012200 PT]
#data = pd.DataFrame()
#data = pd.read_pickle("./saved_df_zipped/data_after_lemmatization.pkl.gz", compression='gzip')
#data.info()
#data.head

### 2. TF-IDF (pending)

In [None]:
##https://medium.com/@acrosson/summarize-documents-using-tf-idf-bdee8f60b71

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [None]:
max_length = 600

In [None]:
#[201808012200 PT]
from sklearn.feature_extraction.text import CountVectorizer


In [None]:
#[201808012200 PT]
# Learn the vocabulary and build the document-term count matrix in one pass.
count_vect = CountVectorizer(
    lowercase=True,
    analyzer='word',
    stop_words='english',
    ngram_range=(1,1),
)
freq_term_matrix = count_vect.fit_transform(data['full_text_proc'])


In [None]:
#[201808012200 PT]
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(smooth_idf=False,sublinear_tf=False)
tfidf.fit(freq_term_matrix)

In [None]:
#[201808012200 PT]
# The count matrix of these exact documents already exists as
# `freq_term_matrix`; reuse it instead of vectorizing the whole corpus again.
doc_freq_term = freq_term_matrix
doc_tfidf_matrix = tfidf.transform(doc_freq_term)


In [None]:
#[201808012200 PT]
# Vocabulary terms, ordered by column index of the count / tf-idf matrices.
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever one the installed version provides.
_vocab_getter = getattr(count_vect, 'get_feature_names_out', None) or count_vect.get_feature_names
feature_names = np.array(_vocab_getter())
feature_names.shape

In [None]:
#[201808012200 PT]
doc_tfidf_matrix.shape

In [None]:
#[201808012200 PT]
## This step has to be repeated in chunks till 78372 then appended into a single array

doc_tfidf_matrix_dense_arr=doc_tfidf_matrix[range(0,1000),:].toarray()

In [None]:
#[201808012200 PT]
#Append command
#doc_tfidf_matrix_dense=np.append(doc_tfidf_matrix_dense_tmp1,doc_tfidf_matrix_dense_tmp2,axis=0)doc_tfidf_matrix_dense

In [None]:
#[201808012200 PT]
# For every document keep the `max_length` vocabulary terms with the highest
# tf-idf score (ascending argsort, so the best terms are the last columns).
# The sparse matrix is densified one slice of rows at a time and the argsort is
# cut down to the last `max_length` columns BEFORE it is used to index the
# vocabulary, so neither a full dense (documents x vocabulary) score matrix nor
# a full matrix of strings is ever materialized.
# (The original line referenced `doc_tfidf_matrix_dense`, which is never
# defined.)
chunk_rows = 1000
test_words = np.vstack([
    feature_names[
        np.argsort(doc_tfidf_matrix[start:start + chunk_rows].toarray())[:, -max_length:]
    ]
    for start in range(0, doc_tfidf_matrix.shape[0], chunk_rows)
])
test_words.shape


In [None]:
#[201808012200 PT]
# For each document keep only those top-ranked terms that really occur in its
# text (the ranking also returns zero-score terms when a document has fewer
# distinct words than `max_length`).
# The documents are walked positionally with zip(): `data` is indexed by
# document number, so integer `[]` lookups on the Series rely on a deprecated
# positional fallback.
z = [
    list(set(top_terms).intersection(re.sub(r"[^\w]", " ", doc_text).split()))
    for top_terms, doc_text in zip(test_words, data['full_text_proc'])
]

z

In [None]:
#[201808012200 PT]
data['full_text_proc_final'] = pd.Series(z, index=data.index).apply(lambda x : " ".join(x))

In [None]:
# TF-IDF to eliminate word count
# We can also perform basic pre-processing steps like lower-casing and removal of stopwords (but we did this earlier)

tfidf = TfidfVectorizer(lowercase=True, analyzer='word', smooth_idf=False, sublinear_tf=False, norm=None,
                        stop_words='english', ngram_range=(1,1))

# Kept sparse on purpose: a dense (documents x vocabulary) array of the whole
# corpus does not fit in memory, which is why the original could only be run on
# a subset of the documents.
text_transformed = tfidf.fit_transform(data['full_text_proc'])

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever one the installed version provides.
_vocab_getter = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
feature_names = np.array(_vocab_getter())

# Per document, the `max_length` terms with the highest tf-idf score. Rows are
# densified in slices and the argsort is trimmed before it indexes the
# vocabulary, so memory use stays bounded by the slice size.
chunk_rows = 1000
test_words = np.vstack([
    feature_names[
        np.argsort(text_transformed[start:start + chunk_rows].toarray())[:, -max_length:]
    ]
    for start in range(0, text_transformed.shape[0], chunk_rows)
])

# Keep only the top terms that really occur in the document. Documents are
# walked positionally with zip() because `data` is indexed by document number,
# so integer `[]` lookups would rely on a deprecated positional fallback.
z = [
    list(set(top_terms).intersection(re.sub(r"[^\w]", " ", doc_text).split()))
    for top_terms, doc_text in zip(test_words, data['full_text_proc'])
]

data['full_text_proc_final'] = pd.Series(z, index=data.index).apply(lambda x : " ".join(x))

### Train Test Split

In [None]:
# train test split
# you can change to data['label1'] to include only the first label
# random_state is fixed so that the split - and therefore every metric reported
# further down - is reproducible from one run of the notebook to the next.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    data['full_text_proc_final'], labels, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

In [None]:
X_train.head()

In [None]:
# print("Avg. number of labels in train set: {}".format(Y_train.apply(lambda x: len(x)).mean()))
# print("Avg. number of labels in test set: {}".format(Y_test.apply(lambda x: len(x)).mean()))

### Word Embeddings

**Load the pre-trained embeddings using this command:**

`nohup curl -O https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki-news-300d-1M.vec.zip > curl.nohup.out 2>&1 &`

Check loading progress with this: `cat curl.nohup.out`

Unzipping the file with python: 

In [None]:
current_dir

In [None]:
# sequence length is set to 600 (we will choose the 600 most frequent words in each document)
sequence_length = 600
embedding_dim = 300

In [None]:
# word-embeddings: representing documents using a dense vector representation
# Word embeddings can be trained using the input corpus itself or 
# can be generated using pre-trained word embeddings such as Glove, FastText, and Word2Vec

# step 1. Loading the pretrained word embeddings
# The first line of the file is skipped (skiprows=1); every following row is
# split on whitespace into the token and its vector components.
# keep_default_na=False: without it pandas parses tokens such as "null", "nan"
# or "NA" as missing values, so those words would end up under a NaN key
# instead of their own text and could never be looked up.
df_tmp1=pd.read_table('../wiki-news-300d-1M.vec.zip', compression='zip', sep=r'\s+', header=None,
                      engine='python', skiprows=1, keep_default_na=False)
columns=['word', 'vector']
df_tmp2=pd.DataFrame(columns=columns)
df_tmp2['word']=df_tmp1[df_tmp1.columns[0]]
df_tmp2['vector']=np.asarray(df_tmp1[df_tmp1.columns[1:]], dtype='float32').tolist()
# token -> embedding vector (list of floats)
embeddings_index=pd.Series(df_tmp2.vector.values, index=df_tmp2.word).to_dict()

Link to Tokenizer keras object: https://keras.io/preprocessing/text/

This takes care of:
- num of words to keep based on frequency (`num_words`).
- filtering out punctuation: The **default** is all punctuation, plus tabs and line breaks, minus the ' character.
- lower-casing: convert the texts to lowercase.

In [None]:
# step 2. Creating a tokenizer object using Keras preprocessing object
# the tokenizer has a default filter that removes all punctuation, plus tabs and line breaks, minus the ' character.
token = text.Tokenizer(lower=True) # num_words=sequence_length
# Fit on the processed text, because that is the text X_train / X_test are
# split from and later converted to sequences. Fitting on the raw `full_text`
# built a vocabulary (and an embedding matrix row) for every raw token that is
# never fed to the model, while processed tokens absent from the raw text
# (e.g. lemmas) were silently dropped from the sequences.
token.fit_on_texts(data['full_text_proc_final'])
word_index = token.word_index

In [None]:
#word_index

In [None]:
print('Number of words in our vocabulary: {}'.format(len(word_index.keys())))

In [None]:
# step 3. Transforming text documents to sequence of tokens and padding them to ensure equal length vectors
# every document is padded / truncated to exactly `sequence_length` token ids
X_train_seq = sequence.pad_sequences(token.texts_to_sequences(X_train), maxlen=sequence_length)
X_test_seq = sequence.pad_sequences(token.texts_to_sequences(X_test), maxlen=sequence_length)

In [None]:
# checking out what this did to the first patent
X_train_seq[1]

In [None]:
print(X_train_seq.shape)
print(X_test_seq.shape)

In [None]:
# step 4. Creating a mapping of tokens and their respective embeddings
# Row r of the matrix holds the pretrained vector of the token whose tokenizer
# index is r; the matrix has one row more than there are tokens.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for token_text, row in word_index.items():
    pretrained_vector = embeddings_index.get(token_text)
    if pretrained_vector is None:
        # tokens without a pretrained vector keep their all-zeros row
        continue
    embedding_matrix[row] = pretrained_vector

In [None]:
embedding_matrix.shape

### Evaluation Metrics

**True Positives (TP)**: the number of labels predicted by our approach (prediction labels) that
matched the IPC labels (true labels), without taking the exact order into account.

**False Positives (FP)**: the labels predicted by our approach (prediction labels) that do not match the true IPC labels. 

**False Negatives (FN)**: the labels that should have been predicted by our approach, but were not.

**True Negatives (TN)**: the labels that, correctly, were not predicted by our approach. 


$$ Precision = \frac{TP}{TP + FP} = \frac{|trueLabels \cap predictionLabels|}{|predictionLabels|} $$

$$ Recall = \frac{TP}{TP + FN} = \frac{|trueLabels \cap predictionLabels|}{|trueLabels|} $$

**Precision** shows the ratio of the predicted labels that are true labels.

**Recall** shows the ratio of the true labels that were predicted correctly. 

After calculating the above metrics for each patent document, we calculate the final Precision, Recall and F1-score across all documents as follows: 

$$Precision_{total} = \frac{1}{TotalSamples} \sum_{i=1}^{TotalSamples} Precision_i$$

$$Recall_{total} = \frac{1}{TotalSamples} \sum_{i=1}^{TotalSamples} Recall_i$$

$$F1_{total} = 2* \frac{Precision_{total}*Recall_{total}}{Precision_{total}+Recall_{total}}  $$

In [None]:
import keras.backend as K

def precision(y_true, y_pred):
    """Batch-wise precision for multi-label predictions.

    Both tensors are clipped to [0, 1] and rounded, so a prediction counts as
    selected once it rounds to 1. The result is the number of selected labels
    that are also true, divided by the number of selected labels, summed over
    the whole batch (not averaged per sample).
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    selected = K.round(K.clip(y_pred, 0, 1))
    # K.epsilon() guards against a division by zero when nothing is selected.
    return K.sum(hits) / (K.sum(selected) + K.epsilon())

def recall(y_true, y_pred):
    """Batch-wise recall for multi-label predictions.

    Both tensors are clipped to [0, 1] and rounded. The result is the number of
    true labels that were also selected, divided by the number of true labels,
    summed over the whole batch (not averaged per sample).
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    relevant = K.round(K.clip(y_true, 0, 1))
    # K.epsilon() guards against a division by zero when there is no true label.
    return K.sum(hits) / (K.sum(relevant) + K.epsilon())

def f1(y_true, y_pred):
    """Batch-wise F1 score: harmonic mean of ``precision`` and ``recall``.

    Calls the module-level ``precision`` and ``recall`` metrics instead of
    carrying private copies of both, so the three metrics cannot drift apart.

    Args:
        y_true: tensor of ground-truth label indicators.
        y_pred: tensor of predicted label scores.

    Returns:
        Scalar tensor ``2 * p * r / (p + r)`` computed over the whole batch.
    """
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    # K.epsilon() keeps the ratio finite when precision and recall are both 0.
    return 2*((p*r)/(p+r+K.epsilon()))

## Defining Models

**Note:** if we want to train our own embeddings we can try the `embeddings_initializer="uniform"` parameter instead of `weights=[embedding_matrix]` in the embedding layer.

### CNN model

In [None]:
# CNN Model Hyper-parameters
# These module-level values are read by CNN_model() when it is called.
# one embedding row per tokenizer entry plus one extra row
# (presumably for the padding index 0 - confirm against the tokenizer)
vocabulary_size = len(word_index) + 1
# the two self-assignments below are no-ops; they only restate that the values
# set earlier in the notebook are reused here
sequence_length = sequence_length
embedding_dim = embedding_dim
num_filters = 100  # number of convolution filters
#filter_sizes = [3]
kernel_size = 3  # width of the 1D convolution window, in tokens

units = Y_train.shape[1]  # we need the output nodes to equal the number of classes (96)

learning_rate = 1e-4  # passed to the Adam optimizer

In [None]:
def CNN_model():
    """Build and compile the 1D-convolutional multi-label classifier.

    Reads the module-level hyper-parameters (``sequence_length``,
    ``vocabulary_size``, ``embedding_dim``, ``embedding_matrix``,
    ``num_filters``, ``kernel_size``, ``units``, ``learning_rate``).

    Returns:
        The compiled Keras model.
    """
    token_ids = Input(shape=(sequence_length,), dtype='int32')

    # Frozen embeddings initialized from `embedding_matrix`, followed by dropout
    # of whole embedding channels.
    x = Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, input_length=sequence_length,
                  weights=[embedding_matrix], trainable=False)(token_ids)
    x = SpatialDropout1D(0.3)(x)

    # Convolution over the token axis, max-pooled over the whole sequence.
    x = Conv1D(num_filters, kernel_size=kernel_size, activation="relu")(x)
    x = GlobalMaxPool1D()(x)

    # Classification head: one sigmoid output per label.
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.25)(x)
    label_probs = Dense(units= units, activation="sigmoid")(x)

    # NOTE: binary cross entropy (rather than categorical CE) is used because
    # each output label is treated as an independent Bernoulli distribution.
    net = models.Model(inputs=token_ids, outputs=label_probs)
    net.compile(optimizer=optimizers.Adam(lr=learning_rate),
                loss='binary_crossentropy',
                metrics=['accuracy', recall, precision, f1]) # 'f1score', 'precision', 'recall'
    return net

### LSTM model

In [None]:
# LSTM Model Hyper-parameters
# These module-level values are read by LSTM_model() when it is called.
# one embedding row per tokenizer entry plus one extra row
# (presumably for the padding index 0 - confirm against the tokenizer)
vocabulary_size = len(word_index) + 1
# the two self-assignments below are no-ops; they only restate that the values
# set earlier in the notebook are reused here
sequence_length = sequence_length
embedding_dim = embedding_dim
lstm_units = 100  # size of the LSTM layer
units = Y_train.shape[1]  # one output node per label column of Y_train

learning_rate = 1e-4  # passed to the Adam optimizer

In [None]:
def LSTM_model():
    """Build and compile the LSTM multi-label classifier.

    Reads the module-level hyper-parameters (``sequence_length``,
    ``vocabulary_size``, ``embedding_dim``, ``embedding_matrix``,
    ``lstm_units``, ``units``, ``learning_rate``).

    Returns:
        The compiled Keras model.
    """
    token_ids = Input(shape=(sequence_length,), dtype='int32')

    # Frozen embeddings initialized from `embedding_matrix`, followed by dropout
    # of whole embedding channels.
    x = Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, input_length=sequence_length,
                  weights=[embedding_matrix], trainable=False)(token_ids)
    x = SpatialDropout1D(0.3)(x)

    # Recurrent encoder: only the final LSTM output is passed on.
    x = LSTM(lstm_units)(x)

    # Classification head: one sigmoid output per label.
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.25)(x)
    label_probs = Dense(units= units, activation="sigmoid")(x)

    net = models.Model(inputs=token_ids, outputs=label_probs)
    net.compile(optimizer=optimizers.Adam(lr=learning_rate),
                loss='binary_crossentropy',
                metrics=['accuracy', recall, precision, f1]) # 'f1score', 'precision', 'recall'
    return net


## Training

#### Reference: 
https://machinelearningmastery.com/diagnose-overfitting-underfitting-lstm-models/

In [None]:
#[201807310100 PT] Testing fit
batch_size = 100
epochs = 2

# Choosing Model 
model_tmp = LSTM_model()  # CNN_model()
model_tmp.summary()

history_tmp = model_tmp.fit(X_train_seq, Y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

In [None]:
#[201807310100 PT] Testing fit
history_tmp.history.keys()

In [None]:
#[201807310100 PT] Testing fit
print(history_tmp.history['loss'])
print(history_tmp.history['val_loss'])
plt.plot(history_tmp.history['loss'])
plt.plot(history_tmp.history['val_loss'])
plt.title('model train vs validation loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

In [None]:
#[201807310100 PT] Testing fit
# Keras records the accuracy metric under 'acc' in older releases and under
# 'accuracy' in newer ones; pick whichever key this history contains.
acc_key = 'acc' if 'acc' in history_tmp.history else 'accuracy'
val_acc_key = 'val_' + acc_key
print("train accuracy:{}".format(history_tmp.history[acc_key]))
print("validation accuracy:{}".format(history_tmp.history[val_acc_key]))
plt.plot(history_tmp.history[acc_key])
plt.plot(history_tmp.history[val_acc_key])
plt.title('model train vs validation accuracy')
plt.ylabel('accuracy')  # was labelled 'loss' although accuracy is plotted
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

In [None]:
# Define Training Parameters 
batch_size = 100
epochs = 5

# Choosing Model 
model = LSTM_model()  # CNN_model()
model.summary()

In [None]:
# Training LSTM
history = model.fit(X_train_seq, Y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

In [None]:
# Define Training Parameters 
batch_size = 100
epochs = 40

# Choosing Model 
model = CNN_model() 
model.summary()

In [None]:
# Training CNN
history = model.fit(X_train_seq, Y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

In [None]:
# score = model.evaluate(X_test_seq, Y_test, verbose=0)
# print('Test score:', score[0])
# print('Test accuracy:', score[1])

### LSTM Model Results

In [None]:
def get_pred_classes(X_test_seq, number_of_labels = 1, estimator=None):
    """Binarize predicted scores by keeping the top-k labels of every sample.

    Args:
        X_test_seq: input passed unchanged to ``predict``.
        number_of_labels: how many of the highest-scoring labels are set to 1
            for each sample.
        estimator: object with a ``predict`` method. Defaults to the
            notebook-level ``model``, so existing calls behave as before;
            passing it explicitly avoids depending on whichever model the
            global name currently points to.

    Returns:
        Float array with the same shape as the predictions, holding 1 at the
        ``number_of_labels`` highest-scoring positions of each row and 0
        elsewhere.
    """
    clf = model if estimator is None else estimator

    # Run inference once; the original predicted twice (once only for the shape).
    pred_proba = clf.predict(X_test_seq)
    preds = np.zeros(shape=pred_proba.shape)

    for i, scores in enumerate(pred_proba):
        # indices of the highest scores, best first
        top_idxs = np.argsort(scores)[::-1][:number_of_labels]
        preds[i][top_idxs] = 1

    return preds

#### Top 1 label prediction

In [None]:
# average = samples : Calculate metrics for each instance, 
# and find their average (only meaningful for multilabel classification)
pred_classes = get_pred_classes(X_test_seq, number_of_labels = 1)
print("precision:" , metrics.precision_score(Y_test, pred_classes, average = 'samples'))
print("recall:" , metrics.recall_score(Y_test, pred_classes, average = 'samples'))
print("f1:" , metrics.f1_score(Y_test, pred_classes, average = 'samples'))

#### Top 2 labels prediction

In [None]:
pred_classes = get_pred_classes(X_test_seq, number_of_labels = 2)
print("precision:" , metrics.precision_score(Y_test, pred_classes, average = 'samples'))
print("recall:" , metrics.recall_score(Y_test, pred_classes, average = 'samples'))
print("f1:" , metrics.f1_score(Y_test, pred_classes, average = 'samples'))

#### Top 10 labels prediction

In [None]:
pred_classes = get_pred_classes(X_test_seq, number_of_labels = 10)
print("precision:" , metrics.precision_score(Y_test, pred_classes, average = 'samples'))
print("recall:" , metrics.recall_score(Y_test, pred_classes, average = 'samples'))
print("f1:" , metrics.f1_score(Y_test, pred_classes, average = 'samples'))

In [None]:
# Plotting the loss
#history.history["loss"]

plt.figure(figsize=(15,5))
# Take the number of epochs from the recorded history rather than from the
# global `epochs`, which other training cells re-assign; a mismatch between the
# two makes plt.plot fail because x and y have different lengths.
N = len(history.history["loss"])
plt.plot(np.arange(0, N), history.history["loss"], label="train_loss")
plt.plot(np.arange(0, N), history.history["val_loss"], label="val_loss")
# plt.plot(np.arange(0, N), history.history["acc"], label="train_acc")
# plt.plot(np.arange(0, N), history.history["val_acc"], label="val_acc")
# plt.plot(np.arange(0, N), history.history["precision"], label="train_precision")
# plt.plot(np.arange(0, N), history.history["val_precision"], label="val_precision")
# plt.plot(np.arange(0, N), history.history["recall"], label="train_recall")
# plt.plot(np.arange(0, N), history.history["val_recall"], label="val_recall")
# plt.plot(np.arange(0, N), history.history["f1"], label="train_f1")
# plt.plot(np.arange(0, N), history.history["val_f1"], label="val_f1")
plt.title("Training Loss (LSTM)")
plt.xlabel("Epochs")
plt.grid()
#plt.ylabel("Loss/Accuracy")
plt.legend(loc="upper right")

### CNN Model Results

#### Top 1 label prediction

In [None]:
pred_classes = get_pred_classes(X_test_seq, number_of_labels = 1)
print("precision:" , metrics.precision_score(Y_test, pred_classes, average = 'samples'))
print("recall:" , metrics.recall_score(Y_test, pred_classes, average = 'samples'))
print("f1:" , metrics.f1_score(Y_test, pred_classes, average = 'samples'))

#### Top 2 labels prediction

In [None]:
pred_classes = get_pred_classes(X_test_seq, number_of_labels = 2)
print("precision:" , metrics.precision_score(Y_test, pred_classes, average = 'samples'))
print("recall:" , metrics.recall_score(Y_test, pred_classes, average = 'samples'))
print("f1:" , metrics.f1_score(Y_test, pred_classes, average = 'samples'))

#### Top 10 labels prediction

In [None]:
pred_classes = get_pred_classes(X_test_seq, number_of_labels = 10)
print("precision:" , metrics.precision_score(Y_test, pred_classes, average = 'samples'))
print("recall:" , metrics.recall_score(Y_test, pred_classes, average = 'samples'))
print("f1:" , metrics.f1_score(Y_test, pred_classes, average = 'samples'))

In [None]:
# Plotting the loss
#history.history["loss"]

plt.figure(figsize=(15,5))
# Take the number of epochs from the recorded history rather than from the
# global `epochs`, which other training cells re-assign; a mismatch between the
# two makes plt.plot fail because x and y have different lengths.
N = len(history.history["loss"])
plt.plot(np.arange(0, N), history.history["loss"], label="train_loss")
plt.plot(np.arange(0, N), history.history["val_loss"], label="val_loss")
# plt.plot(np.arange(0, N), history.history["acc"], label="train_acc")
# plt.plot(np.arange(0, N), history.history["val_acc"], label="val_acc")
# plt.plot(np.arange(0, N), history.history["precision"], label="train_precision")
# plt.plot(np.arange(0, N), history.history["val_precision"], label="val_precision")
# plt.plot(np.arange(0, N), history.history["recall"], label="train_recall")
# plt.plot(np.arange(0, N), history.history["val_recall"], label="val_recall")
# plt.plot(np.arange(0, N), history.history["f1"], label="train_f1")
# plt.plot(np.arange(0, N), history.history["val_f1"], label="val_f1")
plt.title("Training Loss (CNN)")
plt.xlabel("Epochs")
plt.grid()
#plt.ylabel("Loss/Accuracy")
plt.legend(loc="upper right")

### Error Analysis

In [None]:
model.predict(X_test_seq)[100]

In [None]:
# Predict once and reuse the scores: the original re-ran inference on the whole
# test set in every loop iteration just to read a single row.
test_pred_proba = model.predict(X_test_seq)
for i in range(5):
    # index of the highest-scoring label of the i-th test document
    print(test_pred_proba[i].argmax())
#model.predict(X_test_seq)[4].argmax()