# Import Packages

In [None]:
import nltk
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.utils import resample
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score

In [None]:
from keras.models import Sequential,Model
from keras.metrics import categorical_accuracy
from keras import layers, optimizers
from keras.layers import Input, GRU, Dense,LSTM, Dropout, Embedding
from keras.callbacks import EarlyStopping
from keras import regularizers
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot,Tokenizer
from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.layers.normalization import BatchNormalization
import gensim
from gensim.models import Word2Vec
from gensim.models import FastText

# Import pretrained embedding models (automotive industry context)

In [None]:
# Load the two pretrained embedding models from disk (Word2Vec and FastText).
word2vec_CRs = Word2Vec.load("w2v_CRs")
fasttext_CRs = gensim.models.FastText.load("fasttext_CRs.model")

# Prepare dataset

In [None]:
# Both splits are read with pandas' pickle loader, training file first.
# NOTE(review): read_pickle unpickles arbitrary objects; only load trusted files.
train_data, test_data = (
    pd.read_pickle(path) for path in ("trainset-Copy1.txt", "test_set")
)

##### Upsampling

In [None]:
# Classes that are resampled (with replacement) to 1500 rows each.
upsampled_labels = ["I", "G", "E", "C", "D"]

df_i, df_g, df_e, df_c, df_d = (
    train_data[train_data.label == name] for name in upsampled_labels
)
# Rows of every other class are carried over unchanged.
df_rest = train_data[~train_data.label.isin(upsampled_labels)]

# Same settings (and the same seed) for every class.
resample_options = dict(replace=True, n_samples=1500, random_state=72)
df_i_n = resample(df_i, **resample_options)
df_g_n = resample(df_g, **resample_options)
df_c_n = resample(df_c, **resample_options)
df_e_n = resample(df_e, **resample_options)
df_d_n = resample(df_d, **resample_options)

# Upsampled classes first, untouched classes last.
train_data = pd.concat([df_i_n, df_g_n, df_e_n, df_c_n, df_d_n, df_rest])

# Number of rows per label after upsampling.
class_counts = train_data.groupby('label').size()

# data distribution histogram

LABELS = ["A", "B", "C", "D", "E", "F", "I", "J", "K", "L", "M"]
# NOTE(review): the x-axis label text below looks like a leftover placeholder.
plt.xlabel('title of the xlabel', color='black', fontsize='16',
           horizontalalignment='center')
plt.xticks(color='black', rotation='vertical', fontsize='11',
           horizontalalignment='right')
class_counts.plot.bar(x=train_data.label, align='center',
                      color=(0.1, 0.2, 0.9, 0.9))

##### Shuffle data

In [None]:
# Shuffle all rows. The seed matches the one used for the upsampling step so
# the row order is reproducible; Keras' validation_split takes the last rows
# of the training arrays, so an unseeded shuffle changes the validation set
# on every run.
train_data = train_data.sample(frac=1, random_state=72)

#### Remove the mixed class (called class A)

In [None]:
# Drop the mixed class "A" from both splits.
# .copy() makes each result an independent frame, so the later
# `.loc[..., 'LABEL'] = ...` assignments do not raise pandas'
# SettingWithCopyWarning on a filtered slice.
train_data = train_data[train_data.label != "A"].copy()
test_data = test_data[test_data.label != "A"].copy()

#### Convert texts into sequence of words

In [None]:
NUM_WORDS = 20000  # passed to the Tokenizer as its num_words limit
tokenizer = Tokenizer(num_words=NUM_WORDS, lower=True)

# Fit on the training texts only, then turn those texts into index sequences.
train_texts = train_data.CR_s
tokenizer.fit_on_texts(train_texts)
sequences_train = tokenizer.texts_to_sequences(train_texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

#### Save tokenizer for later

In [None]:
# Pickle the fitted tokenizer so the test section can reload it.
with open('tokenizer.pickle', 'wb') as out_file:
    pickle.dump(tokenizer, out_file, protocol=pickle.HIGHEST_PROTOCOL)

##### encode labels

In [None]:
# Distinct labels in order of first appearance, and their positional index.
functions = train_data.label.unique()
dic = {func: position for position, func in enumerate(functions)}
labels = train_data.label.apply(dic.__getitem__)

In [None]:
# Fixed label -> class id table: the position in this list is the id written
# to the 'LABEL' column. Rows whose label is not listed are not assigned.
for class_id, class_name in enumerate(["B", "C", "D", "E", "F", "G", "I", "J"]):
    train_data.loc[train_data['label'] == class_name, 'LABEL'] = class_id

##### pad sequences to the same length

In [None]:
# No maxlen is passed here; the resulting width is reused for the test set.
X_train = pad_sequences(sequences_train)
train_class_ids = train_data['LABEL']
y_train = to_categorical(train_class_ids, num_classes=8)
for caption, tensor in (('Shape of X train tensor:', X_train),
                        ('Shape of label train tensor:', y_train)):
    print(caption, tensor.shape)

# Prepare  embedding matrix 

In [None]:
# Build the embedding matrix: one row per tokenizer index, filled from the
# pretrained model wherever the word is known to it.
embed = word2vec_CRs
# NOTE(review): `wv.syn0` is deprecated in newer gensim releases; confirm the
# installed version before upgrading.
pretrained_weights = embed.wv.syn0
vocab_size, emdedding_size = pretrained_weights.shape
# +1 because row 0 is left for padding (tokenizer indices start at 1).
MAX_NB_WORDS = len(tokenizer.word_index) + 1
MAX_SEQUENCE_LENGTH = X_train.shape[1]

#####

EMBEDDING_DIM = emdedding_size
nb_words = MAX_NB_WORDS
# Initialise every row with small random values in [-0.1, 0.1); rows of words
# missing from the pretrained model keep these values (they are NOT zeroed).
ft_matrix = (np.random.rand(nb_words, EMBEDDING_DIM) - 0.5) / 5.0
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    try:
        ft_matrix[i] = embed.wv[word]
    except KeyError:
        # Only the "word not in the pretrained vocabulary" case is skipped;
        # any other error (e.g. a dimension mismatch) now surfaces instead of
        # being silently swallowed.
        continue

# Build GRU model

In [None]:
# GRU classifier: frozen pretrained embedding -> GRU -> dropout -> softmax.
max_features = len(tokenizer.word_index) + 1
input_dim = X_train.shape[1]  # padded sequence length, not a feature count
# Size the output layer from the one-hot targets so it always matches y_train,
# rather than from the number of distinct labels seen in the data.
num_classes = y_train.shape[1]

inputs = Input(name='inputs', shape=[input_dim])
layer = Embedding(input_dim=max_features,
                  output_dim=EMBEDDING_DIM,
                  weights=[ft_matrix],
                  trainable=False)(inputs)
# return_sequences=False: only the final GRU state feeds the classifier, so the
# model emits one (batch, num_classes) prediction per sample. Returning the full
# sequence would make the softmax output 3-D and incompatible with y_train.
gru_out = GRU(100, dropout=0.2, recurrent_dropout=0.2,
              return_sequences=False)(layer)
gru_out = Dropout(0.5)(gru_out)
output = Dense(num_classes, activation='softmax')(gru_out)
# `inputs=` / `outputs=` are the Keras 2 keyword names (`input=` / `output=`
# are deprecated aliases).
model = Model(inputs=[inputs], outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

#### Set weights for classes to resolve the imbalanced input (used if we did not do the upsampling)

In [None]:
#from sklearn.utils import class_weight
#class_weights = class_weight.compute_class_weight('balanced',
  #                                               np.unique(train_data.label),
   #                                              train_data.label)

# Training the model

In [None]:
# Stop once val_loss has failed to improve by at least 1e-4 for 3 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)

fit_options = dict(
    validation_split=0.1,
    epochs=15,
    batch_size=16,
    callbacks=[early_stopping],
    shuffle=True,
)
history = model.fit(X_train, y_train, **fit_options)


********
# Model Evaluation
******

##### Training accuracy vs validation accuracy

In [None]:
# Plot training vs. validation accuracy per epoch.
# Older Keras releases record this metric as 'acc'/'val_acc'; from Keras 2.3
# on, the history uses the name given to compile(), i.e. 'accuracy'. Pick
# whichever key is present so the plot does not fail with a KeyError.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.figure(figsize=(8, 8))
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

##### Training loss vs validation loss 

In [None]:
# Plot training vs. validation loss per epoch (training curve drawn first).
plt.figure(figsize=(8, 8))
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

*******
# Testing the model on unseen CRs
*******

#### Load tokenizer

In [None]:
# Reload the pickled tokenizer.
# NOTE(review): unpickling runs arbitrary code; only load a trusted file.
with open('tokenizer.pickle', 'rb') as in_file:
    loaded_tokenizer = pickle.load(in_file)

#### Prepare test data

In [None]:
# Same fixed label -> class id table as for the training data: the position in
# this list is the id written to the 'LABEL' column.
for class_id, class_name in enumerate(["B", "C", "D", "E", "F", "G", "I", "J"]):
    test_data.loc[test_data['label'] == class_name, 'LABEL'] = class_id
######

# Tokenize with the reloaded tokenizer and pad to the training width.
train_width = X_train.shape[1]
sequences_test = loaded_tokenizer.texts_to_sequences(test_data.CR_s)
X_test = pad_sequences(sequences_test, maxlen=train_width)
test_class_ids = test_data['LABEL']
y_test = to_categorical(test_class_ids, num_classes=8)
print('Shape of X train and X test tensor:', X_test.shape)
print('Shape of label train and test tensor:', y_test.shape)
model.evaluate(X_test, y_test)

# Save Model

In [None]:
# Persist the model in two parts: weights first, then the architecture.
model.save_weights('model_weights.h5')

# The architecture is serialised to JSON text and written to its own file.
with open('model_architecture.json', 'w') as json_file:
    architecture_json = model.to_json()
    json_file.write(architecture_json)

In [None]:
from keras.models import model_from_json

# Model reconstruction from JSON file.
# The file names match what the save step writes ('model_architecture.json'
# and 'model_weights.h5'); the previous 'model_GRU_*' names are never created
# by this notebook, so reloading would fail or pick up stale files.
with open('model_architecture.json', 'r') as f:
    loaded_model = model_from_json(f.read())

# Load weights into the new model
loaded_model.load_weights('model_weights.h5')

In [None]:
# A model rebuilt from JSON has to be compiled before it can be evaluated.
loaded_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
score = loaded_model.evaluate(X_test, y_test, verbose=0)
# Index 1 is the second reported value, after the loss.
metric_name, metric_value = loaded_model.metrics_names[1], score[1]
print("%s: %.2f%%" % (metric_name, metric_value * 100))