## Import delle librerie

In [11]:
from ipywidgets import interact_manual
from ipywidgets import widgets

import re
import string
import pickle
import numpy as np
import pandas as pd
import progressbar
import matplotlib.pyplot as plt
import os
import PyPDF2

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import nltk
from nltk.corpus import stopwords

from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras.callbacks import EarlyStopping

# Fixed sequence length that every tokenized text is padded/truncated to.
maxlen = 600

# Size of the text-area widget used for interactive input (width and height).
layout = widgets.Layout(
    height='300px',
    width='auto',
)

## Test modelli keras

### Scelta del modello da utilizzare

In [2]:
# Choose the model to use: one of fastext, conv1d, bilstm.
# The value selects the `models/<key_model>/` directory the artifacts are loaded from.
key_model = "conv1d"

### Import tokenizer

In [3]:
# Restore the tokenizer that was pickled for the selected model.
with open('models/{}/tokenizer.pickle'.format(key_model), 'rb') as handle:
    tokenizer = pickle.load(handle)

In [4]:
# Sanity check: show the class of the unpickled tokenizer.
print(type(tokenizer))

<class 'keras_preprocessing.text.Tokenizer'>


### Import label encoder

In [5]:
# Restore the label encoder that was pickled for the selected model.
with open('models/{}/label_encoder.pickle'.format(key_model), 'rb') as handle:
    label_encoder = pickle.load(handle)

In [6]:
# Sanity check: show the class of the unpickled label encoder.
print(type(label_encoder))

<class 'sklearn.preprocessing._label.LabelEncoder'>


### Import del modello

Attraverso il metodo **load_model** è possibile ricaricare un modello precedentemente addestrato e salvato tramite il metodo **save()**.
Tramite il metodo **save()** vengono salvate le seguenti informazioni:
*   L'architettura del modello
*   I pesi del modello
*   La configurazione di training
*   Lo stato dell'optimizer, per permettere di riprendere l'addestramento esattamente da dove è terminato

In [7]:
# Reload the trained Keras model saved for the selected architecture.
model = load_model('models/{}/model.h5'.format(key_model))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [8]:
# Print the layer-by-layer architecture of the loaded model.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 600, 100)          3250400   
_________________________________________________________________
dropout_1 (Dropout)          (None, 600, 100)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 598, 250)          75250     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 250)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               62750     
_________________________________________________________________
dropout_2 (Dropout)          (None, 250)               0         
_________________________________________________________________
activation_1 (Activation)    (None, 250)              

## Test del modello

In [None]:
# Base directory of the documents; `filepath` is appended to it to build the full path.
rootpath = 'C:/Users/daniele/Documents/archiviazione_automatica/clienti/'

In [None]:
# PDF to classify, given relative to `rootpath`.
filepath = 'Cliente GOTTARDO/010987-160498 - CONSULENZA JBPM-PORTALE/Contratto/MP16G21.pdf'

In [None]:
# Extract the text of the first page of the selected PDF into `text`,
# with newlines removed.
try:
    # The context manager closes the file even when parsing fails.
    with open(rootpath + filepath, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        pageObj = pdfReader.getPage(0)
        text = pageObj.extractText().replace('\n', '')
except Exception as err:
    # Best-effort cell: report the failure together with its cause instead of
    # hiding it behind a bare `except` (which would also trap KeyboardInterrupt).
    # On failure `text` is left as it was before this cell ran.
    print('There was an error opening the file!')
    print(repr(err))

In [9]:
def get_prediction(sentence):
    """Classify a piece of text with the loaded model and print the result.

    The text is tokenized with the module-level ``tokenizer``, padded to
    ``maxlen``, and passed through ``model``. The predicted category name
    (decoded with ``label_encoder``) and its score are printed.

    Args:
        sentence: Raw text to classify.

    Returns:
        None. The result is printed only.
    """
    # Tokenization
    sequences = tokenizer.texts_to_sequences([sentence])
    # Preprocessing: pad/truncate to the fixed input length
    x_sent = pad_sequences(sequences, maxlen=maxlen)
    # Evaluation: a single forward pass; the class index is taken from the
    # score vector with argmax instead of a second pass through
    # `predict_classes`, which is not available in recent Keras releases.
    # NOTE(review): assumes the model outputs one score per class
    # (multi-class head) - confirm against the saved model.
    prob_array = model.predict(x_sent)[0]
    class_index = int(np.argmax(prob_array))
    probability = prob_array[class_index]
    result = label_encoder.inverse_transform([class_index])
    # Visualization
    print("La categoria è " + str(result[0]) + " con probabilità " + str(probability))

In [None]:
# Classify the text extracted from the PDF.
get_prediction(text)

In [10]:
# Interactive form: type a text, press the button, and get_prediction is run on it.
# The trailing semicolon suppresses the cell's return value in the notebook.
interact_manual(
    get_prediction,
    sentence=widgets.Textarea(layout=layout, placeholder='Type your sentence here'),
);

interactive(children=(Textarea(value='', description='sentence', layout=Layout(height='300px', width='auto'), …

## Riaddestramento del modello

In [None]:
# Load the new samples used for retraining; the 'abstract' and 'primary_cat'
# columns are read from this frame further down.
data = pd.read_csv('../data/new_papers_for_retraining.csv')

In [None]:
# Plain Python list of the abstract texts.
abstract_list = data['abstract'].tolist()

In [None]:
def cleanupDoc(s):
    """Remove English stop words from a text.

    The text is split on whitespace and every word found in NLTK's English
    stop-word list is dropped; the comparison is exact (case-sensitive).

    Args:
        s: Text to clean.

    Returns:
        The remaining words joined by single spaces.
    """
    stopset = set(stopwords.words('english'))
    # The previous version also ran nltk.word_tokenize(s) but never used the
    # result; the filtering has always operated on the whitespace split.
    return " ".join(word for word in s.split() if word not in stopset)

In [None]:
# Strip stop words from every abstract, showing a progress bar while iterating.
# The per-abstract nltk.word_tokenize call was dropped: its result was never used.
abstract_list_cleaned = [
    cleanupDoc(abstract) for abstract in progressbar.progressbar(abstract_list)
]

In [None]:
# Target column: the category of each sample.
labels = data['primary_cat']

In [None]:
# Plot the distribution of the labels.
plt.hist(labels)

In [None]:
values = np.array(labels)
# Integer-encode the category names.
# NOTE(review): this replaces the label encoder loaded from disk with one
# fitted only on the new data - confirm that the resulting classes (and their
# order) still match the output layer of the loaded model.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
# One-hot encode the integer labels; the encoder needs a 2-D column vector.
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(-1, 1)
Y_train = onehot_encoder.fit_transform(integer_encoded)
# Fixed: the original printed the undefined name `Y` (NameError).
print(Y_train)

In [None]:
# NOTE(review): re-fitting the loaded tokenizer updates its vocabulary with the
# new texts - confirm the resulting word indices stay compatible with the
# embedding layer of the already-trained model.
tokenizer.fit_on_texts(abstract_list_cleaned)
X = tokenizer.texts_to_sequences(abstract_list_cleaned)
# Pad/truncate to the fixed length used at prediction time. This defines
# X_train, which the model.fit call below uses but which was never assigned.
X_train = pad_sequences(X, maxlen=maxlen)

In [None]:
batch_size = 64 # batch size used for training
epochs = 10  # number of epochs passed to model.fit

In [None]:
# Stop training once the validation loss has gone 5 epochs without improving.
callback = EarlyStopping(
    monitor='val_loss',
    mode='auto',
    patience=5,
    min_delta=0,
    baseline=None,
    restore_best_weights=False,
    verbose=0,
)

# Continue training the loaded model; 10% of the samples are held out for validation.
history = model.fit(
    X_train,
    Y_train,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[callback],
    validation_split=0.1,
)

In [None]:
# Plot the training and validation loss recorded for each epoch.
for history_key, legend_label in (('loss', 'train'), ('val_loss', 'test')):
    plt.plot(history.history[history_key], label=legend_label)
plt.legend()
plt.show()