In [1]:
import os
import numpy as np
import pandas as pd
import nltk
import keras
import six
import matplotlib.pyplot as plt
import json
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import TimeDistributed,Conv1D,Dense,Embedding,Input,Dropout,LSTM,Bidirectional,AveragePooling1D,Flatten,concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import *

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
stop_words = set(stopwords.words('spanish'))

[nltk_data] Downloading package punkt to /home/roberto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/roberto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/roberto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/roberto/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
#Abro el archivo en el que se encuentra el dataset de los problemas
with open('singleop.json', 'r') as f:
    datastore = json.load(f)

In [4]:
# Archivo que contiene un listado de nombres
nombres = pd.read_csv('nombres-2015.csv')
names = pd.read_csv('yob2019.txt', header=None)    

In [5]:
dataset2 = pd.read_csv('problemas_adicionales2.csv')

In [6]:
preguntas = []
respuestas = []
ecuaciones = []
alineacion = []

for item in datastore:
    preguntas.append(item['sQuestion'])
    respuestas.append(item['lSolutions'])
    ecuaciones.append(item['lEquations'])
    alineacion.append(item['lAlignments'])

In [7]:
# Necesito convertir el dataset en un problema de clasificacion para que la red neuronal pueda identificar
# si estoy tratando de resolver un problema de sumas, restas, multiplicaciones o divisiones.
# Esto va a crear una lista con el tipo de operacion y que va a ser el resultado a inferir.
operaciones = []
sumas =0
restas =0
multiplicaciones =0
divisiones = 0
otras = 0
#Clasifico las operaciones en 0 para sumas, 1 para restas, 2 para multiplicaciones, 3 para divisiones y 4 sino lo encuentro.
for operacion in ecuaciones:
    if (operacion[0].find('+')>=0):
        operaciones.append(0)
        sumas = sumas + 1
    elif (operacion[0].find('-') >= 0 ):
        operaciones.append(1)
        restas = restas + 1
    elif(operacion[0].find('*') >=0):
        operaciones.append(2)
        multiplicaciones = multiplicaciones + 1
    elif(operacion[0].find('/')):
        operaciones.append(3)
        divisiones = divisiones + 1
    else:
        operaciones.append(4)
        otras = otras + 1

print('Tengo ', sumas, ' sumas ', restas, ' restas, ', multiplicaciones, ' multiplicaciones, ', divisiones, ' divisiones y otras operaciones ', otras)

Tengo  159  sumas  162  restas,  117  multiplicaciones,  124  divisiones y otras operaciones  0


In [8]:
preguntas2 = dataset2['Preguntas'].tolist()
respuestas2 = dataset2['respuestas'].tolist()

In [9]:
preguntas3 = preguntas + preguntas2
respuestas3 = operaciones + respuestas2

In [10]:
#El listado de nombres lo voy a truncar a los 15K primeros, dado que el resto son nombres muy residuales.
nombres_ = nombres['nombre'][:15000]

In [11]:
names_= names[0]

In [12]:
nombres_ = nombres_.append(names_)

In [13]:
st_words = list(stop_words)

In [14]:
nom = nombres['nombre'].values.tolist()

In [15]:
nomb =  nom + st_words

In [16]:
nomb[-1]

'sea'

In [17]:
# El vector preguntas_sin, consiste en las preguntas a las que voy a eliminar todos los nombres propios que no
# anaden ningun valor al conjunto de preguntas. No quiero que esos nombres se procesen y por tanto los elimino.
def eliminar_palabras(dataset, stopw):
    preguntas_sin = []
    for palabras in dataset:
        frases = [word for word in palabras.split(' ') if word not in stopw]
        frases = " ".join(frases)
        preguntas_sin.append(frases)
    return preguntas_sin

In [18]:
preguntas_sin = eliminar_palabras(preguntas3, nomb)

In [19]:
from sklearn.utils import shuffle

preguntas3, respuestas3 = shuffle(preguntas_sin,respuestas3)

In [55]:
frases = []
tokenizer = RegexpTokenizer(r'\w+')

for i,pregunta in enumerate(preguntas_sin):
    palabras = tokenizer.tokenize(pregunta)
    preguntas_w = []
    for palabra in palabras:
        preguntas_w.append(palabra)
    frases.append(preguntas_w)

In [21]:
palabras_unicas = set(preguntas_w)

In [65]:
len(palabras_unicas)

2305

In [66]:
p_unicas = list(palabras_unicas)

#Voy a convertir los indices a escala logaritmica para evitar que puedan reventar los pesos en la red neuronal
vocabulario = {p:i*0.0001 for i, p in enumerate(p_unicas)}

In [67]:
tamanoMedio = 0
tamanoTotal = 0

for pregunta in frases:
    if(len(pregunta) > tamanoTotal):
        tamanoTotal = len(pregunta)
    tamanoMedio = tamanoMedio + len(pregunta)

print("El tamano max es ", tamanoTotal, " y la media de la longitud de las frases es de", tamanoMedio/len(frases))

El tamano max es  31  y la media de la longitud de las frases es de 12.803901437371664


In [87]:
# Voy a meter en este vector todas mis preguntas y todas las palabras.
# Para esta prueba, vamos a poner en la posición de la frase, el número de la palabra que estamos procesando.
# El objetivo es procesar las palabras teniendo en cuenta el orden secuencial de la frase.
bag = np.zeros([len(frases), tamanoTotal, 1])

In [88]:
for i, pregunta in enumerate(frases):
    for j, palabras in enumerate(pregunta):
        bag[i, tamanoTotal - len(pregunta) + j, 0] = vocabulario[palabras]

In [89]:
bag[0]

array([[0.    ],
       [0.    ],
       [0.    ],
       [0.    ],
       [0.    ],
       [0.    ],
       [0.    ],
       [0.    ],
       [0.    ],
       [0.    ],
       [0.    ],
       [0.    ],
       [0.    ],
       [0.    ],
       [0.    ],
       [0.    ],
       [0.    ],
       [0.    ],
       [0.    ],
       [0.1292],
       [0.0123],
       [0.2296],
       [0.0049],
       [0.1921],
       [0.0834],
       [0.0123],
       [0.2296],
       [0.2054],
       [0.1958],
       [0.2296],
       [0.1921]])

In [90]:
training_X = np.asarray(bag[:800])
test_X = np.asarray(bag[800:])
training_y = np.asarray(respuestas3[:800])
test_y = np.asarray(respuestas3[800:])


In [91]:
# Creamos un modelo donde el primer argumento de la capa embedding son las palabras totales que voy a procesar
# vectorizadas en un indice.
# El segundo argumento, es el tamano del vector embedding, que he fijado en 16.
# El tercer argumento, es el tamano o longitud maxima, que he definido para las preguntas. Numero total de palabras
# por pregunta.
model = keras.Sequential([
    keras.layers.Flatten(),
    keras.layers.Dense(36, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(4, activation='softmax')
])
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [98]:
training_y

array([3, 1, 0, 0, 0, 1, 1, 1, 3, 2, 2, 2, 0, 1, 2, 1, 2, 3, 2, 0, 3, 0,
       0, 1, 2, 2, 3, 1, 1, 1, 2, 1, 3, 0, 2, 0, 2, 1, 2, 1, 3, 1, 3, 3,
       0, 1, 0, 0, 0, 2, 2, 0, 2, 0, 0, 1, 1, 2, 1, 2, 1, 3, 1, 0, 0, 1,
       1, 3, 2, 3, 2, 1, 3, 0, 2, 3, 1, 0, 0, 3, 3, 0, 1, 3, 1, 1, 2, 1,
       2, 2, 1, 3, 0, 0, 1, 1, 0, 1, 2, 3, 0, 1, 0, 1, 3, 2, 1, 1, 0, 0,
       1, 3, 2, 1, 2, 3, 2, 0, 0, 3, 1, 1, 2, 1, 2, 2, 0, 3, 0, 1, 3, 0,
       2, 3, 1, 3, 1, 1, 2, 2, 1, 0, 3, 2, 1, 1, 1, 3, 2, 0, 2, 2, 2, 2,
       2, 2, 1, 2, 2, 2, 3, 2, 3, 0, 1, 1, 1, 3, 3, 2, 2, 3, 1, 3, 0, 3,
       1, 1, 0, 2, 3, 0, 3, 2, 2, 0, 3, 2, 1, 0, 0, 3, 1, 1, 1, 3, 3, 1,
       0, 2, 1, 1, 1, 1, 1, 1, 3, 1, 1, 0, 3, 2, 1, 1, 2, 1, 1, 1, 0, 1,
       0, 2, 0, 0, 2, 1, 0, 1, 0, 0, 0, 3, 1, 1, 1, 1, 0, 2, 2, 1, 0, 2,
       1, 0, 3, 1, 2, 3, 2, 1, 1, 0, 0, 1, 2, 0, 1, 1, 2, 0, 1, 0, 3, 1,
       0, 0, 1, 2, 1, 2, 3, 2, 1, 1, 1, 3, 1, 0, 0, 2, 1, 1, 2, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 2, 3, 3, 1, 2, 1, 0, 1,

In [92]:
history = model.fit(training_X, training_y, epochs = 300, validation_data=(test_X, test_y), batch_size = 128)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300


Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78/300
Epoch 79/300
Epoch 80/300
Epoch 81/300
Epoch 82/300
Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300
Epoch 90/300
Epoch 91/300
Epoch 92/300
Epoch 93/300
Epoch 94/300
Epoch 95/300
Epoch 96/300
Epoch 97/300
Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300
Epoch 106/300
Epoch 107/300
Epoch 108/300
Epoch 109/300
Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300
Epoch 114/300
Epoch 115/300
Epoch 116/300
Epoch 117/300


Epoch 118/300
Epoch 119/300
Epoch 120/300
Epoch 121/300
Epoch 122/300
Epoch 123/300
Epoch 124/300
Epoch 125/300
Epoch 126/300
Epoch 127/300
Epoch 128/300
Epoch 129/300
Epoch 130/300
Epoch 131/300
Epoch 132/300
Epoch 133/300
Epoch 134/300
Epoch 135/300
Epoch 136/300
Epoch 137/300
Epoch 138/300
Epoch 139/300
Epoch 140/300
Epoch 141/300
Epoch 142/300
Epoch 143/300
Epoch 144/300
Epoch 145/300
Epoch 146/300
Epoch 147/300
Epoch 148/300
Epoch 149/300
Epoch 150/300
Epoch 151/300
Epoch 152/300
Epoch 153/300
Epoch 154/300
Epoch 155/300
Epoch 156/300
Epoch 157/300
Epoch 158/300
Epoch 159/300
Epoch 160/300
Epoch 161/300
Epoch 162/300
Epoch 163/300
Epoch 164/300
Epoch 165/300
Epoch 166/300
Epoch 167/300
Epoch 168/300
Epoch 169/300
Epoch 170/300
Epoch 171/300
Epoch 172/300
Epoch 173/300
Epoch 174/300
Epoch 175/300


Epoch 176/300
Epoch 177/300
Epoch 178/300
Epoch 179/300
Epoch 180/300
Epoch 181/300
Epoch 182/300
Epoch 183/300
Epoch 184/300
Epoch 185/300
Epoch 186/300
Epoch 187/300
Epoch 188/300
Epoch 189/300
Epoch 190/300
Epoch 191/300
Epoch 192/300
Epoch 193/300
Epoch 194/300
Epoch 195/300
Epoch 196/300
Epoch 197/300
Epoch 198/300
Epoch 199/300
Epoch 200/300
Epoch 201/300
Epoch 202/300
Epoch 203/300
Epoch 204/300
Epoch 205/300
Epoch 206/300
Epoch 207/300
Epoch 208/300
Epoch 209/300
Epoch 210/300
Epoch 211/300
Epoch 212/300
Epoch 213/300
Epoch 214/300
Epoch 215/300
Epoch 216/300
Epoch 217/300
Epoch 218/300
Epoch 219/300
Epoch 220/300
Epoch 221/300
Epoch 222/300
Epoch 223/300
Epoch 224/300
Epoch 225/300
Epoch 226/300
Epoch 227/300
Epoch 228/300
Epoch 229/300
Epoch 230/300
Epoch 231/300
Epoch 232/300
Epoch 233/300


Epoch 234/300
Epoch 235/300
Epoch 236/300
Epoch 237/300
Epoch 238/300
Epoch 239/300
Epoch 240/300
Epoch 241/300
Epoch 242/300
Epoch 243/300
Epoch 244/300
Epoch 245/300
Epoch 246/300
Epoch 247/300
Epoch 248/300
Epoch 249/300
Epoch 250/300
Epoch 251/300
Epoch 252/300
Epoch 253/300
Epoch 254/300
Epoch 255/300
Epoch 256/300
Epoch 257/300
Epoch 258/300
Epoch 259/300
Epoch 260/300
Epoch 261/300
Epoch 262/300
Epoch 263/300
Epoch 264/300
Epoch 265/300
Epoch 266/300
Epoch 267/300
Epoch 268/300
Epoch 269/300
Epoch 270/300
Epoch 271/300
Epoch 272/300
Epoch 273/300
Epoch 274/300
Epoch 275/300
Epoch 276/300
Epoch 277/300
Epoch 278/300
Epoch 279/300
Epoch 280/300
Epoch 281/300
Epoch 282/300
Epoch 283/300
Epoch 284/300
Epoch 285/300
Epoch 286/300
Epoch 287/300
Epoch 288/300
Epoch 289/300
Epoch 290/300
Epoch 291/300


Epoch 292/300
Epoch 293/300
Epoch 294/300
Epoch 295/300
Epoch 296/300
Epoch 297/300
Epoch 298/300
Epoch 299/300
Epoch 300/300


In [101]:
model2 = keras.Sequential([
    keras.layers.Bidirectional(keras.layers.LSTM(128, dropout = 0.2, input_shape=[bag.shape[0], bag.shape[1], 1])),
    keras.layers.Dense(4, activation='softmax')
])
opt = keras.optimizers.RMSprop(
    learning_rate=0.001,
    rho=0.9,
    momentum=0.0,
    epsilon=1e-07,
    centered=False,
)

#opt = keras.optimizers.SGD(lr=0.00001, decay=1e-6, momentum=0.9, nesterov=True)
#opt = keras.optimizers.Adam(learning_rate = 0.0001)
model2.compile(loss='sparse_categorical_crossentropy',optimizer=opt, metrics=['accuracy'])

In [102]:
training_X.shape

(800, 31, 1)

In [103]:
model2.summary()

ValueError: This model has not yet been built. Build the model first by calling `build()` or calling `fit()` with some data, or specify an `input_shape` argument in the first layer(s) for automatic build.

In [None]:
history2 = model2.fit(training_X, training_y, epochs = 240, batch_size= 128, validation_data=(test_X, test_y))

Epoch 1/240
Epoch 2/240
Epoch 3/240
Epoch 4/240
Epoch 5/240
Epoch 6/240
Epoch 7/240
Epoch 8/240
Epoch 9/240
Epoch 10/240
Epoch 11/240
Epoch 12/240
Epoch 13/240
Epoch 14/240
Epoch 15/240
Epoch 16/240
Epoch 17/240
Epoch 18/240
Epoch 19/240
Epoch 20/240
Epoch 21/240
Epoch 22/240
Epoch 23/240
Epoch 24/240
Epoch 25/240
Epoch 26/240
Epoch 27/240
Epoch 28/240
Epoch 29/240
Epoch 30/240
Epoch 31/240
Epoch 32/240
Epoch 33/240
Epoch 34/240
Epoch 35/240
Epoch 36/240
Epoch 37/240
Epoch 38/240
Epoch 39/240
Epoch 40/240
Epoch 41/240
Epoch 42/240
Epoch 43/240
Epoch 44/240
Epoch 45/240
Epoch 46/240
Epoch 47/240
Epoch 48/240
Epoch 49/240
Epoch 50/240
Epoch 51/240
Epoch 52/240
Epoch 53/240
Epoch 54/240
Epoch 55/240
Epoch 56/240
Epoch 57/240
Epoch 58/240
Epoch 59/240


Epoch 60/240
Epoch 61/240
Epoch 62/240
Epoch 63/240
Epoch 64/240
Epoch 65/240
Epoch 66/240
Epoch 67/240
Epoch 68/240
Epoch 69/240
Epoch 70/240
Epoch 71/240
Epoch 72/240
Epoch 73/240
Epoch 74/240
Epoch 75/240
Epoch 76/240
Epoch 77/240
Epoch 78/240
Epoch 79/240
Epoch 80/240
Epoch 81/240
Epoch 82/240
Epoch 83/240
Epoch 84/240
Epoch 85/240
Epoch 86/240
Epoch 87/240
Epoch 88/240
Epoch 89/240
Epoch 90/240
Epoch 91/240
Epoch 92/240
Epoch 93/240
Epoch 94/240
Epoch 95/240
Epoch 96/240
Epoch 97/240
Epoch 98/240
Epoch 99/240
Epoch 100/240
Epoch 101/240
Epoch 102/240
Epoch 103/240
Epoch 104/240
Epoch 105/240
Epoch 106/240
Epoch 107/240
Epoch 108/240
Epoch 109/240
Epoch 110/240
Epoch 111/240
Epoch 112/240
Epoch 113/240
Epoch 114/240
Epoch 115/240
Epoch 116/240
Epoch 117/240


Epoch 118/240
Epoch 119/240
Epoch 120/240
Epoch 121/240
Epoch 122/240
Epoch 123/240
Epoch 124/240
Epoch 125/240
Epoch 126/240
Epoch 127/240
Epoch 128/240
Epoch 129/240
Epoch 130/240
Epoch 131/240
Epoch 132/240
Epoch 133/240
Epoch 134/240
Epoch 135/240
Epoch 136/240
Epoch 137/240
Epoch 138/240
Epoch 139/240
Epoch 140/240
Epoch 141/240
Epoch 142/240
Epoch 143/240
Epoch 144/240
Epoch 145/240
Epoch 146/240
Epoch 147/240
Epoch 148/240
Epoch 149/240
Epoch 150/240
Epoch 151/240
Epoch 152/240
Epoch 153/240
Epoch 154/240
Epoch 155/240
Epoch 156/240
Epoch 157/240
Epoch 158/240
Epoch 159/240
Epoch 160/240
Epoch 161/240
Epoch 162/240
Epoch 163/240
Epoch 164/240
Epoch 165/240
Epoch 166/240
Epoch 167/240
Epoch 168/240
Epoch 169/240
Epoch 170/240
Epoch 171/240
Epoch 172/240
Epoch 173/240
Epoch 174/240


Epoch 175/240
Epoch 176/240
Epoch 177/240
Epoch 178/240
Epoch 179/240
Epoch 180/240
Epoch 181/240
Epoch 182/240
Epoch 183/240
Epoch 184/240
Epoch 185/240
Epoch 186/240
Epoch 187/240
Epoch 188/240
Epoch 189/240
Epoch 190/240
Epoch 191/240
Epoch 192/240
Epoch 193/240
Epoch 194/240
Epoch 195/240
Epoch 196/240
Epoch 197/240
Epoch 198/240
Epoch 199/240
Epoch 200/240
Epoch 201/240
Epoch 202/240
Epoch 203/240
Epoch 204/240
Epoch 205/240
Epoch 206/240
Epoch 207/240
Epoch 208/240
Epoch 209/240
Epoch 210/240
Epoch 211/240
Epoch 212/240
Epoch 213/240
Epoch 214/240
Epoch 215/240
Epoch 216/240
1/7 [===>..........................] - ETA: 0s - loss: 1.3194 - accuracy: 0.3750