# Redes Neuronales NLP

In [1]:
import pandas as pd
import sklearn as sklearn

In [2]:
import tensorflow as tf

In [3]:
import os

df=pd.read_excel(
     os.path.join("data", "reclamos_20201221_sin_clas.xlsx"),
     engine='openpyxl',
)

In [4]:
df.columns

Index(['CASO_ID', 'DESCRIPCION_CIUDADANO', 'PETICION_CIUDADANO'], dtype='object')

In [5]:
df['DESCRIPCION_CIUDADANO']=df['DESCRIPCION_CIUDADANO'].map(str).str.lower()
df['PETICION_CIUDADANO']=df['PETICION_CIUDADANO'].map(str).str.lower() 

In [6]:
df['data'] = df['DESCRIPCION_CIUDADANO'] + df['PETICION_CIUDADANO']

In [7]:
import string, re

def remove_punct(x):
    comp = re.compile("[%s\d]" % re.escape(string.punctuation))
    return " ".join(comp.sub(" ", str(x)).split()).lower()

df['data']=df['data'].apply(remove_punct)

In [8]:
import spacy
import es_core_news_sm

nlp = es_core_news_sm.load()

In [9]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/sergio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
from nltk.corpus import stopwords

stop_words=stopwords.words('spanish')
def preprocess(text):
  	# Create Doc object
    doc = nlp(text, disable=['ner', 'parser'])
    # Generate lemmas
    lemmas = [token.lemma_ for token in doc]
    # Remove stopwords and non-alphabetic characters
    a_lemmas = [lemma for lemma in lemmas 
            if lemma.isalpha() and lemma not in stop_words]
    
    return ' '.join(a_lemmas)
  
# Apply preprocess to ted['transcript']
df['data'] = df['data'].apply(preprocess)
print(df['data'] )

0     conforme indicado oford xx sgd xx fecha noviem...
1     haber activar seguro incendio adicional rotura...
2     contratar seguro automotriz llamado seguro mag...
3     mayo robar camioneta usar robar cajero aparece...
4     ser propietario acción inmobiliaria estadio ár...
                            ...                        
95    padre don xx fallecer tenia cuatro acción gran...
96    presentar según reglamento documentación hacer...
97    octubre renovó seguro auto mismo aseguradora s...
98    ingrese vehículo taller piamonte irarrazaval d...
99    hacer reclamo siguiente aseguradora pagar corr...
Name: data, Length: 100, dtype: object


In [11]:
import unicodedata

def strip_accents(string, accents=('COMBINING ACUTE ACCENT', 'COMBINING GRAVE ACCENT', 'COMBINING TILDE')):
    accents = set(map(unicodedata.lookup, accents))
    chars = [c for c in unicodedata.normalize('NFD', string) if c not in accents]
    return unicodedata.normalize('NFC', ''.join(chars))

df['data'] = df['data'].apply(strip_accents)
print(df['data'] )

0     conforme indicado oford xx sgd xx fecha noviem...
1     haber activar seguro incendio adicional rotura...
2     contratar seguro automotriz llamado seguro mag...
3     mayo robar camioneta usar robar cajero aparece...
4     ser propietario accion inmobiliaria estadio ar...
                            ...                        
95    padre don xx fallecer tenia cuatro accion gran...
96    presentar segun reglamento documentacion hacer...
97    octubre renovo seguro auto mismo aseguradora s...
98    ingrese vehiculo taller piamonte irarrazaval d...
99    hacer reclamo siguiente aseguradora pagar corr...
Name: data, Length: 100, dtype: object


In [13]:
import pickle
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.text import Tokenizer

with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
    
input_array = tokenizer.texts_to_sequences(df['data'].values)

vocab_size = len(tokenizer.word_index) + 1  

In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

maxlen = 100

X_test = pad_sequences(input_array, padding='post', maxlen=maxlen)

# Red Neuronal Recurrente

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Flatten, SpatialDropout1D,GlobalMaxPool1D

embedding_dim = 100

model = tf.keras.models.load_model('corfo_embedding.h5')
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 100)          806800    
_________________________________________________________________
global_max_pooling1d (Global (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                3232      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 810,065
Trainable params: 810,065
Non-trainable params: 0
_________________________________________________________________


In [16]:


y_pred = model.predict(X_test)



In [17]:
y_pred.shape

(100, 1)

In [18]:
X_test.shape

(100, 100)

In [55]:
df['MERCADO_ANALISTA']=y_pred>0.5

In [34]:
def predict(y_pred):
    if y_pred > 0.5:
        return 'Reclamo Valores'
    else: 
        return 'APIA -Reclamo Seguros'

In [56]:
df['MERCADO_ANALISTA']

0     False
1     False
2     False
3     False
4      True
      ...  
95     True
96     True
97    False
98    False
99    False
Name: MERCADO_ANALISTA, Length: 100, dtype: bool

In [46]:
import numpy as np

df['MERCADO_ANALISTA'] = np.where((df['MERCADO_ANALISTA'] == True),'Reclamo Valores',df['MERCADO_ANALISTA'])

In [54]:
np.where((df['MERCADO_ANALISTA']==False))

(array([], dtype=int64),)

In [65]:
df['MERCADO_ANALISTA'].loc[df['MERCADO_ANALISTA']==False]='APIA -Reclamo Seguros'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [66]:
df['MERCADO_ANALISTA'].loc[df['MERCADO_ANALISTA']==True]='Reclamo Valores'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [67]:
df['MERCADO_ANALISTA']

0     APIA -Reclamo Seguros
1     APIA -Reclamo Seguros
2     APIA -Reclamo Seguros
3     APIA -Reclamo Seguros
4           Reclamo Valores
              ...          
95          Reclamo Valores
96          Reclamo Valores
97    APIA -Reclamo Seguros
98    APIA -Reclamo Seguros
99    APIA -Reclamo Seguros
Name: MERCADO_ANALISTA, Length: 100, dtype: object

In [68]:
df.columns

Index(['CASO_ID', 'DESCRIPCION_CIUDADANO', 'PETICION_CIUDADANO', 'data',
       'MERCADO_ANALISTA'],
      dtype='object')

In [72]:
df.drop(columns=['data'],axis=1,inplace=True)

In [73]:
df.to_excel('predicciones.xlsx')  