## 1. Importação de Biblotecas de Apoio

In [4]:
# !pip install --upgrade pip --user
# !pip install tensorflow==2.0.0-rc0 --user
# !pip install keras --user
# !pip install --user -U nltk


# !conda create --name PythonGPU
# !activate PythonCPU
# !conda install -c anaconda keras
# !conda install -c anaconda keras-gpu

# !conda install -c theano
# !conda install -c conda-forge keras tensorflow

Collecting nltk
  Using cached https://files.pythonhosted.org/packages/f6/1d/d925cfb4f324ede997f6d47bea4d9babba51b49e87a767c170b77005889d/nltk-3.4.5.zip
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25ldone
[?25h  Created wheel for nltk: filename=nltk-3.4.5-cp37-none-any.whl size=1449904 sha256=a3b662b5b6914ebaf90c2d3783e0029254a199c78d4897c0063f98694653a382
  Stored in directory: /home/ricardo/.cache/pip/wheels/96/86/f6/68ab24c23f207c0077381a5e3904b2815136b879538a24b483
Successfully built nltk
Installing collected packages: nltk
Successfully installed nltk-3.4.5


## 2. Preparação do Ambiente

### 2.1 - Importação das Biblotecas Base

In [2]:
# Carrega as bibliotecas de ambiente

import os
import io
import gc
import re
import string
import requests
import collections

path = os.getcwd()

for dirname, _, filenames in os.walk('input/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

input/test.csv
input/train.csv
input/sample_submission.csv
input/train.csv.gz
input/test.pkl
input/train.pkl


### 2.2 - Importação das Biblotecas Específicas

In [6]:
# Carrega as bibliotecas de ciências e gráficos

import pickle

import theano
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from numba import vectorize
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint

from sklearn import preprocessing
from sklearn import metrics

warnings.filterwarnings('ignore')
plt.switch_backend('agg')
%matplotlib inline

gc.get_threshold()

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


(700, 10, 10)

### 2.3 - Importação de Dados de Pacotes

In [7]:
# Importação das stopwords do pacote nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ricardo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### 2.4 - Definição das Constantes de Configuração

In [9]:
# Caminho para os arquivos de dados
PATH = "input/"

# Regex
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
LESS_THAN_WORD = re.compile(r'\b\w{1,2}\b')
REMOVE_NUMBERS = re.compile(' \d+')

# Stopwords
STOPWORDS_0 = set(stopwords.words('english'))
STOPWORDS_1 = set(stopwords.words('portuguese'))
STOPWORDS_2 = set(stopwords.words('spanish'))


# Número máximo de palavras usadas mais frequentes
MAX_NB_WORDS = 50000
# Numero máximo de palavras para saída
MAX_SEQUENCE_LENGTH = 250
# Fixador.
EMBEDDING_DIM = 100
# Variável randomica
RANDOM_STATE = 2011

# Número de épocas
EPOCHS = 15
# Tamanho do bloco
BATCH_SIZE = 64

## 3. Funções de Apoio

In [33]:
# Cria ou lê os dados pickle
def file_pickle(file):
    fpkl = PATH + file + ".pkl"
    fcsv = PATH + file + ".csv"
  
    if os.path.isfile(fpkl):
        df = pd.read_pickle(fpkl)
    else:        
        df = pd.read_csv(fcsv, header=0, sep=',', quotechar='"', error_bad_lines=False, skipinitialspace=True)
        df.to_pickle(fpkl)

    return df

# Atualiza o arquivo pickle com novas informações
def update_pickle(file, df):
    fpkl = PATH + file + ".pkl"
    df.to_pickle(fpkl)
    
# Imprime os dados relativos ao indice passado
def print_plot(index):
    example = dftrain[dftrain.index == index][['title', 'category']].values[0]
    
    if len(example) > 0:
        print(example[0])
        print('Category:', example[1])
        
# Limpeza dos dados: lower case; espaços do texto; caracteres especiais e simbolos; stop words e digitos
def clean_text(text):
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = LESS_THAN_WORD.sub('', text) # replace LESS_THAN_WORD symbols by space in text. substitute the matched string in LESS_THAN_WORD with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = REMOVE_NUMBERS.sub('', text) # remove numbers which are in REMOVE_NUMBERS from text. substitute the matched string in REMOVE_NUMBERS with nothing. 
    text = text.replace('  ', ' ')

    #    text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS_0) # remove stopwors english from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS_1) # remove stopwors portugues from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS_2) # remove stopwors spanish from text
    text = text.replace('\d+', '')
        
    return text

# Tokenização de textos
def token_text(text):
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=string.punctuation, lower=True)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    return sequences

# Conversão de texto para variável categórica
def dummie_text(text):
    dummies = pd.get_dummies(text).values
    
    return dummies

## 4. Coleta de Dados

### 4.1 - Carga dos Dados de Treino

In [25]:
# urltrain = "https://meli-data-challenge.s3.amazonaws.com/train.csv.gz"
# ctrain = requests.get(urltrain).content
# ftrain = pd.read_csv(io.StringIO(ctrain.decode('utf-8')), compression='gzip', header=0, sep=',', quotechar='"', error_bad_lines=False, skipinitialspace=True)
# dftrain = pd.read_csv("input/train.csv.gz", compression='gzip', header=0, sep=',', quotechar='"', error_bad_lines=False, skipinitialspace=True)

dftrain = file_pickle("train")

Unnamed: 0,title,label_quality,language,category
0,Hidrolavadora Lavor One 120 Bar 1700w Bomba A...,unreliable,spanish,ELECTRIC_PRESSURE_WASHERS
1,Placa De Sonido - Behringer Umc22,unreliable,spanish,SOUND_CARDS
2,Maquina De Lavar Electrolux 12 Kilos,unreliable,portuguese,WASHING_MACHINES
3,Par Disco De Freio Diant Vent Gol 8v 08/ Frema...,unreliable,portuguese,VEHICLE_BRAKE_DISCS
4,Flashes Led Pestañas Luminoso Falso Pestañas P...,unreliable,spanish,FALSE_EYELASHES


### 4.2 - Carga dos Dados de Teste

In [26]:
# urltest = "https://meli-data-challenge.s3.amazonaws.com/test.csv"
# ctest = requests.get(urltest).content
# ftest = pd.read_csv(io.StringIO(ctest.decode('utf-8')), header=0, sep=',', quotechar='"', error_bad_lines=False, skipinitialspace=True)
# dftest = pd.read_csv("input/test.csv", header=0, sep=',', quotechar='"', error_bad_lines=False, skipinitialspace=True)

dftest = file_pickle("test")

Unnamed: 0,id,title,language
0,0,Kit Maternidade Bolsa-mala Baby/bebe Vinho Men...,portuguese
1,1,Trocador De Fraldas Fisher Price Feminino Rosa...,portuguese
2,2,Motor Ventoinha - Fiat Idea / Palio 1.8 - A 04...,portuguese
3,3,Amortecedor Mola Batente D Dir New Civic 14 - ...,portuguese
4,4,Cadeirinha De Carro Bebê Princesa Princess 9 A...,portuguese


### 4.3 - Limpeza da Memória

In [27]:
gc.collect()
gc.get_threshold()

(700, 10, 10)

## 5. Processamento/Tratamento de Dados

### 5.1 - Análise dos dados de treino

In [28]:
# Verifica a estrutura básica dos dados de treino

print('Shape of dataset ',dftrain.shape)
print(dftrain.columns)

dftrain.head(5)

Shape of dataset  (20000000, 4)
Index(['title', 'label_quality', 'language', 'category'], dtype='object')


Unnamed: 0,title,label_quality,language,category
0,Hidrolavadora Lavor One 120 Bar 1700w Bomba A...,unreliable,spanish,ELECTRIC_PRESSURE_WASHERS
1,Placa De Sonido - Behringer Umc22,unreliable,spanish,SOUND_CARDS
2,Maquina De Lavar Electrolux 12 Kilos,unreliable,portuguese,WASHING_MACHINES
3,Par Disco De Freio Diant Vent Gol 8v 08/ Frema...,unreliable,portuguese,VEHICLE_BRAKE_DISCS
4,Flashes Led Pestañas Luminoso Falso Pestañas P...,unreliable,spanish,FALSE_EYELASHES


In [29]:
# Verificação das caracteristicas de cada coluna do arquivo

dftrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000000 entries, 0 to 19999999
Data columns (total 4 columns):
title            object
label_quality    object
language         object
category         object
dtypes: object(4)
memory usage: 610.4+ MB


In [30]:
# Verifica se exitem dados nulos no geral

if dftrain.isnull().values.any():
    dftrain[dftrain.isnull().any(axis=1)] 
else:
    print("Nos dados de treino não existem dados nulos.")

Nos dados de treino não existem dados nulos.


In [31]:
# Remove dados ausentes

dftrain = dftrain.dropna()

In [32]:
# Redefinindo o indice do dataframe

dftrain = dftrain.reset_index(drop=True)

In [14]:
# Verificando os tipos de categorias

dftrain.category.value_counts()

PANTS                                   35973
COFFEE_MAKERS                           35104
BABY_CAR_SEATS                          34163
MUSICAL_KEYBOARDS                       33222
MATTRESSES                              32967
PUREBRED_DOGS                           32928
RANGES                                  32645
REFRIGERATORS                           32635
MOTORCYCLE_JACKETS                      32615
HAIR_CLIPPERS                           32372
SHORTS                                  31685
SUITCASES                               31580
MEMORY_CARDS                            31564
WINES                                   31399
ROLLER_SKATES                           31371
BABY_STROLLERS                          31353
SEWING_MACHINES                         31129
ELECTRIC_DRILLS                         30820
KITCHEN_SINKS                           30635
WALL_CLOCKS                             30600
FLASHLIGHTS                             29960
CV_JOINTS                         

### 5.2 - Análise dos dados de teste

In [15]:
# Verifica a estrutura básica dos dados de teste

print('Shape of dataset ',dftest.shape)
print(dftest.columns)

dftest.head(5)

Shape of dataset  (246955, 3)
Index(['id', 'title', 'language'], dtype='object')


Unnamed: 0,id,title,language
0,0,Kit Maternidade Bolsa-mala Baby/bebe Vinho Men...,portuguese
1,1,Trocador De Fraldas Fisher Price Feminino Rosa...,portuguese
2,2,Motor Ventoinha - Fiat Idea / Palio 1.8 - A 04...,portuguese
3,3,Amortecedor Mola Batente D Dir New Civic 14 - ...,portuguese
4,4,Cadeirinha De Carro Bebê Princesa Princess 9 A...,portuguese


In [16]:
# Verificação das caracteristicas de cada coluna do arquivo

dftest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246955 entries, 0 to 246954
Data columns (total 3 columns):
id          246955 non-null int64
title       246955 non-null object
language    246955 non-null object
dtypes: int64(1), object(2)
memory usage: 5.7+ MB


In [17]:
# Verifica se exitem dados nulos no geral

if dftest.isnull().values.any():
    dftest[dftest.isnull().any(axis=1)] 
else:
   print("Nos dados de teste não existem dados nulos.") 

Nos dados de teste não existem dados nulos.


In [18]:
# Remove dados ausentes

dftest = dftest.dropna()

In [19]:
# Redefinindo o indice do dataframe

dftest = dftest.reset_index(drop=True)

### 5.3 - Limpeza

#### 5.3.1 - Dados de Treino

In [34]:
# Limpeza
dftrain['titles'] = dftrain['title'].apply(clean_text)

# Atualização do dump
update_pickle("train", dftrain)

Unnamed: 0,id,title,language
0,0,kit maternidade bolsamala baby bebe vinho meni...,portuguese
1,1,trocador fraldas fisher price feminino rosa po...,portuguese
2,2,motor ventoinha fiat idea palio 0417,portuguese
3,3,amortecedor mola batente dir new civic 7051,portuguese
4,4,cadeirinha carro beb princesa princess kgs,portuguese


#### 5.3.2 - Dados de Teste

In [None]:
# Limpeza
dftest['titles'] = dftest['title'].apply(clean_text)

# Atualização do dump
update_pickle("test", dftest)

### 5.4 - Convertendo as variáveis caracter  em categóricas

#### 5.4.1 - Dados de Treino

In [None]:
# Conversão - Lingua
dftrain['language_'] = dftrain['language'].apply(dummie_text)

# Atualização do dump
update_pickle("train", dftrain)

In [None]:
# Conversão - Categoria
dftrain['category_'] = dftrain['category'].apply(dummie_text)

# Atualização do dump
update_pickle("train", dftrain)

#### 5.4.2 - Dados de Teste

In [None]:
# Conversão - Lingua
dftest['language_'] = dftest['language'].apply(dummie_text)

# Atualização do dump
update_pickle("test", dftest)

### 5.5 - Tokenização

#### 5.5.1 - Dados de Treino

In [20]:
# Tokenizando os testos do Title - Treino
dftrain['title_'] = dftrain['titles'].apply(token_text)

# Atualização do dump
update_pickle("train", dftrain)

#### 5.5.2 - Dados de Teste

In [22]:
# Tokenizando os testos do Title - Teste

dftest['title_'] = dftest['titles'].apply(token_text)

# Atualização do dump
update_pickle("test", dftest)

## 6. Análise e Exploração dos Dados

### 6.1 Análise dos Dados

In [None]:
# Verificação de como os textos ficaram após a limpeza:

print_plot(100)

In [39]:
dftrain[["title", "language"]].head()

Unnamed: 0,title,language
0,Hidrolavadora Lavor One 120 Bar 1700w Bomba A...,spanish
1,Placa De Sonido - Behringer Umc22,spanish
2,Maquina De Lavar Electrolux 12 Kilos,portuguese
3,Par Disco De Freio Diant Vent Gol 8v 08/ Frema...,portuguese
4,Flashes Led Pestañas Luminoso Falso Pestañas P...,spanish


## 7. Preparração dos dados para aplicação dos Modelos de Machine Learning

In [None]:
# Criando as variáveis para execução dos modelos

X_train = dftrain[["title_", "language_"]]
print('Shape of data tensor:', X.shape)

Y_train = dftrain["category_"]
print('Shape of label tensor:', Y.shape)

X_test = dftest[["title_", "language_"]]
print('Shape of data tensor:', X.shape)

## 8. Verificação do melhor modelo

In [None]:
### KNN

In [21]:
from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, Y_train)
print(neigh.predict(X_test))

### 8.1 - LSTM

In [None]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(13, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

In [None]:
history = model.fit(X_train, Y_train, epochs=EPOCHS, batch_size=BATCH_SIZE,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

In [None]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [None]:
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show();

In [None]:
plt.title('Accuracy')
plt.plot(history.history['acc'], label='train')
plt.plot(history.history['val_acc'], label='test')
plt.legend()
plt.show();

#### 8.1.1 - Simulação do modelo com os dados de teste

In [None]:
padded = dftest["title"].apply(token_text)
pred = model.predict(padded)

labels = dftrain['category'].value_counts()
print(pred, labels[np.argmax(pred)])

In [None]:
### 8.2 - CNN

In [None]:
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,trainable=True)

In [None]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_cov1= Conv1D(128, 5, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(5)(l_cov1)
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(5)(l_cov2)
l_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)
l_pool3 = MaxPooling1D(35)(l_cov3)  # global max pooling
l_flat = Flatten()(l_pool3)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(len(macronum), activation='softmax')(l_dense)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("Simplified convolutional neural network")
model.summary()
cp = ModelCheckpoint('model_cnn.hdf5', monitor='val_acc',verbose=1,save_best_only=True)

In [None]:
history = model.fit(x_train, y_train, validation_data=(x_val, y_val),epochs=EPOCHS, batch_size=BATCH_SIZE,callbacks=[cp])

In [None]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [None]:
fig1 = plt.figure()
plt.plot(history.history['loss'],'r',linewidth=3.0)
plt.plot(history.history['val_loss'],'b',linewidth=3.0)
plt.legend(['Training loss', 'Validation Loss'],fontsize=18)
plt.xlabel('Epochs ',fontsize=16)
plt.ylabel('Loss',fontsize=16)
plt.title('Loss Curves :CNN',fontsize=16)
fig1.savefig('loss_cnn.png')
plt.show()

In [None]:
fig2=plt.figure()
plt.plot(history.history['acc'],'r',linewidth=3.0)
plt.plot(history.history['val_acc'],'b',linewidth=3.0)
plt.legend(['Training Accuracy', 'Validation Accuracy'],fontsize=18)
plt.xlabel('Epochs ',fontsize=16)
plt.ylabel('Accuracy',fontsize=16)
plt.title('Accuracy Curves : CNN',fontsize=16)
fig2.savefig('accuracy_cnn.png')
plt.show()

#### 8.2.1 - Simulação do modelo com os dados de teste

In [None]:
padded = dftest["title"].apply(token_text)
pred = model.predict(padded)

labels = dftrain['category'].value_counts()
print(pred, labels[np.argmax(pred)])

In [None]:
### 8.3 - RNN

#### 8.3.1 - Simulação do modelo com os dados de teste

In [None]:
padded = dftest["title"].apply(token_text)
pred = model.predict(padded)

labels = dftrain['category'].value_counts()
print(pred, labels[np.argmax(pred)])

In [None]:
### 8.4 - HAN

#### 8.4.1 - Simulação do modelo com os dados de teste

In [None]:
padded = dftest["title"].apply(token_text)
pred = model.predict(padded)

labels = dftrain['category'].value_counts()
print(pred, labels[np.argmax(pred)])