In [16]:
# !pip install fastparquet --user
# !pip install pyarrow --user
# !pip install swifter --user
# !pip install matplotlib --user
# !pip install seaborn --user
# !pip install joblib --user

Collecting joblib
  Using cached https://files.pythonhosted.org/packages/cd/c1/50a758e8247561e58cb87305b1e90b171b8c767b15b12a1734001f41d356/joblib-0.13.2-py2.py3-none-any.whl
Installing collected packages: joblib
Successfully installed joblib-0.13.2


In [17]:
# Carrega as bibliotecas de ambiente

import os
import io
import gc
import re
import glob
import string
import requests
import collections

# Working directory the notebook was started from.
path = os.getcwd()

# Print the path of every file found under the input directory.
for dirname, _, filenames in os.walk('input/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

input/train.pickle
input/test.csv
input/train.csv
input/test.pickle
input/sample_submission.csv
input/train.csv.gz


In [18]:
# Carrega as bibliotecas de ciências e gráficos

import pickle

import theano
import nltk
import swifter
import multiprocessing
import pandas as pd
import numpy as np
import tensorflow as tf
import fastparquet as fpq
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from joblib import Parallel, delayed

from fastparquet import ParquetFile
from fastparquet import write

from numba import vectorize
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint

from sklearn import preprocessing
from sklearn import metrics

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# NOTE(review): a backend is selected explicitly here and the inline magic on
# the next line selects one as well - confirm which of the two is intended.
plt.switch_backend('agg')
%matplotlib inline

# Show the current garbage-collector thresholds as the cell output.
gc.get_threshold()

Using TensorFlow backend.


(700, 10, 10)

In [19]:
# Download the stopwords corpus of the nltk package

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ricardo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
# Path to the data files
PATH = "input/"

# Regular expressions used by the text cleaning step.
# Raw strings hand the backslashes to the regex engine instead of relying on
# Python leaving unknown string escapes (such as "\[" or "\d") untouched.

# Separator symbols that are replaced by a space
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Tokens that contain at least one digit
ONLY_STRING_WORD_RE = re.compile(r'\w*\d\w*')
# Any character other than digits, lowercase ASCII letters, space, '#', '+', '_'
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Whole words of one to three characters. "{1,2,3}" is not a valid quantifier,
# so the previous pattern only matched a word character followed by the
# literal text "{1,2,3}" and never removed a short word.
LESS_THAN_WORD_RE = re.compile(r'\b\w{1,3}\b')
# A space followed by a run of digits
REMOVE_NUMBERS_RE = re.compile(r' \d+')

# Stopwords: one set per language name requested from the nltk stopwords corpus
STOPWORDS_0 = set(stopwords.words('english'))
STOPWORDS_1 = set(stopwords.words('portuguese'))
STOPWORDS_2 = set(stopwords.words('spanish'))

# Number of rows per chunk when reading files (passed as `chunksize`)
FILE_SIZE = 1000000
# Maximum number of most frequent words kept by the tokenizer (`num_words`)
MAX_NB_WORDS = 50000
# Maximum length of the padded output sequences (`maxlen`)
MAX_SEQUENCE_LENGTH = 100
# Fixed value. NOTE(review): presumably the embedding vector size - confirm
# against the model definition, which is not part of this section.
EMBEDDING_DIM = 100
# Random seed
RANDOM_STATE = 2011

# Number of epochs
EPOCHS = 15
# Batch size
BATCH_SIZE = 64

In [21]:
# Write a DataFrame to Parquet files
def write_parquet(df, file):
    """Store ``df`` as a GZIP-compressed Parquet dataset using the hive file scheme.

    The destination is ``PATH + "/files/" + file + ".parquet"`` and the row
    groups start at rows 0, 10000 and 20000.

    Args:
        df: DataFrame to write.
        file: Base name of the destination, without directory or extension.
    """
    destination = "".join([PATH, "/files/", file, ".parquet"])
    write(
        destination,
        df,
        row_group_offsets=[0, 10000, 20000],
        compression='GZIP',
        file_scheme='hive',
    )

# Load a dataset from its pickle file, creating that file from the CSV when missing
def file_pickle(file):
    """Return the DataFrame stored under the base name ``file``.

    ``PATH + file + ".pickle"`` is read when it exists. Otherwise
    ``PATH + file + ".csv"`` is parsed (malformed lines are skipped), the
    result is saved to the pickle path and then returned.

    Args:
        file: Base name of the dataset, without directory or extension.

    Returns:
        The loaded DataFrame.
    """
    pickle_path = PATH + file + ".pickle"

    if not os.path.isfile(pickle_path):
        csv_path = PATH + file + ".csv"
        frame = pd.read_csv(csv_path, header=0, sep=',', quotechar='"', error_bad_lines=False, skipinitialspace=True)
        frame.to_pickle(pickle_path)
        return frame

    return pd.read_pickle(pickle_path)

# Read a CSV file in chunks
def file_chunk(file):
    """Read ``PATH + file + ".csv"`` in chunks of FILE_SIZE rows and print each chunk's shape.

    Args:
        file: Base name of the CSV file, without directory or extension.
    """
    # Build the CSV path from the argument. The name `fcsv` was previously
    # never defined in this function, so every call raised NameError.
    fcsv = PATH + file + ".csv"
    for data in pd.read_csv(fcsv, chunksize=FILE_SIZE, header=0, sep=',', quotechar='"', error_bad_lines=False, skipinitialspace=True):
        print(data.shape)

# Update the pickle file with new information
def update_pickle(file, df):
    """Overwrite the pickle file of the dataset ``file`` with ``df``.

    The path uses the ".pickle" extension, the same one ``file_pickle`` reads.
    With the former ".pkl" extension the update went to a different file and
    was never loaded back.

    Args:
        file: Base name of the dataset, without directory or extension.
        df: DataFrame to store.
    """
    fpkl = PATH + file + ".pickle"
    df.to_pickle(fpkl)
    
# Print the data stored under the given index
def print_plot(index):
    """Print the title and category of the ``dftrain`` row labelled ``index``.

    Nothing is printed when no row carries that label. When several rows
    match, only the first one is printed.

    Args:
        index: Index label to look up in the global ``dftrain`` DataFrame.
    """
    matches = dftrain.loc[dftrain.index == index, ['title', 'category']].values

    # Check the number of matching rows before taking the first one; indexing
    # an empty result raised IndexError before the old length check could run.
    if len(matches) == 0:
        return

    title, category = matches[0]
    print(title)
    print('Category:', category)
        
# Data cleaning: lower case; separators; short words; tokens with digits; symbols; stop words
def clean_text(text):
    """Normalize one title for tokenization.

    Steps, in order: lowercase; replace separator symbols with a space; drop
    the matches of LESS_THAN_WORD_RE; drop tokens containing a digit; drop
    every character matched by BAD_SYMBOLS_RE; drop space-prefixed digit runs;
    drop English, Portuguese and Spanish stopwords and collapse whitespace.

    Args:
        text: The raw title. Anything that is not a ``str`` (for example a
            missing value read as NaN) is treated as an empty title instead of
            raising AttributeError on ``.lower()``.

    Returns:
        The cleaned text, words separated by single spaces.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = LESS_THAN_WORD_RE.sub('', text)
    text = ONLY_STRING_WORD_RE.sub('', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = REMOVE_NUMBERS_RE.sub('', text)

    # One pass over the words instead of three split/join rounds; a word is
    # kept only when none of the three stopword sets contains it.
    # The former trailing `text.replace('\d+', '')` is gone: str.replace takes
    # its pattern literally, and no backslash survives BAD_SYMBOLS_RE, so it
    # could never match. Digits are already removed by the regexes above.
    return ' '.join(
        word
        for word in text.split()
        if word not in STOPWORDS_0
        and word not in STOPWORDS_1
        and word not in STOPWORDS_2
    )

# Text tokenization
def token_text(text):
    """Turn texts into padded sequences of word indices.

    A new Tokenizer is fitted on the given texts on every call, so the word
    indices are only comparable within one call.

    Args:
        text: A single string or an iterable of strings. A single string is
            wrapped in a list first: the tokenizer iterates over its input, so
            a bare string was previously processed one character at a time.

    Returns:
        The output of ``pad_sequences``: one row per input text, padded or
        truncated to MAX_SEQUENCE_LENGTH.
    """
    # Materialize the input so it can be traversed twice (fit, then transform).
    texts = [text] if isinstance(text, str) else list(text)

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=string.punctuation, lower=True)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Convert text values into indicator (dummy) variables
def dummie_text(text):
    """Return the indicator matrix ``pd.get_dummies`` builds for ``text`` as a NumPy array.

    Args:
        text: The values to encode, as accepted by ``pd.get_dummies``.

    Returns:
        The ``.values`` array of the resulting indicator DataFrame.
    """
    return pd.get_dummies(text).values

# Main cleaning and tokenization function (word embedding input)
def converter_text(row):
    """Add the derived columns to one row and return it.

    "title_" receives the tokenized form of the cleaned title. "category_" is
    added only when the row has a "category" label.

    Args:
        row: One row of the dataset, indexable by column name.

    Returns:
        The same ``row`` object with the derived entries set.
    """
    title = clean_text(row["title"])
    row["title_"] = token_text(title)
    #row["language_"] = dummie_text(row["language"])

    # Test for the column label. `row.isin(['category'])` compared the row's
    # values with the string 'category', so it did not tell whether the
    # column exists.
    if "category" in row:
        row["category_"] = dummie_text(row["category"])
    return row

In [None]:
# dftrain['titles'] = dftrain['title'].apply(clean_text)

In [23]:
# Load the test set (pickle when present, CSV otherwise) and summarize its columns.
dftest = file_pickle("test")
dftest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246955 entries, 0 to 246954
Data columns (total 3 columns):
id          246955 non-null int64
title       246955 non-null object
language    246955 non-null object
dtypes: int64(1), object(2)
memory usage: 5.7+ MB


In [24]:
# Column types of the training data. Every column holds text (dftrain.info()
# reports them all as object), so 'title' is declared as str rather than int.
# NOTE(review): this mapping is not passed to any reader in this cell -
# confirm where it is meant to be used.
dtype = {'title': str, 'label_quality': str, 'language': str, 'category': str}
# Load the training set (pickle when present, CSV otherwise) and summarize its columns.
dftrain = file_pickle("train")
dftrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000000 entries, 0 to 19999999
Data columns (total 4 columns):
title            object
label_quality    object
language         object
category         object
dtypes: object(4)
memory usage: 610.4+ MB


In [None]:
# Placeholder columns that the following cells fill in.
dftest["title_"] = None
dftest["language_"] = None
# The test set has no "category_" column (it only has id, title and language),
# so a plain drop raises KeyError; errors="ignore" makes it a no-op then.
# drop returns a new frame, so the result is assigned back.
dftest = dftest.drop(columns="category_", errors="ignore")

In [None]:
# Use one worker per CPU core.
num_cores = multiprocessing.cpu_count()

# Iterating a DataFrame directly yields its column labels, so converter_text
# used to receive strings such as "id". iterrows() yields (label, row) pairs;
# only the row is passed on.
results = Parallel(n_jobs=num_cores)(
    delayed(converter_text)(row) for _, row in dftest.iterrows()
)

In [None]:
# # Limpeza
#dftest.apply(lambda row: converter_text(row), axis=1)
%timeit
dftest['title_'] = dftest['title'].swifter.apply(clean_text)
gc.collect()

dftest['title_'] = dftest['title_'].swifter.apply(token_text)
gc.collect()

dftest['language_'] = dftest['language'].swifter.apply(dummie_text)
gc.collect()
# Atualização do dump

#update_pickle("test", dftest)

# Avaliação do resultado
dftest.head(10)

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=246955, style=ProgressStyle(description_wi…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=246955, style=ProgressStyle(description_wi…

In [None]:
# Cleaning
dftrain['title_'] = dftrain['title'].swifter.apply(clean_text)

# Tokenize the whole column in one call so one vocabulary is shared by every
# title. Applied per title, the tokenizer was fitted on a single string at a
# time. Each row then receives its own padded sequence.
dftrain['title_'] = list(token_text(dftrain['title_']))

# Encode the whole column at once; encoding one scalar at a time always
# produced the single-cell matrix [[1]].
dftrain['language_'] = list(dummie_text(dftrain['language']))

# NOTE(review): this per-row call has the same flaw (every row gets [[1]]).
# It is left unchanged because a dense indicator matrix over all rows may not
# fit in memory when there are many categories - consider integer label codes.
dftrain['category_'] = dftrain['category'].swifter.apply(dummie_text)
#dftrain.swifter.apply(lambda row: converter_text(row), axis=1)

# Dump update
update_pickle("train", dftrain)

# Inspect the result
dftrain.head()