In [1]:
#!/usr/bin/env python3
# -*- coding: latin-1 -*-
"""
Created on Mon Aug 14 22:14:55 2017
@author: rodrigo
"""

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import re

In [41]:
# Importing the dataset
# by a first class
def get_dataset(first_class = 'H', max_N=0, path='../GitHub/output/full.csv'):
    """Read the '|'-separated dataset file and keep rows of one top-level class.

    Each line is split on '|'; only lines with exactly 7 fields are used.
    Field 1 holds a comma-separated list of class codes, fields 2 and 3 are
    stored together as the sample text (the notebook treats them as title
    and summary).

    A class code is recognised as letter + two digits + letter (e.g. 'H01B')
    after upper-casing and removing spaces. Only codes whose first letter
    equals ``first_class`` are kept; rows with no such code are skipped.

    Parameters
    ----------
    first_class : str
        Upper-case letter that the kept class codes must start with.
    max_N : int
        Maximum number of rows to return; 0 means no limit.
    path : str
        Location of the dataset file (read as latin-1).

    Returns
    -------
    list
        ``[X, y]`` where ``X[i]`` is ``[field2, field3]`` and ``y[i]`` is the
        sorted list of distinct matching class codes for the same row.
    """
    # Same pattern and flags as before, compiled once instead of per code.
    code_re = re.compile(r'([A-Z])([0-9]{2})([A-Z])', re.M|re.I)

    X = []
    y = []

    with open(path, 'r', encoding="latin-1") as file:
        for line in file:
            # Stop reading as soon as the requested number of rows is collected
            # instead of scanning the rest of the file for nothing.
            if max_N != 0 and len(X) >= max_N:
                break
            data = line.split('|')
            if len(data) != 7:
                continue
            cl_set = set()
            for cl in data[1].split(','):
                c = cl.upper().replace(' ', '')
                a = code_re.search(c)
                # A non-matching entry is the only expected failure here, so
                # test for it explicitly rather than swallowing all exceptions.
                if a is not None and a.group(1) == first_class:
                    cl_set.add(a.group(1) + a.group(2) + a.group(3))
            if cl_set:
                X.append([data[2], data[3]])
                # sorted() makes the label order independent of set iteration.
                y.append(sorted(cl_set))
    return [X, y]


In [42]:
# get_dataset returns a two-element list:
#   dataset[0]: one [title, summary] pair per kept row
#   dataset[1]: the list of class codes for the same row
# max_N=0 means "no row limit".
dataset = get_dataset(first_class='H', max_N=0)

In [44]:
# example
dataset[0][0:5]

[["Envolt'orio para m'aquinas el'etricas",
  "'E proposto em envolt'orio para m'aquinas el'etricas, sobretudo dispositivos de arranque de motores de combust~ao interna, o qual serve para a prote$c~ao das m'aquinas el'etricas contra a penetra$c~ao de sujidade e umidade, bem como contra corros~ao. O envolt'orio consiste de pelo menos uma folheta (7) que est'a empurrada ou puxada sobre as pe$cas (2 a 5) da m'aquina el'etrica a serem envolvidas, pode estar colada sobre as pe$cas (2 a 5) e est'a fixamente contra'ida sobre as pe$cas envoltas (2 a 5). Em compara$c~ao com os convencionais revestimentos de verniz, o envolt'orio forma uma prote$c~ao simples, mais favor'avel ambientalmente e mais apropriada para a fabrica$c~ao econ^omica em grandes quantidades."],
 ["Aperfei$coamento em interruptor de luz temporizado anal'ogico/digital",
  "A presente patente se refere a um interruptor temporizado em que 'e poss'ivel optar pelo acionamento digital ou anal'ogico devido ao mesmo ser dotado de uma t

In [48]:
# Turn the per-row label lists into a binary indicator matrix:
# a column holds 1 when the row carries that class and 0 when it does not.
# Column order is given by mlb.classes_.
from sklearn.preprocessing import MultiLabelBinarizer

X = dataset[0]
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(dataset[1])

In [50]:
# Text clean-up: within every whitespace-separated token drop all characters
# that are not ASCII letters, stem the token with the Portuguese Snowball
# stemmer, then glue the tokens back together with single spaces.
import re
import nltk
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('portuguese')


def _letters_only_stems(text):
    """Return `text` with each token reduced to letters and then stemmed."""
    stripped = (re.sub('[^a-zA-Z]', '', token) for token in text.split())
    # Tokens that become empty are kept, so they still contribute a separator.
    return ' '.join(stemmer.stem(token) for token in stripped)


corpus_title = []
corpus_summary = []
for row in X:
    corpus_title.append(_letters_only_stems(row[0]))
    corpus_summary.append(_letters_only_stems(row[1]))

In [58]:
# Number of samples currently held in X.
print('Size of X: {}'.format(len(X)))

Size of X: 36137


In [59]:
corpus_summary[0]

'e propost em envoltori par maquin eletr sobretud disposit de arranqu de motor de combusta intern o qual serv par a proteca das maquin eletr contr a penetraca de sujidad e umidad bem com contr corrosa o envoltori cons de pel men uma folhet  que esta empurr ou pux sobr as pec  a  da maquin eletr a ser envolv pod estar col sobr as pec  a  e esta fix contra sobr as pec envolt  a  em comparaca com os convencion revest de verniz o envoltori form uma proteca simpl mais favoravel ambiental e mais apropri par a fabricaca econom em grand quantidad'

In [60]:
corpus_summary[1]

'a present patent se refer a um interruptor temporiz em que e possivel optar pel acion digital ou analog dev ao mesm ser dot de uma tecl convencional e um sensor de toqu send que o segund ao ser acion mant a lamp aces com mei potenc por quatr minut cons o conjunt num espelh  e um suport  dupl ident ao convencional send no lug de uma das tecl fix uma chap metal  cont um led  de sinalizaca e atras do suport  localizas uma tamp  acondicion a plac  do circuit eletron'

In [61]:
corpus_title[0]

'envoltori par maquin eletr'

In [62]:
corpus_title[1]

'aperfeico em interruptor de luz temporiz analogicodigital'

In [63]:
# Build separate bag-of-words count matrices for titles and summaries,
# each with its own vocabulary (unigrams only, no vocabulary size cap).
from sklearn.feature_extraction.text import CountVectorizer

cv_params = dict(max_features=None, ngram_range=(1, 1))

title_cv = CountVectorizer(**cv_params)
summary_cv = CountVectorizer(**cv_params)

X_title = title_cv.fit_transform(corpus_title)
X_summary = summary_cv.fit_transform(corpus_summary)

In [64]:
# Reduce the title and summary count matrices to 50 components each,
# using an independent TruncatedSVD per text field.
# random_state is fixed so the projection is the same on every run.
# NOTE: X_title / X_summary are overwritten with the reduced matrices, so this
# cell must be run exactly once after the vectorizer cell.
from sklearn.decomposition import TruncatedSVD

svd_title = TruncatedSVD(n_components=50, random_state=42)
X_title = svd_title.fit_transform(X_title)

svd_summary = TruncatedSVD(n_components=50, random_state=42)
X_summary = svd_summary.fit_transform(X_summary)

In [65]:
# Put the title components and the summary components side by side,
# giving one feature row per sample (title columns first).
X = np.hstack((X_title, X_summary))

In [66]:
X[0]

array([  2.32604800e-01,   6.73060101e-01,  -5.88471045e-01,
         6.40893505e-02,  -2.30426072e-01,   4.85245911e-02,
         1.30155016e-01,   4.11542101e-02,   5.45377381e-02,
         2.14583337e-02,   1.15036038e-01,  -1.08870731e-01,
        -1.62640018e-01,  -3.98621345e-02,  -5.60066256e-02,
         3.29069923e-03,  -4.55641616e-02,  -4.37927198e-02,
        -2.62002964e-02,   1.46409829e-02,   1.61693794e-02,
        -3.09958428e-02,   6.48476720e-04,   4.49647652e-03,
         1.58714672e-03,   2.12486270e-02,   1.56369946e-02,
         5.35067773e-03,  -7.74857710e-03,   6.78310289e-04,
        -4.07101888e-02,  -2.34295095e-02,  -2.54731827e-02,
         8.37118303e-03,   1.01269843e-02,  -4.11247862e-04,
        -2.51817495e-03,  -3.23585317e-02,   5.07504520e-03,
        -2.15958244e-02,   7.92310193e-03,  -9.16687099e-03,
        -2.30142145e-02,   8.08727251e-04,   4.32303940e-02,
        -3.20153964e-02,   6.17310736e-03,   2.51670813e-02,
        -8.53457085e-03,

In [67]:
# How many samples carry each label (one line per column of y).
n_labels = len(y[0])
for col in range(n_labels):
    label = mlb.classes_[col]
    positives = np.count_nonzero(y[:, col] == 1)
    print('Label {} has {}'.format(label, positives))
    

Label H01B has 844
Label H01C has 165
Label H01F has 713
Label H01G has 163
Label H01H has 1842
Label H01J has 379
Label H01K has 90
Label H01L has 926
Label H01M has 1377
Label H01P has 153
Label H01Q has 1038
Label H01R has 2471
Label H01S has 126
Label H01T has 280
Label H02B has 476
Label H02C has 1
Label H02G has 1330
Label H02H has 649
Label H02J has 955
Label H02K has 1747
Label H02M has 483
Label H02N has 269
Label H02P has 676
Label H02R has 1
Label H02S has 9
Label H02V has 1
Label H03B has 51
Label H03C has 29
Label H03D has 88
Label H03F has 208
Label H03G has 104
Label H03H has 126
Label H03J has 119
Label H03K has 300
Label H03L has 96
Label H03M has 461
Label H04B has 3587
Label H04D has 1
Label H04G has 1
Label H04H has 253
Label H04J has 836
Label H04K has 122
Label H04L has 4652
Label H04M has 2556
Label H04N has 3214
Label H04Q has 4267
Label H04R has 577
Label H04S has 72
Label H04W has 663
Label H05B has 1666
Label H05C has 58
Label H05F has 37
Label H05G has 29
La

In [68]:
# list with classes
mlb.classes_

array(['H01B', 'H01C', 'H01F', 'H01G', 'H01H', 'H01J', 'H01K', 'H01L',
       'H01M', 'H01P', 'H01Q', 'H01R', 'H01S', 'H01T', 'H02B', 'H02C',
       'H02G', 'H02H', 'H02J', 'H02K', 'H02M', 'H02N', 'H02P', 'H02R',
       'H02S', 'H02V', 'H03B', 'H03C', 'H03D', 'H03F', 'H03G', 'H03H',
       'H03J', 'H03K', 'H03L', 'H03M', 'H04B', 'H04D', 'H04G', 'H04H',
       'H04J', 'H04K', 'H04L', 'H04M', 'H04N', 'H04Q', 'H04R', 'H04S',
       'H04W', 'H05B', 'H05C', 'H05F', 'H05G', 'H05H', 'H05K', 'H61C',
       'H99Z'], dtype=object)

In [91]:
def display_time(seconds, granularity=2):
    """Format a duration in seconds as Portuguese text, largest units first.

    Parameters
    ----------
    seconds : int or float
        Duration to format. Fractions of a second are dropped and negative
        values are treated as zero.
    granularity : int
        Maximum number of non-zero units to include in the result.

    Returns
    -------
    str
        For example ``'5 horas, 13 minutos'``. A duration shorter than one
        second yields ``'0 segundos'`` rather than an empty string.
    """
    intervals = (
        ('semanas', 604800),  # 60 * 60 * 24 * 7
        ('dias', 86400),      # 60 * 60 * 24
        ('horas', 3600),      # 60 * 60
        ('minutos', 60),
        ('segundos', 1),
    )

    # Work on whole seconds so units print as "5 horas", not "5.0 horas".
    remaining = max(0, int(seconds))

    result = []
    for name, count in intervals:
        value, remaining = divmod(remaining, count)
        if value:
            if value == 1:
                # Singular form: every unit name ends in a plural 's'.
                name = name.rstrip('s')
            result.append("{} {}".format(value, name))

    if not result:
        return '0 segundos'
    return ', '.join(result[:granularity])

import time
import sys

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import jaccard_similarity_score

# 10-fold cross-validation of a random forest on the current feature matrix X.
# One value per fold is appended to each of the three metric lists.
hamming_loss_list = []
accuracy_score_list = []
jaccard_similarity_score_list = []

total_tasks = 10  # number of folds; also drives the progress estimate
tasks_count = 0
last_time = 0
line_width = 0    # longest progress line written so far
start_time = time.time()

# Fixed seeds make the fold assignment and the forests reproducible.
skf = KFold(n_splits = total_tasks, shuffle=True, random_state=42)
for train, test in skf.split(X, y):
    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
    classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state=42)
    classifier.fit(X_train, y_train)
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    hamming_loss_list.append(hamming_loss(y_test, y_pred))
    accuracy_score_list.append(accuracy_score(y_test, y_pred))
    # NOTE(review): jaccard_similarity_score no longer exists in recent
    # scikit-learn releases (jaccard_score replaces it) - confirm the
    # installed version before re-running.
    jaccard_similarity_score_list.append(jaccard_similarity_score(y_test, y_pred))
    tasks_count += 1
    last_time = time.time()
    exec_time = last_time - start_time
    # Linear extrapolation from the average time per finished fold.
    remaining_time = (total_tasks - tasks_count) * (exec_time) / tasks_count
    progress = "\rCalculado ... %.2f%%. Tempo execução: %s. Tempo restante estimado: %s" % (
        100.0 * tasks_count / total_tasks,
        display_time(exec_time),
        display_time(remaining_time),
    )
    # '\r' only moves the cursor back; pad with spaces so a shorter message
    # does not leave the tail of the previous one on screen.
    sys.stdout.write(progress.ljust(line_width))
    line_width = max(line_width, len(progress))
    sys.stdout.flush()

Calculado ... 100.00%. Tempo execução: 5.0 horas, 13.0 minutos. Tempo restante estimado: 1.0 minutos, 28.0 segundos

In [92]:
# Same 10-fold cross-validation as above, but on standardized features.
# The scaler is fitted on the training rows of each fold only, so statistics
# of the held-out rows never leak into training. X itself is left untouched,
# which keeps this cell (and the unscaled one) safe to re-run.
from sklearn.preprocessing import StandardScaler

hamming_loss_list_2 = []
accuracy_score_list_2 = []
jaccard_similarity_score_list_2 = []

total_tasks = 10  # number of folds; also drives the progress estimate
tasks_count = 0
last_time = 0
line_width = 0    # longest progress line written so far
start_time = time.time()

# Fixed seeds make the fold assignment and the forests reproducible.
skf = KFold(n_splits = total_tasks, shuffle=True, random_state=42)
for train, test in skf.split(X, y):
    sc = StandardScaler()
    X_train = sc.fit_transform(X[train])
    X_test = sc.transform(X[test])
    y_train, y_test = y[train], y[test]
    classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state=42)
    classifier.fit(X_train, y_train)
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    hamming_loss_list_2.append(hamming_loss(y_test, y_pred))
    accuracy_score_list_2.append(accuracy_score(y_test, y_pred))
    # NOTE(review): jaccard_similarity_score no longer exists in recent
    # scikit-learn releases (jaccard_score replaces it) - confirm the
    # installed version before re-running.
    jaccard_similarity_score_list_2.append(jaccard_similarity_score(y_test, y_pred))
    tasks_count += 1
    last_time = time.time()
    exec_time = last_time - start_time
    # Linear extrapolation from the average time per finished fold.
    remaining_time = (total_tasks - tasks_count) * (exec_time) / tasks_count
    progress = "\rCalculado ... %.2f%%. Tempo execução: %s. Tempo restante estimado: %s" % (
        100.0 * tasks_count / total_tasks,
        display_time(exec_time),
        display_time(remaining_time),
    )
    # '\r' only moves the cursor back; pad with spaces so a shorter message
    # does not leave the tail of the previous one on screen.
    sys.stdout.write(progress.ljust(line_width))
    line_width = max(line_width, len(progress))
    sys.stdout.flush()

Calculado ... 100.00%. Tempo execução: 4.0 horas, 57.0 minutos. Tempo restante estimado: 9.0 minutos, 45.0 segundos

In [99]:
hamming_loss_list

[0.018272022058466588,
 0.01812638957659783,
 0.018315711803027213,
 0.018213769065719084,
 0.018466198700958263,
 0.018378819211837007,
 0.01836425596365013,
 0.01830621391563603,
 0.018558713417920668,
 0.018077993211647995]

In [100]:
hamming_loss_list_2

[0.018471053117020554,
 0.018262313226342003,
 0.018529306109768055,
 0.018194351401469917,
 0.018170079321158456,
 0.018315711803027213,
 0.01783997902892261,
 0.01838876182984447,
 0.018218810241768274,
 0.018728665005996865]

In [101]:
accuracy_score_list

[0.12672938572219147,
 0.12534587714443829,
 0.1314333148865523,
 0.13558384061981185,
 0.12396236856668512,
 0.12257885998893193,
 0.1261759822910902,
 0.12565734846388044,
 0.11680044284528093,
 0.13063935787434267]

In [102]:
accuracy_score_list_2

[0.12340896513558385,
 0.12396236856668512,
 0.12340896513558385,
 0.12894299944659657,
 0.1314333148865523,
 0.11787493082457111,
 0.13613724405091313,
 0.13008580127318017,
 0.12953224467201771,
 0.11956822585109328]

In [None]:
jaccard_similarity_score_list_2