## Bibliotecas e módulos

In [6]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

import numpy as np
import pandas as pd
import csv

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn import svm

#from matplotlib import rcParams
#rcParams['text.usetex'] = True
#rcParams['text.latex.preamble'] = r'\usepackage{amsmath}'

## Funções utilizadas

In [7]:
#Data-reading function-----------------------------------------
def read_class_data(path):
    """Read a labelled, comma-separated numeric dataset from ``path``.

    The first row of the file is treated as a header and skipped. Blank
    lines are ignored. In every remaining row the first field is taken as
    the class label and the other fields as the attributes.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    y : numpy.ndarray, shape (n_rows,)
        First column of the file, as floats.
    x : numpy.ndarray, shape (n_rows, n_fields - 1)
        Remaining columns of the file, as floats.

    Raises
    ------
    ValueError
        If the file has no data rows, if the rows do not all have the same
        number of fields, or if a field cannot be converted to float.
    """
    with open(path, newline='') as f:
        reader = csv.reader(f, delimiter=',')
        next(reader, None)  # skip the header row
        # csv.reader yields [] for blank lines; drop them instead of letting
        # them corrupt the table.
        rows = [row for row in reader if row]

    if not rows:
        raise ValueError('no data rows found in %r' % (path,))

    # Convert everything in one go: this always gives a 2-D array (even for
    # a single data row) and raises on ragged or non-numeric rows instead of
    # silently discarding what was read before them.
    data = np.asarray(rows).astype(np.float64)

    y = data[:,0]
    x = data[:,1:]
    return y,x

In [8]:
def compute_info(Z,x,posY):
    """Score the attribute subset ``Z`` by class separability.

    The score is J = det(Sb + Sw) / det(Sw), evaluated on the columns ``Z``
    of ``x``. Sb is the class-weighted sum of the outer products of
    (overall mean - class mean); Sw is the class-weighted sum of the class
    covariance matrices. Each class is weighted by its share of the rows
    of ``x``.

    Parameters
    ----------
    Z : sequence of int
        Column indices of ``x`` to evaluate. Integer-valued floats (such as
        the output of ``np.union1d``) are accepted and cast to int.
    x : numpy.ndarray, shape (m, n_attributes)
        Data matrix, one example per row.
    posY : mapping or sequence
        ``posY[j]`` holds the row indices of ``x`` that belong to class
        ``j``, for ``j`` in ``0 .. len(posY) - 1``.

    Returns
    -------
    float
        The value of J for the subset ``Z``.
    """
    m,_ = x.shape

    # Same cast the sibling compute_info_gain_add applies, so a float index
    # array can be used for column selection.
    Z = np.asarray(Z, dtype=np.int64).tolist()

    mu = np.mean(x[:,Z],axis=0).reshape((1,len(Z))) # overall mean
    Sb = np.zeros((len(Z),len(Z)))
    Sw = np.zeros((len(Z),len(Z)))

    # Iterate over the classes described by the posY argument (the original
    # read a notebook-level global here instead of the parameter).
    for j in range(0,len(posY)):
        _x = x[posY[j],:]; _x = _x[:,Z]
        mu_j = np.mean(_x,axis=0).reshape((1,len(Z)))
        # np.cov returns a 0-d array for a single attribute; reshape makes
        # it a 1x1 matrix so it can be accumulated into Sw.
        sig_j = np.cov(_x.T).reshape((len(Z),len(Z)))
        Sb += (len(posY[j])/m) * np.dot( (mu - mu_j).T,(mu - mu_j) )
        Sw += (len(posY[j])/m) * sig_j

    #return np.trace(Sb)/np.trace(Sw)
    return np.linalg.det(Sb+Sw)/np.linalg.det(Sw)

In [9]:
def compute_info_gain_add(Z,W,x,posY):
    """Score every candidate in ``W`` when added to the current subset ``Z``.

    For each candidate the attribute set ``union(Z, candidate)`` is formed
    and J = det(Sb + Sw) / det(Sw) is evaluated on those columns of ``x``.
    Sb is the class-weighted sum of the outer products of
    (overall mean - class mean); Sw is the class-weighted sum of the class
    covariance matrices.

    Parameters
    ----------
    Z : sequence
        Attribute indices already selected (may be empty).
    W : sequence
        Candidate attribute indices.
    x : numpy.ndarray, shape (m, n_attributes)
        Data matrix, one example per row.
    posY : mapping or sequence
        ``posY[j]`` holds the row indices of ``x`` that belong to class
        ``j``, for ``j`` in ``0 .. len(posY) - 1``.

    Returns
    -------
    numpy.ndarray, shape (len(W),)
        J value obtained for each candidate, in the order of ``W``.
    """
    n_total = x.shape[0]
    n_classes = len(posY)
    scores = np.zeros(len(W))

    for k, candidate in enumerate(W):
        # union1d returns a sorted (possibly float) array; cast it to a
        # plain list of ints so it can index columns.
        cols = np.int64(np.union1d(Z, candidate)).tolist()
        d = len(cols)

        overall_mean = x[:, cols].mean(axis=0).reshape((1, d))
        between = np.zeros((d, d))
        within = np.zeros((d, d))

        for c in range(n_classes):
            rows = posY[c]
            block = x[rows, :][:, cols]
            weight = len(rows) / n_total

            diff = overall_mean - block.mean(axis=0).reshape((1, d))
            # reshape turns the 0-d result np.cov gives for d == 1 into 1x1
            class_cov = np.cov(block.T).reshape((d, d))

            between += weight * np.dot(diff.T, diff)
            within += weight * class_cov

        scores[k] = np.linalg.det(between + within) / np.linalg.det(within)

    return scores

## Leitura dos dados - "Terremoto"
* Os atributos são:
    * latitude
    * longitude
    * depth
    * mag
    * gap
    * dmin
    * rms
    * horizontalError
    * depthError
* Classes:
    * 1 - MB (azul escuro)
    * 2 - MD (ciano)
    * 3 - ML (verde)
    * 4 - MW (laranja)

In [10]:
#Input file
path = 'EartQuakes_USGS_mar-abr2020.csv'

#Load the data: y is the first column of the file, x the remaining columns
y,x = read_class_data(path)
y -= 1 #shift the class labels so that they start at 0

#Keep the raw attributes; x becomes a copy in which each column is scaled by its max norm
xOrig = x.copy()
x = preprocessing.normalize(xOrig, norm='max', axis=0)

## Processo de seleção de atributos

In [11]:
#Candidate attributes (column indices of x) and how many of them to select
atributos = [2,3,4,5,6,7,8]
n_sel = 4

#Attribute sets: the idea is to move attributes from W (candidates) to Z (selected)
W = list(atributos)
Z = []

#Row positions in x of the labelled examples of each class
labY = {}
for i in np.unique(y):
    labY[i] = np.where(y == i)[0].tolist()

#Greedy forward step: move to Z the candidate with the highest score
while len(Z) < n_sel:
    gains = compute_info_gain_add(Z,W,x,labY)
    ind = np.argmax(gains)
    Z = np.union1d( Z,W.pop(ind) ) #union1d returns a sorted array, so Z is no longer a list

print('Atributos selecionados: ',Z)

#Integer version of the selection, used later to index columns
selAtributos = np.int64(Z)

Atributos selecionados:  [3. 4. 6. 7.]


## Divisão treino/teste

In [12]:
#Fraction of the examples held out as the test portion; random_state fixes the seed of the split
percentAvalia = 0.33
xD, xI, yD, yI = train_test_split(
    x, y,
    test_size=percentAvalia,
    random_state=1,
)

## Classificação com/sem seleção de atributos

In [13]:
#Classifier: a single linear SVM instance, refit for each experiment
svmLin = svm.SVC(C=100, kernel='linear', decision_function_shape='ovr')

#Experiment 1 - no selection: train and evaluate on the whole candidate list `atributos`
svmLin.fit(xD[:, atributos], yD)
yEst = svmLin.predict(xI[:, atributos])
kappa_sem = cohen_kappa_score(yI, yEst)

#Experiment 2 - with selection: train and evaluate on the selected columns only
xD_sub = xD[:, selAtributos]
xI_sub = xI[:, selAtributos]
svmLin.fit(xD_sub, yD)
yEst = svmLin.predict(xI_sub)
kappa_com = cohen_kappa_score(yI, yEst)

#Report Cohen's kappa for both experiments
print('Kappa sem seleção de atributos: ',kappa_sem)
print('Kappa com seleção de atributos: ',kappa_com)

Kappa sem seleção de atributos:  0.6691578814702948
Kappa com seleção de atributos:  0.6763145368915843
