# Tratamento de dados do multiple features dataset

In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
from google.colab import drive
import argparse
import sys,os
from sklearn import preprocessing
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
import seaborn as sns

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


  import pandas.util.testing as tm


In [2]:
## Importando os dados

dir_remoto = "/content/gdrive/My Drive/"
dir_local = os.getcwd() # path para rodar em máquina local ao invés do colab

kar = pd.read_csv(os.path.join(dir_remoto,'data/mfeat-kar'),  delim_whitespace=True, header= None)
fou = pd.read_csv(os.path.join(dir_remoto,'data/mfeat-fou'),  delim_whitespace=True, header= None)
fac = pd.read_csv(os.path.join(dir_remoto,'data/mfeat-fac'),  delim_whitespace=True, header= None)

print("mfeat-kar")
print(kar.shape, kar.head())
print("mfeat-fou")
print(fou.shape, fou.head())
print("mfeat-fac")
print(fac.shape, fac.head())

mfeat-kar
(2000, 64)           0          1          2   ...        61        62        63
0 -10.297008 -11.666789  11.560669  ...  1.814691 -1.351353 -0.473910
1  -5.036009 -12.885333   0.161155  ...  0.485792  0.642451  0.613107
2  -9.639157  -6.655898   0.388687  ...  1.097748  0.827182 -1.767840
3  -6.650375  -7.043851   4.104350  ... -0.913552 -0.771735  0.304992
4 -10.664524 -10.974133   0.194391  ...  0.298318 -0.943213  1.149847

[5 rows x 64 columns]
mfeat-fou
(2000, 76)          0         1         2   ...        73        74        75
0  0.065882  0.197312  0.103826  ...  0.394366  0.049971  0.344871
1  0.049142  0.175971  0.105515  ...  0.445277  0.083978  0.354092
2  0.034172  0.227649  0.108766  ...  0.445029  0.071234  0.261465
3  0.062336  0.217979  0.080243  ...  0.408291  0.063010  0.401376
4  0.061970  0.198358  0.111239  ...  0.434701  0.069218  0.405403

[5 rows x 76 columns]
mfeat-fac
(2000, 216)    0    1    2    3    4    5    6    ...  209  210  211  212  213  

## Normalizando os dados

In [3]:
def normalization(data):
  x = data.values #returns a numpy array
  min_max_scaler = preprocessing.MinMaxScaler()
  x_scaled = min_max_scaler.fit_transform(x)
  datapd = pd.DataFrame(x_scaled)
  return datapd


kar = normalization(kar)
fou = normalization(fou)
fac = normalization(fac)

print("mfeat-kar")
print(kar.shape, kar.head())
print("mfeat-fou")
print(fou.shape, fou.head())
print("mfeat-fac")
print(fac.shape, fac.head())

mfeat-kar
(2000, 64)          0         1         2   ...        61        62        63
0  0.191173  0.128265  0.889728  ...  0.836661  0.219735  0.433051
1  0.354385  0.089040  0.498033  ...  0.618422  0.565718  0.595312
2  0.211582  0.289563  0.505851  ...  0.718921  0.597774  0.239905
3  0.304303  0.277075  0.633523  ...  0.388613  0.320316  0.549319
4  0.179772  0.150561  0.499175  ...  0.587634  0.290560  0.675432

[5 rows x 64 columns]
mfeat-fou
(2000, 76)          0         1         2   ...        73        74        75
0  0.155955  0.247037  0.148847  ...  0.628352  0.095886  0.588917
1  0.113125  0.219041  0.151407  ...  0.711892  0.162660  0.604874
2  0.074823  0.286835  0.156336  ...  0.711485  0.137636  0.444575
3  0.146884  0.274150  0.113103  ...  0.651201  0.121489  0.686703
4  0.145945  0.248410  0.160085  ...  0.694537  0.133679  0.693673

[5 rows x 76 columns]
mfeat-fac
(2000, 216)         0         1         2    ...       213       214       215
0  0.069196  0.3202

# Matriz de dissimilaridade

In [4]:
def dissimilaridade(data):
  pairwise = pd.DataFrame(
      squareform(pdist(data)),
      columns = data.index,
      index = data.index
  )
  return pairwise

# default metric: euclidian distance para a função pdist

matriz_kar = dissimilaridade(kar)
matriz_fou = dissimilaridade(fou)
matriz_fac = dissimilaridade(fac)


print("mfeat-kar")
print(matriz_kar.shape, matriz_kar)
print("mfeat-fou")
print(matriz_fou.shape, matriz_fou)
print("mfeat-fac")
print(matriz_fac.shape, matriz_fac)

mfeat-kar
(2000, 2000)           0         1         2     ...      1997      1998      1999
0     0.000000  1.270193  1.397034  ...  1.605476  1.375733  1.655586
1     1.270193  0.000000  1.470758  ...  1.616383  1.314507  1.666247
2     1.397034  1.470758  0.000000  ...  1.495470  1.353730  1.698088
3     1.330976  1.379091  1.438538  ...  1.469890  1.398961  1.585118
4     1.333689  1.141953  1.638747  ...  1.773640  1.408295  1.821250
...        ...       ...       ...  ...       ...       ...       ...
1995  1.589321  1.776687  1.552311  ...  1.389383  1.157987  1.394733
1996  1.567723  1.378748  1.741908  ...  1.686252  1.467261  1.342234
1997  1.605476  1.616383  1.495470  ...  0.000000  1.246467  1.742059
1998  1.375733  1.314507  1.353730  ...  1.246467  0.000000  1.466838
1999  1.655586  1.666247  1.698088  ...  1.742059  1.466838  0.000000

[2000 rows x 2000 columns]
mfeat-fou
(2000, 2000)           0         1         2     ...      1997      1998      1999
0     0.000000  

In [5]:
## salvando as matrizes

matriz_kar.to_csv(os.path.join(dir_remoto,'data/matriz_kar.csv'), sep = ";")
matriz_fou.to_csv(os.path.join(dir_remoto,'data/matriz_fou.csv'), sep = ";")
matriz_fac.to_csv(os.path.join(dir_remoto,'data/matriz_fac.csv'), sep = ";")


## Criando classes

In [6]:
# Criando as classes para cada dataset

finais = []
for i in [kar, fou, fac]:
  # Separando as classes
  c_200 = i[0:200]
  c_400 = i[200:400]
  c_600 = i[400:600]
  c_800 = i[600:800]
  c_1000 = i[800:1000]
  c_1200 = i[1000:1200]
  c_1400 = i[1200:1400]
  c_1600 = i[1400:1600]
  c_1800 = i[1600:1800]
  c_2000 = i[1800:2000]

  # Atribuindo as classes
  c_200["class"] = 0
  c_400["class"] = 1
  c_600["class"] = 2
  c_800["class"] = 3
  c_1000["class"] = 4
  c_1200["class"] = 5
  c_1400["class"] = 6
  c_1600["class"] = 7
  c_1800["class"] = 8
  c_2000["class"] = 9

  # juntando o dataset novamente
  lista_datasets = [c_400,c_600,c_800,c_1000,c_1200,c_1400,c_1600,c_1800,c_2000]
  final = c_200
  for j in lista_datasets:
    final = final.append(j)
 # salvando datasets finais
  finais.append(final)

kar_final = finais[0]
fou_final = finais[1]
fac_final = finais[2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [7]:
print("mfeat-kar")
print(kar_final.shape, kar_final.head())
print("mfeat-fou")
print(fou_final.shape, fou_final.head())
print("mfeat-fac")
print(fac_final.shape, fac_final.head())

mfeat-kar
(2000, 65)           0         1         2         3  ...        61        62        63  class
0  0.191173  0.128265  0.889728  0.392640  ...  0.836661  0.219735  0.433051      0
1  0.354385  0.089040  0.498033  0.478535  ...  0.618422  0.565718  0.595312      0
2  0.211582  0.289563  0.505851  0.404323  ...  0.718921  0.597774  0.239905      0
3  0.304303  0.277075  0.633523  0.384240  ...  0.388613  0.320316  0.549319      0
4  0.179772  0.150561  0.499175  0.474083  ...  0.587634  0.290560  0.675432      0

[5 rows x 65 columns]
mfeat-fou
(2000, 77)           0         1         2         3  ...        73        74        75  class
0  0.155955  0.247037  0.148847  0.352688  ...  0.628352  0.095886  0.588917      0
1  0.113125  0.219041  0.151407  0.294052  ...  0.711892  0.162660  0.604874      0
2  0.074823  0.286835  0.156336  0.159348  ...  0.711485  0.137636  0.444575      0
3  0.146884  0.274150  0.113103  0.378748  ...  0.651201  0.121489  0.686703      0
4  0.145945