<a href="https://colab.research.google.com/github/rafaart/DataScienceCO/blob/develop/cie%CC%82ncia_de_dados_ba%CC%81sica.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster.vq import whiten

## Carregando datasets de exemplo

### IRIS

Conjunto formado por medições de comprimento e largura da pétala e da sépala de flores do gênero Iris (espécies setosa, versicolor e virginica).

- 4 variáveis aleatórias numéricas (mais o rótulo categórico `species`)
- 150 instâncias / realizações destas variáveis

In [None]:
# Load seaborn's 'iris' example dataset as a pandas DataFrame
# (seaborn fetches it from its online data repository and caches it locally).
D_iris = sns.load_dataset('iris')

In [None]:
# Last expression of the cell: the notebook renders the frame as the cell output.
D_iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


### Diamonds

Conjunto formado para análise do corte, cor, claridade (pureza), preço e outros atributos de diamantes.

- 10 variáveis aleatórias
- 53940 instâncias / realizações

In [None]:
# Load seaborn's 'diamonds' example dataset as a pandas DataFrame
# (seaborn fetches it from its online data repository and caches it locally).
D_diamonds = sns.load_dataset('diamonds')

In [None]:
# Last expression of the cell: the notebook renders the frame as the cell output.
D_diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


## Algumas análises estatísticas

### Média

In [None]:
# Column-wise arithmetic mean; numeric_only=True leaves out the non-numeric `species` column.
D_iris.mean(numeric_only=True)

sepal_length    5.843333
sepal_width     3.057333
petal_length    3.758000
petal_width     1.199333
dtype: float64

### Desvio padrão

In [None]:
# Column-wise sample standard deviation (pandas defaults to ddof=1) of the numeric columns.
D_iris.std(numeric_only=True)

sepal_length    0.828066
sepal_width     0.435866
petal_length    1.765298
petal_width     0.762238
dtype: float64

### Moda

In [None]:
# Most frequent value(s) of each numeric column. A column with several modes yields
# several rows; columns with fewer modes are padded with NaN.
D_iris.mode(numeric_only=True)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.0,3.0,1.4,0.2
1,,,1.5,


### Mediana

In [None]:
# Column-wise median (50th percentile) of the numeric columns.
D_iris.median(numeric_only=True)

sepal_length    5.80
sepal_width     3.00
petal_length    4.35
petal_width     1.30
dtype: float64

### Quartil (primeiro quartil, 25%)

In [None]:
# First quartile (q = 0.25) of each numeric column; pandas interpolates linearly by default.
D_iris.quantile(0.25, numeric_only=True)

sepal_length    5.1
sepal_width     2.8
petal_length    1.6
petal_width     0.3
Name: 0.25, dtype: float64

### Covariância

In [None]:
# Pairwise sample covariance matrix of the numeric columns. These are covariances,
# not correlation coefficients, so the values depend on the scale of each feature.
D_iris.cov(numeric_only=True)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,0.685694,-0.042434,1.274315,0.516271
sepal_width,-0.042434,0.189979,-0.329656,-0.121639
petal_length,1.274315,-0.329656,3.116278,1.295609
petal_width,0.516271,-0.121639,1.295609,0.581006


## Exemplos de normalização

### Padronização

In [None]:
# Z-score standardisation, (x - mean) / std, applied to the numeric feature columns only.
# Subtracting the numeric-only mean Series from the whole frame would also align against
# the string column `species`: that column would become all-NaN and the columns would be
# re-sorted alphabetically. Non-numeric columns are therefore carried over unchanged.
iris_numeric = D_iris.select_dtypes(include='number')
D_iris_stdNorm = D_iris.copy()
D_iris_stdNorm[iris_numeric.columns] = (iris_numeric - iris_numeric.mean()) / iris_numeric.std()

In [None]:
# Last expression of the cell: the notebook renders the standardised frame as the cell output.
D_iris_stdNorm

Unnamed: 0,petal_length,petal_width,sepal_length,sepal_width,species
0,-1.335752,-1.311052,-0.897674,1.015602,
1,-1.335752,-1.311052,-1.139200,-0.131539,
2,-1.392399,-1.311052,-1.380727,0.327318,
3,-1.279104,-1.311052,-1.501490,0.097889,
4,-1.335752,-1.311052,-1.018437,1.245030,
...,...,...,...,...,...
145,0.816859,1.443994,1.034539,-0.131539,
146,0.703564,0.919223,0.551486,-1.278680,
147,0.816859,1.050416,0.793012,-0.131539,
148,0.930154,1.443994,0.430722,0.786174,


In [None]:
# Covariance of the standardised features. Every column now has unit sample variance,
# so the diagonal is 1 and the off-diagonal entries are the correlation coefficients
# of the original features.
D_iris_stdNorm.cov(numeric_only=True)

Unnamed: 0,petal_length,petal_width,sepal_length,sepal_width
petal_length,1.0,0.962865,0.871754,-0.42844
petal_width,0.962865,1.0,0.817941,-0.366126
sepal_length,0.871754,0.817941,1.0,-0.11757
sepal_width,-0.42844,-0.366126,-0.11757,1.0


### Normalização (hard limits)

In [None]:
# Min-max ("hard limits") normalisation, (x - min) / (max - min), which maps every
# numeric feature onto [0, 1]. Only the numeric columns are rescaled: running the
# arithmetic on the whole frame would align against the string column `species`, turning
# it into an all-NaN column and re-sorting the columns alphabetically.
# NOTE: a constant column (max == min) would produce NaN (0 / 0); iris has none.
iris_numeric = D_iris.select_dtypes(include='number')
iris_min = iris_numeric.min()
iris_range = iris_numeric.max() - iris_min
D_iris_hardLim = D_iris.copy()
D_iris_hardLim[iris_numeric.columns] = (iris_numeric - iris_min) / iris_range

In [None]:
# Last expression of the cell: the notebook renders the min-max scaled frame as the cell output.
D_iris_hardLim

Unnamed: 0,petal_length,petal_width,sepal_length,sepal_width,species
0,0.067797,0.041667,0.222222,0.625000,
1,0.067797,0.041667,0.166667,0.416667,
2,0.050847,0.041667,0.111111,0.500000,
3,0.084746,0.041667,0.083333,0.458333,
4,0.067797,0.041667,0.194444,0.666667,
...,...,...,...,...,...
145,0.711864,0.916667,0.666667,0.416667,
146,0.677966,0.750000,0.555556,0.208333,
147,0.711864,0.791667,0.611111,0.416667,
148,0.745763,0.916667,0.527778,0.583333,


In [None]:
# Sample covariance matrix of the min-max scaled numeric columns.
D_iris_hardLim.cov(numeric_only=True)

Unnamed: 0,petal_length,petal_width,sepal_length,sepal_width
petal_length,0.089522,0.091498,0.059996,-0.023281
petal_width,0.091498,0.100869,0.059754,-0.021118
sepal_length,0.059996,0.059754,0.052908,-0.004911
sepal_width,-0.023281,-0.021118,-0.004911,0.032983


### Branqueamento

In [None]:
# scipy.cluster.vq.whiten divides each feature (column) by its standard deviation so that
# every column has unit variance. It neither centres the data nor removes the correlation
# between features.
# The numeric features are selected by dtype instead of converting the mixed frame to an
# object array and slicing it with hard-coded column positions.
iris_numeric = D_iris.select_dtypes(include='number')
D_white_np = whiten(iris_numeric.to_numpy(dtype=np.float32))
# Column labels are left as the default integer positions (0..3).
D_white = pd.DataFrame(D_white_np)

In [None]:
# Sample covariance of the rescaled features. The diagonal is n / (n - 1) = 150 / 149
# (about 1.0067) rather than exactly 1 because whiten scales by the population standard
# deviation (ddof=0) while DataFrame.cov uses ddof=1. The off-diagonal entries are
# non-zero: the features are still correlated after this rescaling.
D_white.cov()

Unnamed: 0,0,1,2,3
0,1.006711,-0.118359,0.877604,0.823431
1,-0.118359,1.006711,-0.431316,-0.368583
2,0.877604,-0.431316,1.006711,0.969328
3,0.823431,-0.368583,0.969328,1.006711
