In [1]:
import pandas as pd
import numpy as np

# DataFrame Calculations

Hoje veremos como criar novas colunas em um DataFrame. Até o momento, já criamos colunas através de condicionais (usando `.loc` ou `np.where`) e através dos métodos `.astype()`, `.map()` e `.fillna()`.

A criação de colunas é extremamente simples: basta lembrarmos que um `DataFrame` se comporta como um dicionário de `Series`! Podemos criar novas colunas da mesma forma que adicionamos chaves a um dicionário: utilizando o operador de atribuição (*assignment*), `=`.

Para a aula de hoje utilizaremos um novo dataset: os dados do artigo *Sleep in Mammals: Ecological and Constitutional Correlates*, contendo informações sobre o sono e a vida de certos animais.

## Lendo o DataFrame

Vamos iniciar carregando o DataFrame, olhando a documentação e os dados.

Documentação: 
http://lib.stat.cmu.edu/datasets/sleep

In [2]:
# Tab-separated sleep dataset used throughout the notebook.
SLEEP_DATA_URL = 'http://www.statsci.org/data/general/sleep.txt'
tb_animals = pd.read_csv(SLEEP_DATA_URL, sep='\t')

In [3]:
tb_animals.describe()

Unnamed: 0,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,Danger
count,62.0,62.0,48.0,50.0,58.0,58.0,58.0,62.0,62.0,62.0
mean,198.789984,283.134194,8.672917,1.972,10.532759,19.877586,142.353448,2.870968,2.419355,2.612903
std,899.158011,930.278942,3.666452,1.442651,4.60676,18.206255,146.805039,1.476414,1.604792,1.441252
min,0.005,0.14,2.1,0.0,2.6,2.0,12.0,1.0,1.0,1.0
25%,0.6,4.25,6.25,0.9,8.05,6.625,35.75,2.0,1.0,1.0
50%,3.3425,17.25,8.35,1.8,10.45,15.1,79.0,3.0,2.0,2.0
75%,48.2025,166.0,11.0,2.55,13.2,27.75,207.5,4.0,4.0,4.0
max,6654.0,5712.0,17.9,6.6,19.9,100.0,645.0,5.0,5.0,5.0


In [4]:
tb_animals.head()

Unnamed: 0,Species,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,Danger
0,Africanelephant,6654.0,5712.0,,,3.3,38.6,645.0,3,5,3
1,Africangiantpouchedrat,1.0,6.6,6.3,2.0,8.3,4.5,42.0,3,1,3
2,ArcticFox,3.385,44.5,,,12.5,14.0,60.0,1,1,1
3,Arcticgroundsquirrel,0.92,5.7,,,16.5,,25.0,5,2,3
4,Asianelephant,2547.0,4603.0,2.1,1.8,3.9,69.0,624.0,3,5,4


# Cálculos com DataFrames

A forma mais simples de criarmos novas colunas é a partir de constantes, listas ou cálculos com outras colunas. Vamos ver como realizar cada um desses passos.

## Colunas constantes

Podemos criar uma coluna com valor constante simplesmente atribuindo um número à coluna.

In [10]:
tb_animals['Gestation'].mean()

142.35344827586206

In [8]:
tb_animals['new_column'] = tb_animals['Gestation'].mean()

In [9]:
tb_animals.tail()

Unnamed: 0,Species,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,Danger,new_column
57,Treehyrax,2.0,12.3,4.9,0.5,5.4,7.5,200.0,3,1,3,142.353448
58,Treeshrew,0.104,2.5,13.2,2.6,15.8,2.3,46.0,3,2,2,142.353448
59,Vervet,4.19,58.0,9.7,0.6,10.3,24.0,210.0,4,3,4,142.353448
60,Wateropossum,3.5,3.9,12.8,6.6,19.4,3.0,14.0,2,1,1,142.353448
61,Yellow-belliedmarmot,4.05,17.0,,,,13.0,38.0,3,1,1,142.353448


In [11]:
tb_animals['new_column']

0     142.353448
1     142.353448
2     142.353448
3     142.353448
4     142.353448
         ...    
57    142.353448
58    142.353448
59    142.353448
60    142.353448
61    142.353448
Name: new_column, Length: 62, dtype: float64

In [12]:
tb_animals = tb_animals.drop('new_column', axis = 1)

## Criando colunas com `lists`

Podemos criar uma coluna a partir de uma lista (ou qualquer outro iterável). O Pandas interpretará o iterável como um `Series`, ou seja, cada elemento dele será visto como uma nova linha da nossa tabela. Logo, precisamos que o iterável tenha comprimento igual ao tamanho da nossa tabela.

In [14]:
tb_animals.shape

(62, 11)

In [13]:
# One integer per row of the table: 0, 1, ..., n_rows - 1.
list(range(len(tb_animals)))

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61]

In [15]:
# The list has exactly one element per row, so it can be assigned as a column.
tb_animals['id_linha'] = list(range(len(tb_animals)))

In [17]:
tb_animals.head()

Unnamed: 0,Species,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,Danger,id_linha
0,Africanelephant,6654.0,5712.0,,,3.3,38.6,645.0,3,5,3,0
1,Africangiantpouchedrat,1.0,6.6,6.3,2.0,8.3,4.5,42.0,3,1,3,1
2,ArcticFox,3.385,44.5,,,12.5,14.0,60.0,1,1,1,2
3,Arcticgroundsquirrel,0.92,5.7,,,16.5,,25.0,5,2,3,3
4,Asianelephant,2547.0,4603.0,2.1,1.8,3.9,69.0,624.0,3,5,4,4


In [18]:
tb_animals = tb_animals.drop(columns = 'id_linha')

In [19]:
tb_animals.head()

Unnamed: 0,Species,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,Danger
0,Africanelephant,6654.0,5712.0,,,3.3,38.6,645.0,3,5,3
1,Africangiantpouchedrat,1.0,6.6,6.3,2.0,8.3,4.5,42.0,3,1,3
2,ArcticFox,3.385,44.5,,,12.5,14.0,60.0,1,1,1
3,Arcticgroundsquirrel,0.92,5.7,,,16.5,,25.0,5,2,3
4,Asianelephant,2547.0,4603.0,2.1,1.8,3.9,69.0,624.0,3,5,4


In [20]:
# Intentional failure: a list assigned as a column must have one value per row.
# The error is caught and printed so "Restart & Run All" does not stop here,
# while the message is still shown to the reader.
try:
    tb_animals['erro'] = [1, 2, 3]
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: Length of values (3) does not match length of index (62)

## Criando colunas a partir de contas

Podemos utilizar os operadores matemáticos para realizar operações sobre as colunas de um DataFrame. A operação será aplicada a cada elemento da coluna - como em vetores do Numpy.

In [24]:
tb_animals['BrainWt']

0     5712.0
1        6.6
2       44.5
3        5.7
4     4603.0
       ...  
57      12.3
58       2.5
59      58.0
60       3.9
61      17.0
Name: BrainWt, Length: 62, dtype: float64

In [23]:
tb_animals['BrainWt']/1000

0     5.7120
1     0.0066
2     0.0445
3     0.0057
4     4.6030
       ...  
57    0.0123
58    0.0025
59    0.0580
60    0.0039
61    0.0170
Name: BrainWt, Length: 62, dtype: float64

In [25]:
tb_animals['BrainWt_kg'] = tb_animals['BrainWt']/1000

In [26]:
tb_animals.describe()

Unnamed: 0,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,Danger,BrainWt_kg
count,62.0,62.0,48.0,50.0,58.0,58.0,58.0,62.0,62.0,62.0,62.0
mean,198.789984,283.134194,8.672917,1.972,10.532759,19.877586,142.353448,2.870968,2.419355,2.612903,0.283134
std,899.158011,930.278942,3.666452,1.442651,4.60676,18.206255,146.805039,1.476414,1.604792,1.441252,0.930279
min,0.005,0.14,2.1,0.0,2.6,2.0,12.0,1.0,1.0,1.0,0.00014
25%,0.6,4.25,6.25,0.9,8.05,6.625,35.75,2.0,1.0,1.0,0.00425
50%,3.3425,17.25,8.35,1.8,10.45,15.1,79.0,3.0,2.0,2.0,0.01725
75%,48.2025,166.0,11.0,2.55,13.2,27.75,207.5,4.0,4.0,4.0,0.166
max,6654.0,5712.0,17.9,6.6,19.9,100.0,645.0,5.0,5.0,5.0,5.712


## Cálculos entre Colunas

Podemos realizar operações entre colunas - da mesma forma que os operadores booleanos (`<`, `>`, `==`, etc) podem ser aplicados sobre uma coluna para criar uma coluna, os operadores matemáticos podem ser usados entre duas colunas para criar novas colunas.

In [31]:
tb_animals[['BrainWt_kg', 'BodyWt']]

Unnamed: 0,BrainWt_kg,BodyWt
0,5.7120,6654.000
1,0.0066,1.000
2,0.0445,3.385
3,0.0057,0.920
4,4.6030,2547.000
...,...,...
57,0.0123,2.000
58,0.0025,0.104
59,0.0580,4.190
60,0.0039,3.500


In [34]:
# Intentional failure: arithmetic between a numeric column and a string column
# is not defined. The error is caught and printed so the notebook keeps running.
try:
    tb_animals['BrainWt_kg'] / tb_animals['Species']
except TypeError as err:
    print(f'TypeError: {err}')

TypeError: unsupported operand type(s) for /: 'float' and 'str'

In [35]:
tb_animals['ratio_brain_body'] = tb_animals['BrainWt_kg']/tb_animals['BodyWt']

In [36]:
tb_animals['ratio_brain_body'].describe()

count    62.000000
mean      0.009624
std       0.008915
min       0.000858
25%       0.003103
50%       0.006611
75%       0.013668
max       0.039604
Name: ratio_brain_body, dtype: float64

In [37]:
tb_animals[tb_animals['ratio_brain_body']>0.03]

Unnamed: 0,Species,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,Danger,BrainWt_kg,ratio_brain_body
26,Groundsquirrel,0.101,4.0,10.4,3.4,13.8,9.0,28.0,5,1,3,0.004,0.039604
41,Owlmonkey,0.48,15.5,15.2,1.8,17.0,12.0,140.0,2,2,2,0.0155,0.032292


### Operadores Booleanos entre Colunas

Da mesma forma que podemos realizar a comparação de uma coluna com um valor, podemos criar comparações entre colunas:

In [38]:
tb_animals['ratio_brain_body']>0.01

0     False
1     False
2      True
3     False
4     False
      ...  
57    False
58     True
59     True
60    False
61    False
Name: ratio_brain_body, Length: 62, dtype: bool

In [40]:
# Element-wise comparison between two columns produces a boolean column.
# Rows where the sleep values are missing (NaN) come out as False.
tb_animals['dreamers'] = tb_animals['Dreaming'] > tb_animals['NonDreaming']

In [41]:
tb_animals.head()

Unnamed: 0,Species,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,Danger,BrainWt_kg,ratio_brain_body,dreamers
0,Africanelephant,6654.0,5712.0,,,3.3,38.6,645.0,3,5,3,5.712,0.000858,False
1,Africangiantpouchedrat,1.0,6.6,6.3,2.0,8.3,4.5,42.0,3,1,3,0.0066,0.0066,False
2,ArcticFox,3.385,44.5,,,12.5,14.0,60.0,1,1,1,0.0445,0.013146,False
3,Arcticgroundsquirrel,0.92,5.7,,,16.5,,25.0,5,2,3,0.0057,0.006196,False
4,Asianelephant,2547.0,4603.0,2.1,1.8,3.9,69.0,624.0,3,5,4,4.603,0.001807,False


In [42]:
tb_animals[tb_animals['dreamers']]

Unnamed: 0,Species,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,Danger,BrainWt_kg,ratio_brain_body,dreamers


In [43]:
tb_animals[tb_animals['Dreaming'] > tb_animals['NonDreaming']]

Unnamed: 0,Species,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,Danger,BrainWt_kg,ratio_brain_body,dreamers


## Usando métodos de `strings` em colunas

A aplicação dos métodos de `str` é um pouco mais complexa, sintaticamente, que a utilização dos operadores: precisamos utilizar um atributo das `Series` para conseguir acessar os métodos.

In [44]:
tb_animals['Species'].head()

0           Africanelephant
1    Africangiantpouchedrat
2                 ArcticFox
3      Arcticgroundsquirrel
4             Asianelephant
Name: Species, dtype: object

In [45]:
# Intentional failure: string methods are not exposed directly on a Series.
# The error is caught and printed so the notebook keeps running past this cell.
try:
    tb_animals['Species'].lower()
except AttributeError as err:
    print(f'AttributeError: {err}')

AttributeError: 'Series' object has no attribute 'lower'

Para acessar os métodos de `strings` vamos utilizar o atributo `.str` das `Series`

In [46]:
tb_animals['Species'].str.lower()

0            africanelephant
1     africangiantpouchedrat
2                  arcticfox
3       arcticgroundsquirrel
4              asianelephant
               ...          
57                 treehyrax
58                 treeshrew
59                    vervet
60              wateropossum
61      yellow-belliedmarmot
Name: Species, Length: 62, dtype: object

In [47]:
tb_animals['lower_species'] = tb_animals['Species'].str.lower()

Além dos métodos básicos de `strings`, podemos utilizar funções de REGEX também! A sintaxe é a mesma: utilizaremos o atributo `.str` para acessar esses métodos.

Vamos começar com o método `.contains()` que retorna um vetor booleano determinando se um padrão foi encontrado ou não em cada linha de nossa coluna. 

In [49]:
tb_animals['lower_species'].head()

0           africanelephant
1    africangiantpouchedrat
2                 arcticfox
3      arcticgroundsquirrel
4             asianelephant
Name: lower_species, dtype: object

In [48]:
tb_animals['lower_species'].str.contains(r'monk|ape|man|gorilla|baboon|chimpanzee')

0     False
1     False
2     False
3     False
4     False
      ...  
57    False
58    False
59    False
60    False
61    False
Name: lower_species, Length: 62, dtype: bool

In [50]:
# Boolean flag: True when the lowercase species name contains any of the
# alternatives in the regex. This is a substring match on the name only.
tb_animals['id_primata'] = tb_animals['lower_species'].str.contains(r'monk|ape|man|gorilla|baboon|chimpanzee')

In [51]:
tb_animals[['id_primata', 'Species']]

Unnamed: 0,id_primata,Species
0,False,Africanelephant
1,False,Africangiantpouchedrat
2,False,ArcticFox
3,False,Arcticgroundsquirrel
4,False,Asianelephant
...,...,...
57,False,Treehyrax
58,False,Treeshrew
59,False,Vervet
60,False,Wateropossum


In [52]:
tb_animals.loc[tb_animals['id_primata'], 'Species']

5           Baboon
9       Chimpanzee
23         Gorilla
33             Man
41       Owlmonkey
42     Patasmonkey
49    Rhesusmonkey
Name: Species, dtype: object

In [53]:
# Count the flagged rows with the vectorised Series.sum() instead of iterating
# the column in Python; int() keeps the displayed result a plain integer.
int(tb_animals['id_primata'].sum())

7

Podemos utilizar o método `.findall()` para guardar a informação de qual parte do `string` deu *match* com nosso padrão:

In [54]:
tb_animals['lista_primata'] = tb_animals['lower_species'].str.findall(r'monk|ape|man|gorilla|baboon|chimpanzee')

In [55]:
tb_animals.head(10)

Unnamed: 0,Species,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,Danger,BrainWt_kg,ratio_brain_body,dreamers,lower_species,id_primata,lista_primata
0,Africanelephant,6654.0,5712.0,,,3.3,38.6,645.0,3,5,3,5.712,0.000858,False,africanelephant,False,[]
1,Africangiantpouchedrat,1.0,6.6,6.3,2.0,8.3,4.5,42.0,3,1,3,0.0066,0.0066,False,africangiantpouchedrat,False,[]
2,ArcticFox,3.385,44.5,,,12.5,14.0,60.0,1,1,1,0.0445,0.013146,False,arcticfox,False,[]
3,Arcticgroundsquirrel,0.92,5.7,,,16.5,,25.0,5,2,3,0.0057,0.006196,False,arcticgroundsquirrel,False,[]
4,Asianelephant,2547.0,4603.0,2.1,1.8,3.9,69.0,624.0,3,5,4,4.603,0.001807,False,asianelephant,False,[]
5,Baboon,10.55,179.5,9.1,0.7,9.8,27.0,180.0,4,4,4,0.1795,0.017014,False,baboon,True,[baboon]
6,Bigbrownbat,0.023,0.3,15.8,3.9,19.7,19.0,35.0,1,1,1,0.0003,0.013043,False,bigbrownbat,False,[]
7,Braziliantapir,160.0,169.0,5.2,1.0,6.2,30.4,392.0,4,5,4,0.169,0.001056,False,braziliantapir,False,[]
8,Cat,3.3,25.6,10.9,3.6,14.5,28.0,63.0,1,2,1,0.0256,0.007758,False,cat,False,[]
9,Chimpanzee,52.16,440.0,8.3,1.4,9.7,50.0,230.0,1,1,1,0.44,0.008436,False,chimpanzee,True,[chimpanzee]


O método `.findall()` retorna uma lista: se quisermos transformar essa lista em um string teremos que utilizar o método `.map()`. Vamos começar definindo uma função para selecionar o primeiro elemento de cada lista e utilizar o método `.map()` para aplicar essa função a nossa coluna.

In [57]:
# EXERCICIO
import re
def f(string):
    """Return the first primate keyword found in ``string``.

    The text is scanned left to right for ``monk``, ``ape``, ``man``,
    ``gorilla``, ``baboon`` or ``chimpanzee``. The first substring that matches
    is returned; ``'Não Primata'`` is returned when none of them occurs.
    """
    first_hit = re.search(r'monk|ape|man|gorilla|baboon|chimpanzee', string)
    return first_hit.group(0) if first_hit is not None else 'Não Primata'

tb_animals['lower_species'].map(f)

0     Não Primata
1     Não Primata
2     Não Primata
3     Não Primata
4     Não Primata
         ...     
57    Não Primata
58    Não Primata
59    Não Primata
60    Não Primata
61    Não Primata
Name: lower_species, Length: 62, dtype: object

In [58]:
def meu_lower(x):
    """Return ``x`` in lowercase, using the value's own ``.lower()`` method."""
    lowered = x.lower()
    return lowered

tb_animals['Species'].map(meu_lower)

0            africanelephant
1     africangiantpouchedrat
2                  arcticfox
3       arcticgroundsquirrel
4              asianelephant
               ...          
57                 treehyrax
58                 treeshrew
59                    vervet
60              wateropossum
61      yellow-belliedmarmot
Name: Species, Length: 62, dtype: object

In [60]:
meu_lower("Africanelephant")

'africanelephant'

In [59]:
tb_animals['Species']

0            Africanelephant
1     Africangiantpouchedrat
2                  ArcticFox
3       Arcticgroundsquirrel
4              Asianelephant
               ...          
57                 Treehyrax
58                 Treeshrew
59                    Vervet
60              Wateropossum
61      Yellow-belliedmarmot
Name: Species, Length: 62, dtype: object

## Ordenando valores

Podemos utilizar o método `.sort_values()` para ordenar um DataFrame por uma (ou mais) coluna.

In [63]:
tb_animals.sort_values(by='ratio_brain_body', ascending=False)

Unnamed: 0,Species,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,Danger,BrainWt_kg,ratio_brain_body,dreamers,lower_species,id_primata,lista_primata
26,Groundsquirrel,0.101,4.00,10.4,3.4,13.8,9.0,28.0,5,1,3,0.00400,0.039604,False,groundsquirrel,False,[]
41,Owlmonkey,0.480,15.50,15.2,1.8,17.0,12.0,140.0,2,2,2,0.01550,0.032292,False,owlmonkey,True,[monk]
31,Lessershort-tailedshrew,0.005,0.14,7.7,1.4,9.1,2.6,21.5,5,2,4,0.00014,0.028000,False,lessershort-tailedshrew,False,[]
49,Rhesusmonkey,6.800,179.00,8.4,1.2,9.6,29.0,164.0,2,3,2,0.17900,0.026324,False,rhesusmonkey,True,[monk]
32,Littlebrownbat,0.010,0.25,17.9,2.0,19.9,24.0,50.0,1,1,1,0.00025,0.025000,False,littlebrownbat,False,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,Wateropossum,3.500,3.90,12.8,6.6,19.4,3.0,14.0,2,1,1,0.00390,0.001114,False,wateropossum,False,[]
7,Braziliantapir,160.000,169.00,5.2,1.0,6.2,30.4,392.0,4,5,4,0.16900,0.001056,False,braziliantapir,False,[]
44,Pig,192.000,180.00,6.5,1.9,8.4,27.0,115.0,4,4,4,0.18000,0.000937,False,pig,False,[]
11,Cow,465.000,423.00,3.2,0.7,3.9,30.0,281.0,5,5,5,0.42300,0.000910,False,cow,False,[]


In [64]:
tb_animals

Unnamed: 0,Species,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,Danger,BrainWt_kg,ratio_brain_body,dreamers,lower_species,id_primata,lista_primata
0,Africanelephant,6654.000,5712.0,,,3.3,38.6,645.0,3,5,3,5.7120,0.000858,False,africanelephant,False,[]
1,Africangiantpouchedrat,1.000,6.6,6.3,2.0,8.3,4.5,42.0,3,1,3,0.0066,0.006600,False,africangiantpouchedrat,False,[]
2,ArcticFox,3.385,44.5,,,12.5,14.0,60.0,1,1,1,0.0445,0.013146,False,arcticfox,False,[]
3,Arcticgroundsquirrel,0.920,5.7,,,16.5,,25.0,5,2,3,0.0057,0.006196,False,arcticgroundsquirrel,False,[]
4,Asianelephant,2547.000,4603.0,2.1,1.8,3.9,69.0,624.0,3,5,4,4.6030,0.001807,False,asianelephant,False,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,Treehyrax,2.000,12.3,4.9,0.5,5.4,7.5,200.0,3,1,3,0.0123,0.006150,False,treehyrax,False,[]
58,Treeshrew,0.104,2.5,13.2,2.6,15.8,2.3,46.0,3,2,2,0.0025,0.024038,False,treeshrew,False,[]
59,Vervet,4.190,58.0,9.7,0.6,10.3,24.0,210.0,4,3,4,0.0580,0.013842,False,vervet,False,[]
60,Wateropossum,3.500,3.9,12.8,6.6,19.4,3.0,14.0,2,1,1,0.0039,0.001114,False,wateropossum,False,[]


Lembrando que os métodos do DataFrame não alteram o objeto original! Se quisermos guardar nosso resultado precisamos fazê-lo explicitamente:

In [65]:
tb_animals = tb_animals.sort_values(by=['Predation', 'ratio_brain_body'], ascending=False)

In [66]:
tb_animals.head()

Unnamed: 0,Species,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,Danger,BrainWt_kg,ratio_brain_body,dreamers,lower_species,id_primata,lista_primata
26,Groundsquirrel,0.101,4.0,10.4,3.4,13.8,9.0,28.0,5,1,3,0.004,0.039604,False,groundsquirrel,False,[]
31,Lessershort-tailedshrew,0.005,0.14,7.7,1.4,9.1,2.6,21.5,5,2,4,0.00014,0.028,False,lessershort-tailedshrew,False,[]
10,Chinchilla,0.425,6.4,11.0,1.5,12.5,7.0,112.0,5,4,4,0.0064,0.015059,False,chinchilla,False,[]
52,Roedeer,14.83,98.2,,,2.6,17.0,150.0,5,5,5,0.0982,0.006622,False,roedeer,False,[]
3,Arcticgroundsquirrel,0.92,5.7,,,16.5,,25.0,5,2,3,0.0057,0.006196,False,arcticgroundsquirrel,False,[]


## Métodos de agregação entre colunas

Podemos utilizar os métodos de agregação para criar novas colunas: basta mudar o eixo ao longo do qual a operação é realizada!

In [69]:
tb_animals['Predation'].mean(axis = 0)

2.870967741935484

In [67]:
tb_animals[['Predation', 'Exposure', 'Danger']].mean(axis=0)

Predation    2.870968
Exposure     2.419355
Danger       2.612903
dtype: float64

In [71]:
tb_animals[['Predation', 'Exposure', 'Danger']]

Unnamed: 0,Predation,Exposure,Danger
26,5,1,3
31,5,2,4
10,5,4,4
52,5,5,5
3,5,2,3
...,...,...,...
24,1,3,1
25,1,1,1
23,1,4,1
29,1,1,1


In [70]:
tb_animals[['Predation', 'Exposure', 'Danger']].mean(axis=1)

26    3.000000
31    3.666667
10    4.333333
52    5.000000
3     3.333333
        ...   
24    1.666667
25    1.000000
23    2.000000
29    1.000000
19    1.000000
Length: 62, dtype: float64

In [72]:
tb_animals['risco'] = tb_animals[['Predation', 'Exposure', 'Danger']].mean(axis=1)

In [73]:
tb_animals[['Predation', 'Exposure', 'Danger', 'risco']].mean(axis=0)

Predation    2.870968
Exposure     2.419355
Danger       2.612903
risco        2.634409
dtype: float64

# Cálculos Condicionais

Podemos utilizar o atributo `.loc` para criar colunas condicionais. Vamos começar com um exemplo simples: criando uma coluna a partir de uma constante.

## Colunas Condicionais constantes

In [74]:
tb_animals['flag_alto_risco'] = 0

In [75]:
tb_animals

Unnamed: 0,Species,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,Danger,BrainWt_kg,ratio_brain_body,dreamers,lower_species,id_primata,lista_primata,risco,flag_alto_risco
26,Groundsquirrel,0.101,4.00,10.4,3.4,13.8,9.0,28.0,5,1,3,0.00400,0.039604,False,groundsquirrel,False,[],3.000000,0
31,Lessershort-tailedshrew,0.005,0.14,7.7,1.4,9.1,2.6,21.5,5,2,4,0.00014,0.028000,False,lessershort-tailedshrew,False,[],3.666667,0
10,Chinchilla,0.425,6.40,11.0,1.5,12.5,7.0,112.0,5,4,4,0.00640,0.015059,False,chinchilla,False,[],4.333333,0
52,Roedeer,14.830,98.20,,,2.6,17.0,150.0,5,5,5,0.09820,0.006622,False,roedeer,False,[],5.000000,0
3,Arcticgroundsquirrel,0.920,5.70,,,16.5,,25.0,5,2,3,0.00570,0.006196,False,arcticgroundsquirrel,False,[],3.333333,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24,Grayseal,85.000,325.00,4.7,1.5,6.2,41.0,310.0,1,3,1,0.32500,0.003824,False,grayseal,False,[],1.666667,0
25,Graywolf,36.330,119.50,,,13.0,16.2,63.0,1,1,1,0.11950,0.003289,False,graywolf,False,[],1.000000,0
23,Gorilla,207.000,406.00,,,12.0,39.3,252.0,1,4,1,0.40600,0.001961,False,gorilla,True,[gorilla],2.000000,0
29,Jaguar,100.000,157.00,,,10.8,22.4,100.0,1,1,1,0.15700,0.001570,False,jaguar,False,[],1.000000,0


In [76]:
tb_animals.loc[tb_animals['risco']>=4, 'flag_alto_risco'] = 1

In [77]:
tb_animals

Unnamed: 0,Species,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,Danger,BrainWt_kg,ratio_brain_body,dreamers,lower_species,id_primata,lista_primata,risco,flag_alto_risco
26,Groundsquirrel,0.101,4.00,10.4,3.4,13.8,9.0,28.0,5,1,3,0.00400,0.039604,False,groundsquirrel,False,[],3.000000,0
31,Lessershort-tailedshrew,0.005,0.14,7.7,1.4,9.1,2.6,21.5,5,2,4,0.00014,0.028000,False,lessershort-tailedshrew,False,[],3.666667,0
10,Chinchilla,0.425,6.40,11.0,1.5,12.5,7.0,112.0,5,4,4,0.00640,0.015059,False,chinchilla,False,[],4.333333,1
52,Roedeer,14.830,98.20,,,2.6,17.0,150.0,5,5,5,0.09820,0.006622,False,roedeer,False,[],5.000000,1
3,Arcticgroundsquirrel,0.920,5.70,,,16.5,,25.0,5,2,3,0.00570,0.006196,False,arcticgroundsquirrel,False,[],3.333333,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24,Grayseal,85.000,325.00,4.7,1.5,6.2,41.0,310.0,1,3,1,0.32500,0.003824,False,grayseal,False,[],1.666667,0
25,Graywolf,36.330,119.50,,,13.0,16.2,63.0,1,1,1,0.11950,0.003289,False,graywolf,False,[],1.000000,0
23,Gorilla,207.000,406.00,,,12.0,39.3,252.0,1,4,1,0.40600,0.001961,False,gorilla,True,[gorilla],2.000000,0
29,Jaguar,100.000,157.00,,,10.8,22.4,100.0,1,1,1,0.15700,0.001570,False,jaguar,False,[],1.000000,0


In [78]:
# The frame also holds text and list columns (Species, lower_species,
# lista_primata). numeric_only=True limits the mean to numeric/boolean columns
# explicitly, instead of relying on pandas dropping the others, which warns
# in older versions and fails in recent ones.
tb_animals.groupby('flag_alto_risco').mean(numeric_only=True)

  tb_animals.groupby('flag_alto_risco').mean()


Unnamed: 0_level_0,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,Danger,BrainWt_kg,ratio_brain_body,dreamers,id_primata,risco
flag_alto_risco,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,162.586089,203.836,9.602778,2.366667,11.875,17.214634,103.914634,2.244444,1.577778,1.888889,0.203836,0.011467,0.0,0.111111,1.903704
1,294.623824,493.041176,5.883333,0.957143,6.314286,26.3,235.058824,4.529412,4.647059,4.529412,0.493041,0.004746,0.0,0.117647,4.568627


Um atributo muito útil para esse tipo de visualização é o `.T`: ele nos retorna o DataFrame transposto:

In [79]:
# numeric_only=True keeps the text and list columns out of the mean explicitly.
# .T transposes the result so each variable becomes a row and each group a column.
tb_animals.groupby('flag_alto_risco').mean(numeric_only=True).T

  tb_animals.groupby('flag_alto_risco').mean().T


flag_alto_risco,0,1
BodyWt,162.586089,294.623824
BrainWt,203.836,493.041176
NonDreaming,9.602778,5.883333
Dreaming,2.366667,0.957143
TotalSleep,11.875,6.314286
LifeSpan,17.214634,26.3
Gestation,103.914634,235.058824
Predation,2.244444,4.529412
Exposure,1.577778,4.647059
Danger,1.888889,4.529412


## Colunas Condicionais utilizando operações

In [80]:
tb_animals['max_risco'] = tb_animals[['Predation', 'Exposure', 'Danger']].max(axis = 1)

In [81]:
tb_animals.head()

Unnamed: 0,Species,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,Danger,BrainWt_kg,ratio_brain_body,dreamers,lower_species,id_primata,lista_primata,risco,flag_alto_risco,max_risco
26,Groundsquirrel,0.101,4.0,10.4,3.4,13.8,9.0,28.0,5,1,3,0.004,0.039604,False,groundsquirrel,False,[],3.0,0,5
31,Lessershort-tailedshrew,0.005,0.14,7.7,1.4,9.1,2.6,21.5,5,2,4,0.00014,0.028,False,lessershort-tailedshrew,False,[],3.666667,0,5
10,Chinchilla,0.425,6.4,11.0,1.5,12.5,7.0,112.0,5,4,4,0.0064,0.015059,False,chinchilla,False,[],4.333333,1,5
52,Roedeer,14.83,98.2,,,2.6,17.0,150.0,5,5,5,0.0982,0.006622,False,roedeer,False,[],5.0,1,5
3,Arcticgroundsquirrel,0.92,5.7,,,16.5,,25.0,5,2,3,0.0057,0.006196,False,arcticgroundsquirrel,False,[],3.333333,0,5


In [82]:
# risco_2: row-wise mean of the three risk scores, except that any animal with a
# maximum score of 5 is pinned to 5.
mean_risk = tb_animals[['Predation', 'Exposure', 'Danger']].mean(axis = 1)
below_max = tb_animals['max_risco'] < 5
at_max = tb_animals['max_risco'] == 5
tb_animals.loc[below_max, 'risco_2'] = mean_risk
tb_animals.loc[at_max, 'risco_2'] = 5

In [84]:
tb_animals[tb_animals['max_risco'] != 5].head()

Unnamed: 0,Species,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,...,BrainWt_kg,ratio_brain_body,dreamers,lower_species,id_primata,lista_primata,risco,flag_alto_risco,max_risco,risco_2
36,Mouse,0.023,0.4,11.9,1.3,13.2,3.2,19.0,4,1,...,0.0004,0.017391,False,mouse,False,[],2.666667,0,4,2.666667
5,Baboon,10.55,179.5,9.1,0.7,9.8,27.0,180.0,4,4,...,0.1795,0.017014,False,baboon,True,[baboon],4.0,1,4,4.0
59,Vervet,4.19,58.0,9.7,0.6,10.3,24.0,210.0,4,3,...,0.058,0.013842,False,vervet,False,[],3.666667,0,4,3.666667
42,Patasmonkey,10.0,115.0,10.0,0.9,10.9,20.2,170.0,4,4,...,0.115,0.0115,False,patasmonkey,True,[monk],4.0,1,4,4.0
37,Muskshrew,0.048,0.33,10.8,2.0,12.8,2.0,30.0,4,1,...,0.00033,0.006875,False,muskshrew,False,[],2.666667,0,4,2.666667


In [85]:
# Label each row from its risco_2 score: 4 or above is high risk, anything else low.
is_high_risk = tb_animals['risco_2'] >= 4
tb_animals['flag_alto_risco_2'] = is_high_risk.map({True: "Alto Risco", False: "Baixo Risco"})

In [87]:
tb_animals.groupby('flag_alto_risco_2').mean(numeric_only = True).T

flag_alto_risco_2,Alto Risco,Baixo Risco
BodyWt,555.411,16.130439
BrainWt,671.597143,84.165366
NonDreaming,6.335714,9.635294
Dreaming,1.1375,2.364706
TotalSleep,7.283333,11.995
LifeSpan,24.865,17.252632
Gestation,224.547619,95.702703
Predation,4.52381,2.02439
Exposure,4.238095,1.487805
Danger,4.285714,1.756098


In [88]:
tb_animals.loc[tb_animals['max_risco'] < 5, 'bla'] = tb_animals['Danger']/10
tb_animals.loc[tb_animals['max_risco'] == 5, 'bla'] = tb_animals['Danger'] * 5

In [90]:
tb_animals = tb_animals.drop(columns = 'bla')

# Quantis

Os quantis são pontos de corte em uma variável numérica, calculados para que uma determinada % das observações esteja abaixo deste ponto. Por exemplo, o quantil 0.5 (50%, ou *mediana*) da variável `BodyWt` é um número tal que 50% das observações têm `BodyWt` abaixo deste número.

Os quantis mais famosos são os **quartis**:

1. 0.25, ou primeiro quartil, onde 25% das observações estão abaixo do quantil;
1. 0.5, ou mediana, onde 50% das observações estão abaixo do quantil;
1. e 0.75, ou terceiro quartil, onde 75% das observações estão abaixo do quantil.

Além disso, muitas vezes usamos os quantis 0.05 e 0.95 para representar, respectivamente, os valores mais baixos e mais altos de uma variável.

In [91]:
tb_animals['BodyWt'].median()

3.3425

In [92]:
np.median(tb_animals['BodyWt'])

3.3425

In [93]:
tb_animals['BodyWt'].quantile(0.5)

3.3425

In [94]:
tb_animals['BodyWt'].quantile([0.25, 0.5, 0.75])

0.25     0.6000
0.50     3.3425
0.75    48.2025
Name: BodyWt, dtype: float64

In [95]:
62*0.25

15.5

Uma utilização comum dos quantis é a **discretização de variáveis contínuas**, ou seja, a criação de uma variável categórica (`string`) a partir de uma variável numérica.

In [96]:
# The three quartiles of BodyWt, computed in a single call and unpacked in order.
q25, q50, q75 = tb_animals['BodyWt'].quantile([0.25, 0.5, 0.75])
print(q25, q50, q75)

0.6000000000000001 3.3425 48.2025


In [98]:
q75

48.2025

In [99]:
# Discretise BodyWt into four weight categories using the quartiles as cut points.
# The statements run from the heaviest cut to the lightest on purpose: each later
# assignment overwrites the label of the lighter rows set by the previous one, so
# reordering these lines would change the result.
tb_animals.loc[tb_animals['BodyWt'] >= q75, 'cat_peso'] = 'Pesados'
tb_animals.loc[tb_animals['BodyWt'] < q75, 'cat_peso'] = 'Médios-Pesados'
tb_animals.loc[tb_animals['BodyWt'] < q50, 'cat_peso'] = 'Leves-Médios'
tb_animals.loc[tb_animals['BodyWt'] < q25, 'cat_peso'] = 'Leves'
# Number of animals in each category.
tb_animals['cat_peso'].value_counts()

Leves             16
Pesados           16
Médios-Pesados    15
Leves-Médios      15
Name: cat_peso, dtype: int64

In [100]:
# numeric_only=True keeps the text, list and label columns out of the mean
# explicitly. .T transposes the result so each weight category becomes a column.
tb_animals.groupby('cat_peso').mean(numeric_only=True).T

  tb_animals.groupby('cat_peso').mean().T


cat_peso,Leves,Leves-Médios,Médios-Pesados,Pesados
BodyWt,0.164125,1.578333,11.4612,757.9225
BrainWt,2.8325,11.133333,73.8,1014.6875
NonDreaming,10.95,8.292308,8.877778,5.34
Dreaming,2.28125,2.046154,1.833333,1.583333
TotalSleep,13.23125,10.793333,10.538462,7.164286
LifeSpan,7.65,15.215385,15.713333,38.26875
Gestation,50.75,67.714286,112.666667,327.2
Predation,2.875,2.666667,2.8,3.125
Exposure,1.4375,1.866667,2.6,3.75
Danger,2.25,2.4,2.6,3.1875


## Categorizando dados

A tarefa acima é tão comum que temos uma função específica para *cortar* uma variável numérica de acordo com seus quantis: a `pd.qcut()`

In [101]:
# Quantile-based binning: one label per bin, ordered from lightest to heaviest.
weight_labels = ['Leves', 'Leves-Médios', 'Médios-Pesados', 'Pesados']
tb_animals['BodyWt_Interval'] = pd.qcut(
    tb_animals['BodyWt'],
    q=len(weight_labels),
    labels=weight_labels,
)

In [108]:
 pd.qcut(tb_animals['BodyWt'], 4).value_counts()

(0.004, 0.6]        16
(48.202, 6654.0]    16
(0.6, 3.342]        15
(3.342, 48.202]     15
Name: BodyWt, dtype: int64

In [102]:
tb_animals['BodyWt_Interval'].value_counts()

Leves             16
Pesados           16
Leves-Médios      15
Médios-Pesados    15
Name: BodyWt_Interval, dtype: int64

In [107]:
tb_animals['Danger'].value_counts(normalize = True)

1    0.306452
2    0.225806
3    0.161290
4    0.161290
5    0.145161
Name: Danger, dtype: float64

Os intervalos entre quantis não são uniformes: no exemplo acima a categoria `Leves` tinha animais entre 0 Kg e 0.6 Kg enquanto a `Médios-Pesados` tinha animais entre 3.3 Kg e 48 Kg! Isso acontece pois, ao cortarmos através de quantis, estamos criando intervalos com número de observações uniforme - por consequência, sacrificamos a uniformidade entre intervalos.

Se quisermos *cortar* uma variável em intervalos iguais podemos utilizar a função `pd.cut`:

In [109]:
tb_animals['cat_risco'] = pd.cut(tb_animals['risco'], 3)

In [113]:
tb_animals['cat_risco'].value_counts()

(0.996, 2.333]    31
(3.667, 5.0]      17
(2.333, 3.667]    14
Name: cat_risco, dtype: int64

In [112]:
pd.cut(tb_animals['BodyWt'], 3).value_counts()

(-6.649, 2218.003]      60
(2218.003, 4436.002]     1
(4436.002, 6654.0]       1
Name: BodyWt, dtype: int64

Podemos utilizar a função `pd.cut` para criar uma categorização a partir de intervalos específicos através do argumento `bins = []`:

In [117]:
tb_animals['cat_risco'] = pd.cut(tb_animals['risco'], bins = [0, 2, 4, 5], labels = ['Risco Baixo', 'Risco Médio', 'Risco Alto'])

In [115]:
pd.cut(tb_animals['risco'], bins = [0, 2, 4, 5])

26    (2, 4]
31    (2, 4]
10    (4, 5]
52    (4, 5]
3     (2, 4]
       ...  
24    (0, 2]
25    (0, 2]
23    (0, 2]
29    (0, 2]
19    (0, 2]
Name: risco, Length: 62, dtype: category
Categories (3, interval[int64, right]): [(0, 2] < (2, 4] < (4, 5]]

# Colunas Agregadas

Vimos na última aula como podemos combinar os métodos `.groupby()` e `.merge()` para criar colunas agregadas em um DataFrame. Vamos aprender como o método `.transform()` facilita essa operação.

Vamos construir as colunas `rel_dreaming` e `rel_nondreaming` contendo a quantidade de sono relativo (por tipo de sono) em relação à categoria de risco de cada animal. Primeiro, construiremos estas colunas utilizando os métodos `.groupby()` e `.merge()`:

In [118]:
# Mean dreaming / non-dreaming sleep per risk category, one row per category.
sleep_means = {
    'avg_dreaming': ('Dreaming', 'mean'),
    'avg_nondreaming': ('NonDreaming', 'mean'),
}
tb_agg_animals = tb_animals.groupby('cat_risco').agg(**sleep_means)
tb_agg_animals

Unnamed: 0_level_0,avg_dreaming,avg_nondreaming
cat_risco,Unnamed: 1_level_1,Unnamed: 2_level_1
Risco Baixo,2.679167,9.9375
Risco Médio,1.588235,8.370588
Risco Alto,0.811111,5.071429


In [120]:
# Preview only: the merged frame is displayed but not assigned back to tb_animals.
tb_animals.merge(tb_agg_animals, on = 'cat_risco').tail()

Unnamed: 0,Species,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,...,risco,flag_alto_risco,max_risco,risco_2,flag_alto_risco_2,cat_peso,BodyWt_Interval,cat_risco,avg_dreaming,avg_nondreaming
57,Grayseal,85.0,325.0,4.7,1.5,6.2,41.0,310.0,1,3,...,1.666667,0,3,1.666667,Baixo Risco,Pesados,Pesados,Risco Baixo,2.679167,9.9375
58,Graywolf,36.33,119.5,,,13.0,16.2,63.0,1,1,...,1.0,0,1,1.0,Baixo Risco,Médios-Pesados,Médios-Pesados,Risco Baixo,2.679167,9.9375
59,Gorilla,207.0,406.0,,,12.0,39.3,252.0,1,4,...,2.0,0,4,2.0,Baixo Risco,Pesados,Pesados,Risco Baixo,2.679167,9.9375
60,Jaguar,100.0,157.0,,,10.8,22.4,100.0,1,1,...,1.0,0,1,1.0,Baixo Risco,Pesados,Pesados,Risco Baixo,2.679167,9.9375
61,Giantarmadillo,60.0,81.0,12.0,6.1,18.1,7.0,,1,1,...,1.0,0,1,1.0,Baixo Risco,Pesados,Pesados,Risco Baixo,2.679167,9.9375


In [121]:
# Attach the per-category averages to every row.
# Any aggregate columns left over from a previous run of this cell are dropped
# first; otherwise re-running would merge them twice and produce suffixed
# duplicates (`avg_dreaming_x`, `avg_dreaming_y`) that break the cells below.
tb_animals = (
    tb_animals
    .drop(columns=tb_agg_animals.columns, errors='ignore')
    .merge(tb_agg_animals, on='cat_risco')
)
tb_animals.head()

Unnamed: 0,Species,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,...,risco,flag_alto_risco,max_risco,risco_2,flag_alto_risco_2,cat_peso,BodyWt_Interval,cat_risco,avg_dreaming,avg_nondreaming
0,Groundsquirrel,0.101,4.0,10.4,3.4,13.8,9.0,28.0,5,1,...,3.0,0,5,5.0,Alto Risco,Leves,Leves,Risco Médio,1.588235,8.370588
1,Lessershort-tailedshrew,0.005,0.14,7.7,1.4,9.1,2.6,21.5,5,2,...,3.666667,0,5,5.0,Alto Risco,Leves,Leves,Risco Médio,1.588235,8.370588
2,Arcticgroundsquirrel,0.92,5.7,,,16.5,,25.0,5,2,...,3.333333,0,5,5.0,Alto Risco,Leves-Médios,Leves-Médios,Risco Médio,1.588235,8.370588
3,Guineapig,1.04,5.5,7.4,0.8,8.2,7.6,68.0,5,3,...,4.0,1,5,5.0,Alto Risco,Leves-Médios,Leves-Médios,Risco Médio,1.588235,8.370588
4,Mouse,0.023,0.4,11.9,1.3,13.2,3.2,19.0,4,1,...,2.666667,0,4,2.666667,Baixo Risco,Leves,Leves,Risco Médio,1.588235,8.370588


As colunas de sono relativo serão calculadas como a proporção entre o sono (por tipo) de cada indivíduo e o sono médio da sua categoria de risco:

In [122]:
# Relative sleep = the animal's own value divided by the mean of its risk category.
tb_animals['rel_dreaming'] = tb_animals['Dreaming'].div(tb_animals['avg_dreaming'])
tb_animals['rel_nondreaming'] = tb_animals['NonDreaming'].div(tb_animals['avg_nondreaming'])

# Show only complete rows, ordered from the lowest relative dreaming time.
(
    tb_animals[["Species", "cat_risco", "NonDreaming", "Dreaming", "rel_dreaming", "rel_nondreaming"]]
    .dropna()
    .sort_values("rel_dreaming")
)

Unnamed: 0,Species,cat_risco,NonDreaming,Dreaming,rel_dreaming,rel_nondreaming
40,Echidna,Risco Baixo,8.6,0.0,0.0,0.865409
15,Rockhyrax(Procaviahab),Risco Médio,4.9,0.5,0.314815,0.585383
13,Treehyrax,Risco Médio,4.9,0.5,0.314815,0.585383
37,Rockhyrax(Heterob),Risco Baixo,5.7,0.9,0.335925,0.573585
6,Vervet,Risco Médio,9.7,0.6,0.377778,1.158819
5,Baboon,Risco Médio,9.1,0.7,0.440741,1.08714
35,Galago,Risco Baixo,9.5,1.2,0.4479,0.955975
53,Genet,Risco Baixo,4.8,1.3,0.485226,0.483019
3,Guineapig,Risco Médio,7.4,0.8,0.503704,0.884048
55,Chimpanzee,Risco Baixo,8.3,1.4,0.522551,0.83522


Agora, vamos utilizar o método `.transform()` para simplificar o processo acima:

In [123]:
# Group mean of `Dreaming`, broadcast back to one value per original row.
(
    tb_animals
    .groupby('cat_risco')[["Dreaming"]]
    .transform('mean')
)
    

Unnamed: 0,Dreaming
0,1.588235
1,1.588235
2,1.588235
3,1.588235
4,1.588235
...,...
57,2.679167
58,2.679167
59,2.679167
60,2.679167


O método `.transform()` cria uma coluna contendo, por observação, o resultado da função de agregação sobre os grupos especificados no método `.groupby()`! Assim, não precisamos realizar a agregação em uma tabela separada e depois uni-la à tabela original para obter o mesmo resultado:

In [124]:
# Same broadcasted group mean, now for both sleep columns at once.
(
    tb_animals
    .groupby('cat_risco')[["Dreaming", "NonDreaming"]]
    .transform('mean')
)
    

Unnamed: 0,Dreaming,NonDreaming
0,1.588235,8.370588
1,1.588235,8.370588
2,1.588235,8.370588
3,1.588235,8.370588
4,1.588235,8.370588
...,...,...
57,2.679167,9.937500
58,2.679167,9.937500
59,2.679167,9.937500
60,2.679167,9.937500


In [125]:
# Store the transform result in two new columns. The right-hand side has one
# row per row of tb_animals, holding the mean of that row's `cat_risco` group
# for "Dreaming" and "NonDreaming" respectively.
tb_animals[["avg_dreaming_t", "avg_nondreaming_t"]] = (
    tb_animals
    .groupby('cat_risco')[["Dreaming", "NonDreaming"]]
    .transform('mean')
)

In [126]:
# Side-by-side check: merge-based averages next to the transform-based ones.
tb_animals[["Species", "avg_dreaming", "avg_nondreaming", "avg_dreaming_t", "avg_nondreaming_t"]]

Unnamed: 0,Species,avg_dreaming,avg_nondreaming,avg_dreaming_t,avg_nondreaming_t
0,Groundsquirrel,1.588235,8.370588,1.588235,8.370588
1,Lessershort-tailedshrew,1.588235,8.370588,1.588235,8.370588
2,Arcticgroundsquirrel,1.588235,8.370588,1.588235,8.370588
3,Guineapig,1.588235,8.370588,1.588235,8.370588
4,Mouse,1.588235,8.370588,1.588235,8.370588
...,...,...,...,...,...
57,Grayseal,2.679167,9.937500,2.679167,9.937500
58,Graywolf,2.679167,9.937500,2.679167,9.937500
59,Gorilla,2.679167,9.937500,2.679167,9.937500
60,Jaguar,2.679167,9.937500,2.679167,9.937500


# Bonus: 

## Correlation

*Touching statistics*

# Voltamos 10h55

In [127]:
# Pairwise correlation between the numeric columns only. tb_animals also holds
# text/categorical columns (e.g. `Species`, `cat_risco`); without
# `numeric_only=True` pandas 1.5 emits a FutureWarning and pandas >= 2.0 raises.
tb_animals.corr(numeric_only=True)

  tb_animals.corr()


Unnamed: 0,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,Danger,...,risco,flag_alto_risco,max_risco,risco_2,avg_dreaming,avg_nondreaming,rel_dreaming,rel_nondreaming,avg_dreaming_t,avg_nondreaming_t
BodyWt,1.0,0.934164,-0.375946,-0.109383,-0.307186,0.302451,0.651102,0.059495,0.338274,0.133581,...,0.197911,0.066044,0.261448,0.283205,-0.1492582,-0.08945973,0.002619082,-0.3594816,-0.1492582,-0.08945973
BrainWt,0.934164,1.0,-0.369218,-0.105139,-0.358102,0.509253,0.747242,0.033855,0.3678,0.145879,...,0.204463,0.139818,0.256504,0.284774,-0.148655,-0.09128231,-0.03707659,-0.3781892,-0.148655,-0.09128231
NonDreaming,-0.375946,-0.369218,1.0,0.514254,0.962715,-0.384432,-0.594703,-0.318185,-0.543757,-0.483852,...,-0.488397,-0.44392,-0.432609,-0.455507,0.4295286,0.4548739,0.3666823,0.8615049,0.4295286,0.4548739
Dreaming,-0.109383,-0.105139,0.514254,1.0,0.727087,-0.295745,-0.450899,-0.447471,-0.537225,-0.579337,...,-0.560928,-0.443143,-0.476951,-0.506142,0.5110514,0.4846969,0.8208227,0.3162501,0.5110514,0.4846969
TotalSleep,-0.307186,-0.358102,0.962715,0.727087,1.0,-0.410202,-0.631326,-0.395835,-0.642285,-0.587742,...,-0.592435,-0.521043,-0.494504,-0.53008,0.5323181,0.5546292,0.5545155,0.7893934,0.5323181,0.5546292
LifeSpan,0.302451,0.509253,-0.384432,-0.295745,-0.410202,1.0,0.614849,-0.102544,0.360352,0.061778,...,0.123375,0.229133,0.072078,0.117312,-0.06703034,-0.11032,-0.3065073,-0.4045496,-0.06703034,-0.11032
Gestation,0.651102,0.747242,-0.594703,-0.450899,-0.631326,0.614849,1.0,0.200504,0.638279,0.378617,...,0.449642,0.410179,0.407577,0.43518,-0.3852956,-0.4065527,-0.2984915,-0.522305,-0.3852956,-0.4065527
Predation,0.059495,0.033855,-0.318185,-0.447471,-0.395835,-0.102544,0.200504,1.0,0.618246,0.916042,...,0.91087,0.696052,0.905051,0.909189,-0.8856611,-0.8325245,0.0008516775,0.05449282,-0.8856611,-0.8325245
Exposure,0.338274,0.3678,-0.543757,-0.537225,-0.642285,0.360352,0.638279,0.618246,1.0,0.787203,...,0.87801,0.860177,0.778665,0.822943,-0.7474585,-0.7880362,-0.2043277,-0.2065342,-0.7474585,-0.7880362
Danger,0.133581,0.145879,-0.483852,-0.579337,-0.587742,0.061778,0.378617,0.916042,0.787203,1.0,...,0.975345,0.823986,0.879637,0.928268,-0.9292869,-0.8915859,-0.1487247,-0.1151935,-0.9292869,-0.8915859


In [128]:
# Inspect the first rows of tb_animals, including the newly created columns.
tb_animals.head()

Unnamed: 0,Species,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,...,flag_alto_risco_2,cat_peso,BodyWt_Interval,cat_risco,avg_dreaming,avg_nondreaming,rel_dreaming,rel_nondreaming,avg_dreaming_t,avg_nondreaming_t
0,Groundsquirrel,0.101,4.0,10.4,3.4,13.8,9.0,28.0,5,1,...,Alto Risco,Leves,Leves,Risco Médio,1.588235,8.370588,2.140741,1.242446,1.588235,8.370588
1,Lessershort-tailedshrew,0.005,0.14,7.7,1.4,9.1,2.6,21.5,5,2,...,Alto Risco,Leves,Leves,Risco Médio,1.588235,8.370588,0.881481,0.919888,1.588235,8.370588
2,Arcticgroundsquirrel,0.92,5.7,,,16.5,,25.0,5,2,...,Alto Risco,Leves-Médios,Leves-Médios,Risco Médio,1.588235,8.370588,,,1.588235,8.370588
3,Guineapig,1.04,5.5,7.4,0.8,8.2,7.6,68.0,5,3,...,Alto Risco,Leves-Médios,Leves-Médios,Risco Médio,1.588235,8.370588,0.503704,0.884048,1.588235,8.370588
4,Mouse,0.023,0.4,11.9,1.3,13.2,3.2,19.0,4,1,...,Baixo Risco,Leves,Leves,Risco Médio,1.588235,8.370588,0.818519,1.421644,1.588235,8.370588


In [129]:
# Boolean mask: True for the rows flagged as "Alto Risco".
mask = tb_animals['flag_alto_risco_2'].eq("Alto Risco")

# Keep only the rows where the mask is True.
tb_animals.loc[mask]

Unnamed: 0,Species,BodyWt,BrainWt,NonDreaming,Dreaming,TotalSleep,LifeSpan,Gestation,Predation,Exposure,...,flag_alto_risco_2,cat_peso,BodyWt_Interval,cat_risco,avg_dreaming,avg_nondreaming,rel_dreaming,rel_nondreaming,avg_dreaming_t,avg_nondreaming_t
0,Groundsquirrel,0.101,4.0,10.4,3.4,13.8,9.0,28.0,5,1,...,Alto Risco,Leves,Leves,Risco Médio,1.588235,8.370588,2.140741,1.242446,1.588235,8.370588
1,Lessershort-tailedshrew,0.005,0.14,7.7,1.4,9.1,2.6,21.5,5,2,...,Alto Risco,Leves,Leves,Risco Médio,1.588235,8.370588,0.881481,0.919888,1.588235,8.370588
2,Arcticgroundsquirrel,0.92,5.7,,,16.5,,25.0,5,2,...,Alto Risco,Leves-Médios,Leves-Médios,Risco Médio,1.588235,8.370588,,,1.588235,8.370588
3,Guineapig,1.04,5.5,7.4,0.8,8.2,7.6,68.0,5,3,...,Alto Risco,Leves-Médios,Leves-Médios,Risco Médio,1.588235,8.370588,0.503704,0.884048,1.588235,8.370588
5,Baboon,10.55,179.5,9.1,0.7,9.8,27.0,180.0,4,4,...,Alto Risco,Médios-Pesados,Médios-Pesados,Risco Médio,1.588235,8.370588,0.440741,1.08714,1.588235,8.370588
7,Patasmonkey,10.0,115.0,10.0,0.9,10.9,20.2,170.0,4,4,...,Alto Risco,Médios-Pesados,Médios-Pesados,Risco Médio,1.588235,8.370588,0.566667,1.194659,1.588235,8.370588
9,Pig,192.0,180.0,6.5,1.9,8.4,27.0,115.0,4,4,...,Alto Risco,Pesados,Pesados,Risco Médio,1.588235,8.370588,1.196296,0.776528,1.588235,8.370588
16,Asianelephant,2547.0,4603.0,2.1,1.8,3.9,69.0,624.0,3,5,...,Alto Risco,Pesados,Pesados,Risco Médio,1.588235,8.370588,1.133333,0.250878,1.588235,8.370588
17,Kangaroo,35.0,56.0,,,,16.3,33.0,3,5,...,Alto Risco,Médios-Pesados,Médios-Pesados,Risco Médio,1.588235,8.370588,,,1.588235,8.370588
18,Africanelephant,6654.0,5712.0,,,3.3,38.6,645.0,3,5,...,Alto Risco,Pesados,Pesados,Risco Médio,1.588235,8.370588,,,1.588235,8.370588


In [133]:
# `.loc[rows, column]`: species names of the rows selected by `mask`.
tb_animals.loc[mask, "Species"]

0              Groundsquirrel
1     Lessershort-tailedshrew
2        Arcticgroundsquirrel
3                   Guineapig
5                      Baboon
7                 Patasmonkey
9                         Pig
16              Asianelephant
17                   Kangaroo
18            Africanelephant
20                 Chinchilla
21                    Roedeer
22                     Rabbit
23                       Goat
24                      Sheep
25                     Donkey
26                      Okapi
27                    Giraffe
28                      Horse
29                        Cow
30             Braziliantapir
Name: Species, dtype: object

In [134]:
# List every column currently present in tb_animals.
tb_animals.columns

Index(['Species', 'BodyWt', 'BrainWt', 'NonDreaming', 'Dreaming', 'TotalSleep',
       'LifeSpan', 'Gestation', 'Predation', 'Exposure', 'Danger',
       'BrainWt_kg', 'ratio_brain_body', 'dreamers', 'lower_species',
       'id_primata', 'lista_primata', 'risco', 'flag_alto_risco', 'max_risco',
       'risco_2', 'flag_alto_risco_2', 'cat_peso', 'BodyWt_Interval',
       'cat_risco', 'avg_dreaming', 'avg_nondreaming', 'rel_dreaming',
       'rel_nondreaming', 'avg_dreaming_t', 'avg_nondreaming_t'],
      dtype='object')

In [135]:
# Build the "teste" column in two steps: rows selected by `mask` get 1,
# and the complementary rows (`~mask`) get 0, so every row ends up filled.
tb_animals.loc[mask, "teste"] = 1
tb_animals.loc[~mask, "teste"] = 0

In [141]:
# Cumulative BodyWt thresholds: every row matched by mask_100 is also matched
# by mask_500, and every row matched by mask_500 is also matched by mask_5000.
# NOTE(review): the suffixes 100/500/5000 presumably refer to grams while the
# comparisons use 0.100/0.500/5 (kg) — confirm against the dataset documentation.
mask_100 = tb_animals['BodyWt'] <= 0.100
mask_500 = tb_animals['BodyWt'] <= 0.500
mask_5000 = tb_animals['BodyWt'] <= 5

# Start every row at 0, then overwrite from the widest mask to the narrowest.
# The order matters: each later assignment replaces the earlier value on the
# rows where the masks overlap.
tb_animals['teste2'] = 0
tb_animals.loc[mask_5000, 'teste2'] = 1
tb_animals.loc[mask_500, 'teste2'] = 2
tb_animals.loc[mask_100, 'teste2'] = 3

In [None]:
# Same BodyWt buckets as `teste2`, but written with mutually exclusive
# conditions, so the order of the assignments no longer matters.
# The conditions must be built from `tb_animals`; the previous `tb` was never
# defined and raised a NameError.
tb_animals.loc[tb_animals['BodyWt'] > 5, 'teste3'] = 0
tb_animals.loc[(tb_animals['BodyWt'] <= 5) & (tb_animals['BodyWt'] > 0.500), 'teste3'] = 1
tb_animals.loc[(tb_animals['BodyWt'] <= 0.5) & (tb_animals['BodyWt'] > 0.1), 'teste3'] = 2
tb_animals.loc[tb_animals['BodyWt'] <= 0.1, 'teste3'] = 3

In [None]:
# General pattern (template, intentionally left commented out because the
# placeholders are not valid Python): build one boolean mask per outcome, then
# assign that outcome to the rows selected by the mask.
#
#   maskA -> result A
#
# maskA = <boolean condition on tb_animals>
# tb_animals.loc[maskA, 'nome_coluna'] = <value A>