# Algumas formas de usar filtros

In [2]:
import pandas as pd
import numpy as np
# Read the Titanic training data; the relative path assumes the CSV sits in the notebook's working directory.
df = pd.read_csv("titanic_train.csv")

## 1 - Usando Máscaras Booleanas (Boolean Mask)

Filtrando os casos em que a "Fare" é maior que a **média**:

In [3]:
# Boolean mask: keep only the rows whose fare is strictly greater than the mean fare.
df[df["Fare"].gt(df["Fare"].mean())]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
23,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5000,A6,S
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0000,C23 C25 C27,S
...,...,...,...,...,...,...,...,...,...,...,...,...
856,857,1,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45.0,1,1,36928,164.8667,,S
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
867,868,0,1,"Roebling, Mr. Washington Augustus II",male,31.0,0,0,PC 17590,50.4958,A24,S
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S


## 2 - loc[ ]

O método .loc[ ] pode ser usado para filtrar pelo critério e escolher as colunas que serão exibidas.   
Sintaxe: df.loc[ criterio, colunas ] 

In [4]:
# Column subset reused by the cells that follow.
colunas = ['PassengerId', 'Name', 'Pclass', 'Sex', 'Age', 'Fare', 'Survived']
# Row criterion: True where the fare is strictly above the mean fare.
criterio = df["Fare"].gt(df["Fare"].mean())
# Passing only the row criterion to .loc keeps every column.
df.loc[criterio]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
23,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5000,A6,S
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0000,C23 C25 C27,S
...,...,...,...,...,...,...,...,...,...,...,...,...
856,857,1,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45.0,1,1,36928,164.8667,,S
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
867,868,0,1,"Roebling, Mr. Washington Augustus II",male,31.0,0,0,PC 17590,50.4958,A24,S
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S


In [5]:
# Apply the row criterion and the column list together in a single .loc call.
df.loc[criterio, colunas]

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,Fare,Survived
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,71.2833,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,53.1000,1
6,7,"McCarthy, Mr. Timothy J",1,male,54.0,51.8625,0
23,24,"Sloper, Mr. William Thompson",1,male,28.0,35.5000,1
27,28,"Fortune, Mr. Charles Alexander",1,male,19.0,263.0000,0
...,...,...,...,...,...,...,...
856,857,"Wick, Mrs. George Dennick (Mary Hitchcock)",1,female,45.0,164.8667,1
863,864,"Sage, Miss. Dorothy Edith ""Dolly""",3,female,,69.5500,0
867,868,"Roebling, Mr. Washington Augustus II",1,male,31.0,50.4958,0
871,872,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",1,female,47.0,52.5542,1


In [6]:
def percent_surv_per_fare_range(x = 0, data = None):
    """Print the survival percentage of passengers whose fare exceeds a mean-based threshold.

    The threshold is ``mean + mean * x``: the mean "Fare" increased by ``x``
    times itself, so ``x = 0`` selects fares above the plain mean.

    Parameters
    ----------
    x : int or float, default 0
        How many times the mean fare is added on top of the mean.
    data : pandas.DataFrame, optional
        Frame providing the "Fare" and "Survived" columns. When omitted, the
        notebook-level ``df`` is used.

    Returns
    -------
    None
        The result is only printed.
    """
    if data is None:
        data = df  # fall back to the frame loaded at the top of the notebook
    fare_mean = data["Fare"].mean()  # computed once and reused for the threshold
    acima_do_limite = data["Fare"] > (fare_mean + (fare_mean * x))
    sobreviveu = data.loc[acima_do_limite, "Survived"]
    if sobreviveu.count() == 0:
        # Nothing to average: the mean would be NaN, which cannot be rounded
        # into a meaningful percentage, so report the empty selection instead.
        print(f'Para {x} vezes a quantidade da "Fare" média, nenhum passageiro foi encontrado')
        return
    print(f'Para {x} vezes a quantidade da "Fare" média, a média de sobreviventes é {round(sobreviveu.mean()*100)}%')

In [7]:
# x = 7: the fare threshold is mean + 7 * mean.
percent_surv_per_fare_range(7)

Para 7 vezes a quantidade da "Fare" média, a média de sobreviventes é 78%


## 3 - Query  
método .query( )  
Recebe a query sempre entre aspas, que podem ser simples ou duplas  
É possível usar comparações booleanas dentro da query e combiná-las com AND ( & ), OR ( | ), etc  
>é permitido **inplace = True**

In [15]:
# Keep only the rows whose Age is greater than 75; the condition is passed as a string expression.
df.query('Age > 75')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
630,631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0,A23,S


In [20]:
# (rows, columns) of the subset of passengers whose Survived value equals 1.
df.query("Survived == 1").shape

(342, 12)

## iloc[ ]  
Usa a posição inteira (começando em 0) para selecionar **linhas** e **colunas**  
Sintaxe:  df . iloc [ linhas , colunas ]

O iloc recebe sempre posições inteiras, independentemente dos rótulos do índice do DataFrame (para selecionar pelos rótulos, usa-se o loc)  
> Intervalo = 1 : 5 (o final do intervalo não é incluído)   
>>Para 2 ou mais posições avulsas, passo dentro de uma lista: [ 1 , 5 , ...]

In [24]:
# Position-based lookup: the row at position 45 and the column at position 3 (both counted from 0).
df.iloc[45, 3]

'Rogers, Mr. William John'

## .set_index()  
O método .set_index() faz com que uma dada coluna passe a ser usada como índice. Por padrão ele devolve um novo DataFrame, por isso o resultado é guardado em outra variável.

In [35]:
# Build a frame indexed by PassengerId and bind it to a new name; `df` keeps pointing at the original frame.
df_ind = df.set_index('PassengerId')

In [36]:
# Preview the first rows of the re-indexed frame.
df_ind.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [37]:
# Inspect the index object, which now carries the PassengerId values.
df_ind.index

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            882, 883, 884, 885, 886, 887, 888, 889, 890, 891],
           dtype='int64', name='PassengerId', length=891)

In [40]:
# iloc stays position-based after set_index: row position 45, columns from position 1 onward.
# The row's label in the output is 46 because the PassengerId labels start at 1.
df_ind.iloc[45, 1: ]

Pclass                             3
Name        Rogers, Mr. William John
Sex                             male
Age                              NaN
SibSp                              0
Parch                              0
Ticket               S.C./A.4. 23567
Fare                            8.05
Cabin                            NaN
Embarked                           S
Name: 46, dtype: object