# Dataframe Basics II

## Filtering dataframes with one condition

In [1]:
import pandas as pd

In [2]:
# Load the Titanic passenger table; the path is relative to the working directory.
titanic = pd.read_csv(filepath_or_buffer="titanic.csv")

In [3]:
# Preview the first five rows.
titanic.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [4]:
# First ten values of the 'sex' column.
titanic["sex"].head(n=10)

0      male
1    female
2    female
3    female
4      male
5      male
6      male
7      male
8    female
9    female
Name: sex, dtype: object

In [6]:
# Element-wise comparison: the result is a boolean pandas Series, one entry per row.
titanic["sex"].eq("male")

0       True
1      False
2      False
3      False
4       True
       ...  
886     True
887    False
888    False
889     True
890     True
Name: sex, Length: 891, dtype: bool

In [11]:
# Select the rows where sex == 'male', then take the 'fare' column from that result.
# Two successive [] look-ups like this are called "chained indexing"; it is shown
# here for illustration only and is not recommended.
titanic[titanic.sex == "male"]["fare"]

0       7.2500
4       8.0500
5       8.4583
6      51.8625
7      21.0750
        ...   
883    10.5000
884     7.0500
886    13.0000
889    30.0000
890     7.7500
Name: fare, Length: 577, dtype: float64

In [10]:
# Recommended form: one .loc call taking the row mask and the column label together.
# Returns the 'fare' values of the male rows.
titanic.loc[titanic["sex"] == "male", "fare"]

0       7.2500
4       8.0500
5       8.4583
6      51.8625
7      21.0750
        ...   
883    10.5000
884     7.0500
886    13.0000
889    30.0000
890     7.7500
Name: fare, Length: 577, dtype: float64

In [12]:
# Boolean row mask: True where the passenger is male.
mask1 = titanic["sex"].eq('male')
mask1

0       True
1      False
2      False
3      False
4       True
       ...  
886     True
887    False
888    False
889     True
890     True
Name: sex, Length: 891, dtype: bool

In [14]:
# Keep every column, but only the rows flagged by mask1.
titanic_male = titanic.loc[mask1, :]

In [15]:
# Preview the first five rows of the filtered frame.
titanic_male.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
4,0,3,male,35.0,0,0,8.05,S,
5,0,3,male,,0,0,8.4583,Q,
6,0,1,male,54.0,0,0,51.8625,S,E
7,0,3,male,2.0,3,1,21.075,S,


In [16]:
# Show the dtype of every column (a Series indexed by column name).
titanic.dtypes

survived      int64
pclass        int64
sex          object
age         float64
sibsp         int64
parch         int64
fare        float64
embarked     object
deck         object
dtype: object

In [17]:
# Boolean column mask: True for columns whose dtype is object.
mask2 = titanic.dtypes.eq(object)
mask2

survived    False
pclass      False
sex          True
age         False
sibsp       False
parch       False
fare        False
embarked     True
deck         True
dtype: bool

In [18]:
# ~ negates mask2, so this keeps every non-object column (here: the numeric ones).
non_object_cols = ~mask2
titanic.loc[:, non_object_cols]

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
0,0,3,22.0,1,0,7.2500
1,1,1,38.0,1,0,71.2833
2,1,3,26.0,0,0,7.9250
3,1,1,35.0,1,0,53.1000
4,0,3,35.0,0,0,8.0500
...,...,...,...,...,...,...
886,0,2,27.0,0,0,13.0000
887,1,1,19.0,0,0,30.0000
888,0,3,,1,2,23.4500
889,1,1,26.0,0,0,30.0000


In [20]:
# Apply the row mask and the negated column mask in a single .loc call.
row_sel, col_sel = mask1, ~mask2
titanic.loc[row_sel, col_sel]

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
0,0,3,22.0,1,0,7.2500
4,0,3,35.0,0,0,8.0500
5,0,3,,0,0,8.4583
6,0,1,54.0,0,0,51.8625
7,0,3,2.0,3,1,21.0750
...,...,...,...,...,...,...
883,0,2,28.0,0,0,10.5000
884,0,3,25.0,0,0,7.0500
886,0,2,27.0,0,0,13.0000
889,1,1,26.0,0,0,30.0000


## Filtering dataframes with multiple conditions (AND)

### Checking the hypothesis that being an adult male on the Titanic was bad for your survival

### Conclusion: True

In [21]:
import pandas as pd

In [22]:
# Load the Titanic passenger table for this section; path is relative to the working directory.
titanic = pd.read_csv(filepath_or_buffer="titanic.csv")

In [23]:
# Preview the first five rows.
titanic.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [24]:
# First condition: the passenger is male.
mask1 = titanic["sex"].eq('male')
mask1.head(n=5)

0     True
1    False
2    False
3    False
4     True
Name: sex, dtype: bool

In [25]:
# Second condition: age strictly greater than 14.
# Rows with a missing age compare as False, so they are excluded.
mask2 = titanic["age"].gt(14)
mask2.head(n=5)

0    True
1    True
2    True
3    True
4    True
Name: age, dtype: bool

In [26]:
# Element-wise AND: True only where both conditions hold.
both_conditions = mask1 & mask2
both_conditions.head(n=5)

0     True
1    False
2    False
3    False
4     True
dtype: bool

In [29]:
# Rows satisfying both masks (male AND age > 14), restricted to four columns.
selected_columns = ['survived', 'pclass', 'sex', 'age']
male_surv = titanic.loc[mask1 & mask2, selected_columns]
male_surv.head(n=20)

Unnamed: 0,survived,pclass,sex,age
0,0,3,male,22.0
4,0,3,male,35.0
6,0,1,male,54.0
12,0,3,male,20.0
13,0,3,male,39.0
20,0,2,male,35.0
21,1,2,male,34.0
23,1,1,male,28.0
27,0,1,male,19.0
30,0,1,male,40.0


In [30]:
# Print row count, per-column non-null counts and dtypes of the filtered frame.
male_surv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 414 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  414 non-null    int64  
 1   pclass    414 non-null    int64  
 2   sex       414 non-null    object 
 3   age       414 non-null    float64
dtypes: float64(1), int64(2), object(1)
memory usage: 16.2+ KB


In [33]:
# Summary statistics. The mean of 'survived' (a 0/1 column) is the survival rate:
# about 0.17, i.e. only ~17% of the males older than 14 survived.
male_surv.describe()

Unnamed: 0,survived,pclass,age
count,414.0,414.0,414.0
mean,0.173913,2.309179,33.129227
std,0.379493,0.829868,12.922177
min,0.0,1.0,15.0
25%,0.0,2.0,23.0
50%,0.0,3.0,30.0
75%,0.0,3.0,40.0
max,1.0,3.0,80.0


In [34]:
# Baseline for comparison: across all passengers the mean of 'survived'
# is about 0.38, i.e. ~38% survived overall.
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


## Filtering dataframes with multiple conditions (OR)

### Checking whether being a woman or a child improved the chance of survival

### Conclusion: True

In [36]:
import pandas as pd

In [37]:
# Load the Titanic passenger table for this section; path is relative to the working directory.
titanic = pd.read_csv(filepath_or_buffer="titanic.csv")

In [38]:
# Preview the first five rows.
titanic.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [39]:
# First condition: the passenger is female.
mask1 = titanic["sex"].eq('female')

In [40]:
# Second condition: age strictly less than 14 (missing ages compare as False).
# NOTE(review): the "adult male" section filters on age > 14, so passengers aged
# exactly 14 match neither that mask nor this one — confirm whether <= 14 was intended.
mask2 = titanic["age"].lt(14)

In [42]:
# Element-wise OR: keep rows matching either mask (female OR age < 14),
# restricted to four columns.
either_condition = mask1 | mask2
wom_or_chil = titanic.loc[either_condition, ['survived', 'pclass', 'sex', 'age']]

In [44]:
# Preview the first five rows of the filtered frame.
wom_or_chil.head(n=5)

Unnamed: 0,survived,pclass,sex,age
1,1,1,female,38.0
2,1,3,female,26.0
3,1,1,female,35.0
7,0,3,male,2.0
8,1,3,female,27.0


In [47]:
# Print row count, per-column non-null counts and dtypes of the filtered frame.
# 'age' can have fewer non-null entries than rows: rows selected via the
# female mask may have a missing age.
wom_or_chil.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 351 entries, 1 to 888
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  351 non-null    int64  
 1   pclass    351 non-null    int64  
 2   sex       351 non-null    object 
 3   age       298 non-null    float64
dtypes: float64(1), int64(2), object(1)
memory usage: 13.7+ KB


In [49]:
# Summary statistics. The mean of 'survived' (a 0/1 column) is the survival rate:
# about 0.72, i.e. ~72% of the women-or-children group survived.
wom_or_chil.describe()

Unnamed: 0,survived,pclass,age
count,351.0,351.0,298.0
mean,0.723647,2.205128,25.039161
std,0.447832,0.847232,15.314631
min,0.0,1.0,0.42
25%,0.0,1.0,14.125
50%,1.0,2.0,24.0
75%,1.0,3.0,35.0
max,1.0,3.0,63.0
