In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("data/data_titanic.csv")
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Total number of passengers

In [3]:
print("Total no. of passengers: ", df['PassengerId'].count())

Total no. of passengers:  891


## Different columns present in dataset

In [4]:
print(df.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


## Datatype of each column

In [5]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

## How many datapoints are missing in each columns

In [6]:
# For each particular column
print(df['PassengerId'].isnull().sum())
print(df['Cabin'].isnull().sum())

0
687


In [7]:
# For all columns
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

We could fill these values by using either mean, median or mode of all or certain range of values above and below the missing value.

## Maximum and Minimum age of passengers

In [8]:
print("Maximum age: ",df['Age'].max())
print("Minimum age: ",df['Age'].min())

Maximum age:  80.0
Minimum age:  0.42


In [9]:
min_age_rows = df[df['Age'] == 0.42]

# View the data with the minimum age
print("Data with the minimum age:")
print(min_age_rows)

Data with the minimum age:
     PassengerId  Survived  Pclass                             Name   Sex  \
803          804         1       3  Thomas, Master. Assad Alexander  male   

      Age  SibSp  Parch Ticket    Fare Cabin Embarked  
803  0.42      0      1   2625  8.5167   NaN        C  


## Minimum, Maximum and Average Fare

In [10]:
print("Maximum fare: ",df['Fare'].max())
print("Minimum fare: ",df['Fare'].min())
print("Average fare: ",df['Fare'].mean())

Maximum fare:  512.3292
Minimum fare:  0.0
Average fare:  32.204207968574636


## Approximate year in which the passengers were born
We obtain the approx. year by subtracting Age from Titanic Accident Year.
i.e, Titanic Accident Year (1912) - Age

In [11]:
approximate_year_born = 1912 - df['Age']
print(approximate_year_born)

0      1890.0
1      1874.0
2      1886.0
3      1877.0
4      1877.0
        ...  
886    1885.0
887    1893.0
888       NaN
889    1886.0
890    1880.0
Name: Age, Length: 891, dtype: float64


## Percentage of passengers survived

In [12]:
survived_passengers_count = df['Survived'].tolist().count(1)
total_passengers_count = df['PassengerId'].count()
print("Survived passengers : ", survived_passengers_count)
print("Total passengers : ", total_passengers_count)
percentage_of_passengers_survived = (survived_passengers_count/total_passengers_count)*100
print("Percentage of passengers survived : ", percentage_of_passengers_survived, "%")

Survived passengers :  342
Total passengers :  891
Percentage of passengers survived :  38.38383838383838 %


## How many male and female on board

In [13]:
print(f"No. of male : {df['Sex'].tolist().count('male')}")
print(f"No. of female : {df['Sex'].tolist().count('female')}")

No. of male : 577
No. of female : 314


## No. of passengers in each class

In [14]:
print(f"Passengers in each class:\n {df['Pclass'].value_counts()}")

Passengers in each class:
 Pclass
3    491
1    216
2    184
Name: count, dtype: int64


## Proportion of male to female passenger

In [15]:
no_of_male = df['Sex'].tolist().count('male')
no_of_female = df['Sex'].tolist().count('female')
print(f"Ratio of male to female passengers: {no_of_male/no_of_female}")

Ratio of male to female passengers: 1.8375796178343948


## Proportion of male to female based on survival

In [16]:
no_of_male_survived = df[df['Survived'] == 1]['Sex'].tolist().count('male')
print("No. of male survived: ",no_of_male_survived)
no_of_female_survived = df[df['Survived'] == 1]['Sex'].tolist().count('female')
print("No. of male survived: ",no_of_female_survived)
print("Ratio of male to female survived: ",no_of_male_survived/no_of_female_survived)

No. of male survived:  109
No. of male survived:  233
Ratio of male to female survived:  0.4678111587982833


## Chance of survival based on:

### Sex

In [17]:
# Calculate the total number of passengers by sex
passenger_count_by_sex = df['Sex'].value_counts()

# Calculate the number of survivors by sex
survivor_count_by_sex = df.groupby('Sex')['Survived'].sum()

# Calculate the chance of survival by sex
survival_chance_by_sex = (survivor_count_by_sex / passenger_count_by_sex) * 100

print("Chance of survival based on sex:")
print(survival_chance_by_sex)

Chance of survival based on sex:
Sex
female    74.203822
male      18.890815
dtype: float64


### Age Group

In [18]:
# Define age groups
age_groups = pd.cut(df['Age'], bins=[0, 16, 50, float('inf')], labels=['<16', '16-50', '50>'])

# Filter the dataset based on age groups and calculate the total number of passengers in each age group
passenger_count_by_age_group = df.groupby(age_groups).size()

# Filter the dataset to include only survivors and calculate the number of survivors in each age group
survivor_count_by_age_group = df[df['Survived'] == 1].groupby(age_groups).size()

# Calculate the chance of survival for each age group
survival_chance_by_age_group = (survivor_count_by_age_group / passenger_count_by_age_group) * 100

print("Chance of survival based on age group:")
print(survival_chance_by_age_group)

Chance of survival based on age group:
Age
<16      55.000000
16-50    38.727273
50>      34.375000
dtype: float64


### Passenger Class

In [19]:
passenger_count_by_class = df.groupby('Pclass').size()
print("Passenger count by class: ")
print(passenger_count_by_class)
passenger_survived_count_by_class = df[df['Survived'] == 1].groupby('Pclass').size()
print("Passenger Survived count by class: ")
print(passenger_survived_count_by_class)
print("Survival chance based on Passenger Class: ")
print((passenger_survived_count_by_class/passenger_count_by_class)*100)

Passenger count by class: 
Pclass
1    216
2    184
3    491
dtype: int64
Passenger Survived count by class: 
Pclass
1    136
2     87
3    119
dtype: int64
Survival chance based on Passenger Class: 
Pclass
1    62.962963
2    47.282609
3    24.236253
dtype: float64


### Combination of above

In [20]:
# Define the age groups
age_bins = [0, 16, 50, float('inf')]  # Define the age group boundaries
age_labels = ['<16', '16-50', '50>']   # Define the age group labels

# Bin the ages into age groups
df['AgeGroup'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)

# Group the DataFrame by 'AgeGroup', 'Sex', and 'Pclass', and calculate the count of passengers in each group
passenger_count_by_group = df.groupby(['AgeGroup', 'Sex', 'Pclass']).size().reset_index(name='PassengerCount')

# Group the DataFrame by 'AgeGroup', 'Sex', and 'Pclass', filter for survived passengers, and calculate the count of survivors in each group
passenger_survived_count_by_group = df[df['Survived'] == 1].groupby(['AgeGroup', 'Sex', 'Pclass']).size().reset_index(name='SurvivedCount')

# Merge the two DataFrames on 'AgeGroup', 'Sex', and 'Pclass'
combined_data = pd.merge(passenger_count_by_group, passenger_survived_count_by_group, on=['AgeGroup', 'Sex', 'Pclass'], how='left')

# Calculate the survival chance for each group
combined_data['SurvivalChance'] = (combined_data['SurvivedCount'] / combined_data['PassengerCount']) * 100

# Print the result
print(combined_data)

   AgeGroup     Sex  Pclass  PassengerCount  SurvivedCount  SurvivalChance
0       <16  female       1               3              2       66.666667
1       <16  female       2              10             10      100.000000
2       <16  female       3              30             16       53.333333
3       <16    male       1               3              3      100.000000
4       <16    male       2               9              9      100.000000
5       <16    male       3              28              9       32.142857
6     16-50  female       1              67             66       98.507463
7     16-50  female       2              58             53       91.379310
8     16-50  female       3              71             30       42.253521
9     16-50    male       1              69             31       44.927536
10    16-50    male       2              77              5        6.493506
11    16-50    male       3             215             29       13.488372
12      50>  female      

### Age Distribution per ticket

In [21]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,16-50
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,16-50
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,16-50
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,16-50
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,16-50


In [22]:
df.sample(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup
684,685,0,2,"Brown, Mr. Thomas William Solomon",male,60.0,1,1,29750,39.0,,S,50>
843,844,0,3,"Lemberopolous, Mr. Peter L",male,34.5,0,0,2683,6.4375,,C,16-50
42,43,0,3,"Kraeff, Mr. Theodor",male,,0,0,349253,7.8958,,C,


In [23]:
pd.pivot_table(
    df,
    values='PassengerId', # count by PassengerId
    index='Pclass',
    columns='AgeGroup',
    aggfunc = 'count',
    fill_value = 0,
)

AgeGroup,<16,16-50,50>
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,6,136,44
2,19,135,19
3,58,286,11


### Age Distribution per sex

In [24]:
pd.pivot_table(
    df,
    values = 'PassengerId',
    index = 'Sex',
    columns = 'AgeGroup',
    aggfunc = 'count',
    fill_value = 0,
)

AgeGroup,<16,16-50,50>
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,43,196,22
male,40,361,52


### Distribution of ticket price by class

In [25]:
df['Fare'].max()

512.3292

In [26]:
df['Fare'].min()

0.0

In [27]:
# This gives way too many columns, as you can see below.
pd.pivot_table(
    df,
    values = 'PassengerId',
    index = 'Pclass',
    columns = 'Fare',
    aggfunc = 'count',
    fill_value = 0,
)

Fare,0.0000,4.0125,5.0000,6.2375,6.4375,6.4500,6.4958,6.7500,6.8583,6.9500,...,153.4625,164.8667,211.3375,211.5000,221.7792,227.5250,247.5208,262.3750,263.0000,512.3292
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,0,1,0,0,0,0,0,0,0,...,3,2,3,1,1,4,2,2,4,3
2,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1,0,1,1,1,2,2,1,1,...,0,0,0,0,0,0,0,0,0,0


### So, we have to create groups of 'Fare' as we did with AgeGroups above.

In [30]:
fareBins = [0,170,340,513]
fareLabels = ['0-170', '170-340', '340-513']
# When right=False, the intervals will be left-inclusive and right-exclusive.
df['FareGroup'] = pd.cut(df['Fare'], bins=fareBins, labels=fareLabels, right = False)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup,FareGroup
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,16-50,0-170
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,16-50,0-170
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,16-50,0-170
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,16-50,0-170
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,16-50,0-170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,16-50,0-170
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,16-50,0-170
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,,0-170
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,16-50,0-170


In [31]:
pd.pivot_table(
    df,
    values = 'PassengerId',
    index = 'Pclass',
    columns = 'FareGroup',
    aggfunc = 'count',
    fill_value = 0,
)

FareGroup,0-170,170-340,340-513
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,196,17,3
2,184,0,0
3,491,0,0


## Curiosity

### Ticket class most of the largest families get

In [53]:
"""
df[(df['SibSp']+df['Parch']) == (df['SibSp']+df['Parch']).max()] gives dataframe of largest families. 

"""
dataframe_containing_largest_family_size = df[(df['SibSp']+df['Parch']) == (df['SibSp']+df['Parch']).max()]
print(dataframe_containing_largest_family_size)
mode_of_Pclass = dataframe_containing_largest_family_size['Pclass'].mode() # Returns Pandas Series
ticket_class_of_the_largest_families = mode_of_Pclass.iloc[-1] # Get last element from Pandas Series
print("Most of the largest families get Ticket of class: ",ticket_class_of_the_largest_families)

     PassengerId  Survived  Pclass                               Name     Sex  \
159          160         0       3         Sage, Master. Thomas Henry    male   
180          181         0       3       Sage, Miss. Constance Gladys  female   
201          202         0       3                Sage, Mr. Frederick    male   
324          325         0       3           Sage, Mr. George John Jr    male   
792          793         0       3            Sage, Miss. Stella Anna  female   
846          847         0       3           Sage, Mr. Douglas Bullen    male   
863          864         0       3  Sage, Miss. Dorothy Edith "Dolly"  female   

     Age  SibSp  Parch    Ticket   Fare Cabin Embarked AgeGroup FareGroup  
159  NaN      8      2  CA. 2343  69.55   NaN        S      NaN     0-170  
180  NaN      8      2  CA. 2343  69.55   NaN        S      NaN     0-170  
201  NaN      8      2  CA. 2343  69.55   NaN        S      NaN     0-170  
324  NaN      8      2  CA. 2343  69.55   NaN  

In [62]:
dataframe_of_female_passengers = df[df['Sex'] == 'female']
dataframe_of_solo_female_passengers = dataframe_of_female_passengers[(dataframe_of_female_passengers['SibSp'] == 0) & (dataframe_of_female_passengers['Parch'] == 0)]
dataframe_of_solo_female_passengers

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup,FareGroup
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,16-50,0-170
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S,50>,0-170
14,15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7.8542,,S,<16,0-170
15,16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",female,55.0,0,0,248706,16.0000,,S,50>,0-170
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C,,0-170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
862,863,1,1,"Swift, Mrs. Frederick Joel (Margaret Welles Ba...",female,48.0,0,0,17466,25.9292,D17,S,16-50,0-170
865,866,1,2,"Bystrom, Mrs. (Karolina)",female,42.0,0,0,236852,13.0000,,S,16-50,0-170
875,876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15.0,0,0,2667,7.2250,,C,<16,0-170
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S,16-50,0-170


In [72]:
print("Lowest proportion Ticket class:")
dataframe_of_solo_female_passengers['Pclass'].value_counts().idxmin()

Lowest proportion Ticket class:


2

## Oldest and Youngest Passenger

In [75]:
# Yougest Passenger
dataframe_of_youngest_passenger = df[df['Age'] == df['Age'].min()]
dataframe_of_youngest_passenger

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup,FareGroup
803,804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C,<16,0-170


In [76]:
# Oldest Passenger
dataframe_of_oldest_passenger = df[df['Age'] == df['Age'].max()]
dataframe_of_oldest_passenger

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup,FareGroup
630,631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0,A23,S,50>,0-170


## Oldest and Youngest to Survive

In [84]:
# Youngest Passenger to Survive
dataframe_of_survived = df[df['Survived'] == 1]
dataframe_of_survived[dataframe_of_survived['Age'] == dataframe_of_survived['Age'].min()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup,FareGroup
803,804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C,<16,0-170


In [85]:
# Oldest Passenger to Survive
dataframe_of_survived = df[df['Survived'] == 1]
dataframe_of_survived[dataframe_of_survived['Age'] == dataframe_of_survived['Age'].max()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup,FareGroup
630,631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0,A23,S,50>,0-170


## Oldest and Youngest to Die

In [83]:
# Youngest Passenger to Die
dataframe_of_dead = df[(df['Survived'] == 0)]
dataframe_of_dead[dataframe_of_dead['Age'] == dataframe_of_dead['Age'].min()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup,FareGroup
164,165,0,3,"Panula, Master. Eino Viljami",male,1.0,4,1,3101295,39.6875,,S,<16,0-170
386,387,0,3,"Goodwin, Master. Sidney Leonard",male,1.0,5,2,CA 2144,46.9,,S,<16,0-170


In [86]:
# Oldest Passenger to Die
dataframe_of_dead = df[(df['Survived'] == 0)]
dataframe_of_dead[dataframe_of_dead['Age'] == dataframe_of_dead['Age'].max()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup,FareGroup
851,852,0,3,"Svensson, Mr. Johan",male,74.0,0,0,347060,7.775,,S,50>,0-170


## Oldest and Youngest by Sex

In [87]:
# Oldest Male
dataframe_of_male = df[(df['Sex'] == 'male')]
dataframe_of_male[dataframe_of_male['Age'] == dataframe_of_male['Age'].max()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup,FareGroup
630,631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0,A23,S,50>,0-170


In [88]:
# Oldest Female
dataframe_of_female = df[(df['Sex'] == 'female')]
dataframe_of_female[dataframe_of_female['Age'] == dataframe_of_female['Age'].max()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup,FareGroup
275,276,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0,1,0,13502,77.9583,D7,S,50>,0-170
483,484,1,3,"Turkula, Mrs. (Hedwig)",female,63.0,0,0,4134,9.5875,,S,50>,0-170


In [89]:
# Youngest Male
dataframe_of_male = df[(df['Sex'] == 'male')]
dataframe_of_male[dataframe_of_male['Age'] == dataframe_of_male['Age'].min()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup,FareGroup
803,804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C,<16,0-170


In [93]:
# Youngest Female
dataframe_of_female = df[(df['Sex'] == 'female')]
dataframe_of_female[dataframe_of_female['Age'] == dataframe_of_female['Age'].min()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup,FareGroup
469,470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C,<16,0-170
644,645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C,<16,0-170


In [94]:
# Youngest Female to Die
dataframe_of_female = df[(df['Sex'] == 'female')]
dataframe_of_dead_female = dataframe_of_female[dataframe_of_female['Survived'] == 0]
dataframe_of_dead_female[dataframe_of_dead_female['Age'] == dataframe_of_dead_female['Age'].min()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup,FareGroup
119,120,0,3,"Andersson, Miss. Ellis Anna Maria",female,2.0,4,2,347082,31.275,,S,<16,0-170
205,206,0,3,"Strom, Miss. Telma Matilda",female,2.0,0,1,347054,10.4625,G6,S,<16,0-170
297,298,0,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,<16,0-170
642,643,0,3,"Skoog, Miss. Margit Elizabeth",female,2.0,3,2,347088,27.9,,S,<16,0-170


In [95]:
# Youngest Female to Survive
dataframe_of_female = df[(df['Sex'] == 'female')]
dataframe_of_dead_female = dataframe_of_female[dataframe_of_female['Survived'] == 1]
dataframe_of_dead_female[dataframe_of_dead_female['Age'] == dataframe_of_dead_female['Age'].min()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup,FareGroup
469,470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C,<16,0-170
644,645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C,<16,0-170


### How Many Doctors?

In [99]:
# Filter the dataset for passengers with the title "Dr."
doctors_on_board = df[df['Name'].str.contains('Dr.')]

print(f"Number of doctors on board: {doctors_on_board['PassengerId'].count()}")

Number of doctors on board: 11
