In [2]:
# Read the Titanic data from a CSV file located in the working directory
import pandas as pd
titanic_df = pd.read_csv('titanic.csv')

# Report the dataset dimensions: length of the row index and of the column index
num_rows = len(titanic_df.index)
num_columns = len(titanic_df.columns)
print("Number of rows:", num_rows)
print("Number of columns:", num_columns)


Number of rows: 891
Number of columns: 12


In [3]:
# Mean of the 'Age' column; pandas skips missing ages by default
age_column = titanic_df['Age']
average_age = age_column.mean()
print("Average Age:", average_age)


Average Age: 29.69911764705882


In [4]:
# Count the number of survivors (Survived=1) and non-survivors (Survived=0).
# Both figures are derived from the 'Survived' column itself, so this cell does
# not rely on a row count computed in a different cell, and rows with a missing
# 'Survived' value are not silently counted as non-survivors.
survived_flags = titanic_df['Survived']
survivors = survived_flags.sum()                # sum of 0/1 flags = number of 1s
non_survivors = (survived_flags == 0).sum()     # explicit count of the 0 flags
print("Survived:", survivors)
print("Did not survive:", non_survivors)


Survived: 342
Did not survive: 549


In [5]:
# Share of each 'Sex' value among all passengers, expressed as a percentage:
# normalize=True yields fractions that sum to 1, which are then scaled by 100
gender_fraction = titanic_df['Sex'].value_counts(normalize=True)
gender_percentage = gender_fraction.mul(100)
print(gender_percentage)


male      64.758698
female    35.241302
Name: Sex, dtype: float64


In [6]:
# Average age within each passenger class (missing ages are skipped)
class_age_avg = titanic_df.groupby('Pclass')['Age'].agg('mean')
print(class_age_avg)


Pclass
1    38.233441
2    29.877630
3    25.140620
Name: Age, dtype: float64


In [7]:
# Average fare within each passenger class
class_fare_avg = (
    titanic_df
    .groupby('Pclass')['Fare']
    .mean()
)
print(class_fare_avg)


Pclass
1    84.154687
2    20.662183
3    13.675550
Name: Fare, dtype: float64


In [8]:
# Number of survivors per port of embarkation (sum of the 0/1 'Survived' flags).
# Rows with a missing 'Embarked' value are left out by groupby's default, so
# these counts can add up to less than the overall number of survivors.
survivors_by_port = titanic_df['Survived'].groupby(titanic_df['Embarked']).sum()
print(survivors_by_port)


Embarked
C     93
Q     30
S    217
Name: Survived, dtype: int64


In [9]:
# Look up the full records of the oldest and youngest passengers.
# idxmax/idxmin give the index label of the largest/smallest age (missing ages
# are ignored); .loc then fetches the whole row for that label.
age_column = titanic_df['Age']
oldest_label = age_column.idxmax()
youngest_label = age_column.idxmin()
oldest_passenger = titanic_df.loc[oldest_label]
youngest_passenger = titanic_df.loc[youngest_label]
print("Oldest Passenger:\n", oldest_passenger)
print("Youngest Passenger:\n", youngest_passenger)


Oldest Passenger:
 PassengerId                                     631
Survived                                          1
Pclass                                            1
Name           Barkworth, Mr. Algernon Henry Wilson
Sex                                            male
Age                                            80.0
SibSp                                             0
Parch                                             0
Ticket                                        27042
Fare                                           30.0
Cabin                                           A23
Embarked                                          S
Name: 630, dtype: object
Youngest Passenger:
 PassengerId                                804
Survived                                     1
Pclass                                       3
Name           Thomas, Master. Assad Alexander
Sex                                       male
Age                                       0.42
SibSp                        

In [10]:
# Count passengers with at least one sibling/spouse aboard (SibSp > 0) and with
# at least one parent/child aboard (Parch > 0). Summing a boolean mask counts
# its True entries; int() keeps the result a plain Python integer.
with_sibsp = int(titanic_df['SibSp'].gt(0).sum())
with_parch = int(titanic_df['Parch'].gt(0).sum())
print("Passengers with SibSp:", with_sibsp)
print("Passengers with Parch:", with_parch)


Passengers with SibSp: 283
Passengers with Parch: 213


In [11]:
# Define age groups. pd.cut uses right-closed intervals by default, so together
# with include_lowest=True the groups are:
#   Child   -> [0, 18]
#   Adult   -> (18, 60]
#   Elderly -> (60, 150]
age_bins = [0, 18, 60, 150]
age_labels = ["Child", "Adult", "Elderly"]

# Create a new column 'AgeGroup' to categorize passengers. include_lowest=True
# keeps an age of exactly 0 in the "Child" group instead of leaving it
# unassigned; passengers with a missing age remain unassigned (NaN).
titanic_df['AgeGroup'] = pd.cut(
    titanic_df['Age'],
    bins=age_bins,
    labels=age_labels,
    include_lowest=True,
)

# Calculate survival rates for each age group: 'Survived' is a 0/1 flag, so the
# group mean is the fraction that survived. 'AgeGroup' is categorical, and the
# default for `observed` differs between pandas versions; passing it explicitly
# keeps every label in the result regardless of the installed version.
age_group_survival = titanic_df.groupby('AgeGroup', observed=False)['Survived'].mean()
print(age_group_survival)


AgeGroup
Child      0.503597
Adult      0.388788
Elderly    0.227273
Name: Survived, dtype: float64


In [12]:
# Flag passengers travelling alone: no siblings/spouses and no parents/children
# aboard, i.e. SibSp + Parch equals 0. Stored as the boolean column 'TravelAlone'.
titanic_df['TravelAlone'] = titanic_df['SibSp'].add(titanic_df['Parch']).eq(0)

# Survival rate (mean of the 0/1 'Survived' flag) for solo travellers and for
# everyone else, selected with the flag and its negation
is_alone = titanic_df['TravelAlone']
survival_alone = titanic_df.loc[is_alone, 'Survived'].mean()
survival_with_family = titanic_df.loc[~is_alone, 'Survived'].mean()
print("Survival Rate for Traveling Alone:", survival_alone)
print("Survival Rate for Traveling with Family:", survival_with_family)


Survival Rate for Traveling Alone: 0.30353817504655495
Survival Rate for Traveling with Family: 0.5056497175141242


In [13]:
# Survival rate for every (port of embarkation, passenger class) combination;
# the result is indexed by both grouping columns
embark_class_survival = (
    titanic_df
    .groupby(['Embarked', 'Pclass'])['Survived']
    .mean()
)
print(embark_class_survival)


Embarked  Pclass
C         1         0.694118
          2         0.529412
          3         0.378788
Q         1         0.500000
          2         0.666667
          3         0.375000
S         1         0.582677
          2         0.463415
          3         0.189802
Name: Survived, dtype: float64


In [14]:
# Correlation between age and fare. Series.corr computes the Pearson
# coefficient by default and ignores rows where either value is missing.
age_column = titanic_df['Age']
fare_column = titanic_df['Fare']
age_fare_corr = age_column.corr(fare_column)
print("Correlation between Age and Fare:", age_fare_corr)


Correlation between Age and Fare: 0.0960666917690389


In [15]:
# Most frequent value in 'Embarked'. mode() returns a Series because several
# values may tie for most frequent; the first entry is the one reported.
embarked_modes = titanic_df['Embarked'].mode()
most_common_embarkation_port = embarked_modes.iloc[0]
print("Most Common Embarkation Port:", most_common_embarkation_port)


Most Common Embarkation Port: S


In [16]:
# Number of passengers whose SibSp value exceeds 1 (more than one sibling or
# spouse aboard); summing the boolean mask counts the matching rows
has_multiple_sibsp = titanic_df['SibSp'].gt(1)
with_multiple_sibsp = int(has_multiple_sibsp.sum())
print("Passengers with SibSp > 1:", with_multiple_sibsp)


Passengers with SibSp > 1: 74


In [17]:
# Passengers with at least one relative aboard (SibSp > 0 or Parch > 0),
# counted per passenger class; value_counts orders classes by descending count
has_family = titanic_df['SibSp'].gt(0) | titanic_df['Parch'].gt(0)
with_family_by_class = titanic_df.loc[has_family, 'Pclass'].value_counts()
print(with_family_by_class)


3    167
1    107
2     80
Name: Pclass, dtype: int64
