# Titanic Survival Prediction Project


In [3]:
import pandas as pd
import numpy as np

# Location of the Titanic CSV that this notebook analyses.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook anywhere else.
csv_path = 'C:/Users/Dell/Downloads/data science/DS_projects_codsoft/titanic-sunday.csv'
titanic_data = pd.read_csv(csv_path)

# Preview the first rows to confirm the file was parsed as expected.
first_rows = titanic_data.head()
print(first_rows)


   PassengerId  Survived  Pclass  \
0          892         0       3   
1          893         1       3   
2          894         0       2   
3          895         0       3   
4          896         1       3   

                                           Name     Sex   Age  SibSp  Parch  \
0                              Kelly, Mr. James    male  34.5      0      0   
1              Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
2                     Myles, Mr. Thomas Francis    male  62.0      0      0   
3                              Wirz, Mr. Albert    male  27.0      0      0   
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   

    Ticket     Fare Cabin Embarked  
0   330911   7.8292   NaN        Q  
1   363272   7.0000   NaN        S  
2   240276   9.6875   NaN        Q  
3   315154   8.6625   NaN        S  
4  3101298  12.2875   NaN        S  


In [6]:
# Count the missing entries in every column so the cleaning steps can be chosen.
missing_per_column = titanic_data.isna().sum()
print(missing_per_column)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64


In [9]:
# Fill missing 'Fare' values with the median fare.
# The filled column is assigned back instead of calling
# fillna(..., inplace=True) on the titanic_data['Fare'] selection: that chained
# in-place form acts on an intermediate object, emits a FutureWarning in
# pandas 2.x and leaves the DataFrame unchanged once Copy-on-Write is enabled.
titanic_data['Fare'] = titanic_data['Fare'].fillna(titanic_data['Fare'].median())

# Drop rows with missing 'Embarked' values
titanic_data = titanic_data.dropna(subset=['Embarked'])

print(titanic_data)


     PassengerId  Survived  Pclass  \
0            892         0       3   
1            893         1       3   
2            894         0       2   
3            895         0       3   
4            896         1       3   
..           ...       ...     ...   
413         1305         0       3   
414         1306         1       1   
415         1307         0       3   
416         1308         0       3   
417         1309         0       3   

                                             Name     Sex   Age  SibSp  Parch  \
0                                Kelly, Mr. James    male  34.5      0      0   
1                Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
2                       Myles, Mr. Thomas Francis    male  62.0      0      0   
3                                Wirz, Mr. Albert    male  27.0      0      0   
4    Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   
..                                            ...     ...

In [11]:
# Re-count the missing values per column to confirm the result of the cleaning.
# Rows without an 'Embarked' value are already removed by the cleaning cell
# above, so the drop is not repeated here.
titanic_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

In [12]:
# Convert 'Sex' to numeric: male = 1, female = 0
sex_codes = titanic_data['Sex'].map({'male': 1, 'female': 0})

# Convert 'Embarked' to numeric: S = 0, C = 1, Q = 2
embarked_codes = titanic_data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Series.map yields NaN for every value that is not a key of the mapping. That
# happens for unexpected categories and also when this cell is run a second
# time on the already-encoded columns. Check before touching titanic_data so a
# bad run fails loudly and leaves the frame intact instead of silently filling
# the columns with NaN.
if sex_codes.isna().any() or embarked_codes.isna().any():
    raise ValueError(
        "'Sex'/'Embarked' contain missing, unexpected or already-encoded values; "
        "reload the data before running this cell again."
    )

# Apply both encodings and drop unnecessary columns in a single step.
titanic_data = titanic_data.assign(Sex=sex_codes, Embarked=embarked_codes).drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin']
)
print(titanic_data)

     Survived  Pclass  Sex   Age  SibSp  Parch      Fare  Embarked
0           0       3    1  34.5      0      0    7.8292         2
1           1       3    0  47.0      1      0    7.0000         0
2           0       2    1  62.0      0      0    9.6875         2
3           0       3    1  27.0      0      0    8.6625         0
4           1       3    0  22.0      1      1   12.2875         0
..        ...     ...  ...   ...    ...    ...       ...       ...
413         0       3    1  27.0      0      0    8.0500         0
414         1       1    0  39.0      0      0  108.9000         1
415         0       3    1  38.5      0      0    7.2500         0
416         0       3    1  27.0      0      0    8.0500         0
417         0       3    1  27.0      1      1   22.3583         1

[418 rows x 8 columns]


In [19]:
def simple_prediction(row):
    """Predict survival for one passenger with a fixed two-part rule.

    Args:
        row: Mapping-like record (for example a DataFrame row) exposing the
            keys 'Sex' and 'Pclass'. 'Sex' is compared against 0, the code
            this notebook assigns to female passengers.

    Returns:
        1 when 'Sex' equals 0 or 'Pclass' equals 1, otherwise 0.
    """
    # 'Pclass' is only looked up when the passenger is not female.
    predicted_survivor = row['Sex'] == 0 or row['Pclass'] == 1
    return 1 if predicted_survivor else 0

In [25]:
# Apply the rule to every row and keep the result as a new 'Prediction' column.
row_predictions = titanic_data.apply(simple_prediction, axis=1)
titanic_data['Prediction'] = row_predictions

# Accuracy = percentage of rows whose prediction equals the recorded outcome.
# Note that it is measured on the same rows the rule was applied to.
is_correct = titanic_data['Prediction'] == titanic_data['Survived']
accuracy = is_correct.mean() * 100
print(f"\nSimple Prediction Model Accuracy: {accuracy:.2f}%")


Simple Prediction Model Accuracy: 86.36%


In [23]:
# Mean of 'Survived' within each group, first by encoded sex, then by class.
for heading, group_column in (
    ("\nSurvival Rate by Gender:", 'Sex'),
    ("\nSurvival Rate by Class:", 'Pclass'),
):
    print(heading)
    print(titanic_data.groupby(group_column)['Survived'].mean())


Survival Rate by Gender:
Sex
0    1.0
1    0.0
Name: Survived, dtype: float64

Survival Rate by Class:
Pclass
1    0.467290
2    0.322581
3    0.330275
Name: Survived, dtype: float64


In [24]:
# Share of passengers flagged as survivors, expressed as a percentage.
survived_flags = titanic_data['Survived']
overall_survival_rate = survived_flags.mean() * 100
print(f"\nOverall Survival Rate: {overall_survival_rate:.2f}%")

# Mean age within each outcome group, selected with boolean masks.
passenger_ages = titanic_data['Age']
survivors_age = passenger_ages[survived_flags == 1].mean()
non_survivors_age = passenger_ages[survived_flags == 0].mean()

print(f"\nAverage Age of Survivors: {survivors_age:.2f}")
print(f"Average Age of Non-Survivors: {non_survivors_age:.2f}")


Overall Survival Rate: 36.36%

Average Age of Survivors: 29.73
Average Age of Non-Survivors: 29.52
