In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
# Read the Titanic passenger records from the CSV file in the working directory
# into a DataFrame that the rest of the notebook operates on.
titanic_df = pd.read_csv(filepath_or_buffer='titanic.csv')

In [3]:
# Structure check: show the first five rows so the column names and value
# formats are visible.
print(titanic_df.head(n=5))

# Missing-data check: count the null entries in every column to see which
# columns will need cleaning or imputation.
print(titanic_df.isna().sum())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
Pa

In [4]:
# Drop columns that are not used as model features.
# PassengerId, Name, Ticket, and Cabin are treated as not useful for prediction here.
titanic_df = titanic_df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Fill missing values in the 'Age' column with the median age of passengers.
# The filled Series is assigned back to the column rather than calling
# `titanic_df['Age'].fillna(..., inplace=True)`: an in-place method on a
# selected column is chained assignment, which recent pandas 2.x releases warn
# about and which, under Copy-on-Write (the default in pandas 3.0), only
# modifies a temporary object and leaves `titanic_df` unchanged.
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].median())

# Fill missing values in the 'Embarked' column with the mode (most frequent value),
# again by assignment so the DataFrame itself is updated.
titanic_df['Embarked'] = titanic_df['Embarked'].fillna(titanic_df['Embarked'].mode()[0])

# NOTE(review): the median and mode are computed on the full dataset, before
# the train/test split performed in a later cell — consider deriving them from
# the training split only if leakage matters for this analysis.

In [5]:
# Turn the two text-valued columns into integer codes so the model can use them.
# LabelEncoder numbers the classes in sorted order, giving:
#   Sex:      Female=0, Male=1
#   Embarked: C=0, Q=1, S=2
# The same encoder object is re-fitted for each column, so after this cell it
# holds the classes of the last column processed ('Embarked').
label_encoder = LabelEncoder()
for column in ('Sex', 'Embarked'):
    titanic_df[column] = label_encoder.fit_transform(titanic_df[column])

# Separate the target from the features:
# y is the 'Survived' column, X is every remaining column.
y = titanic_df['Survived']
X = titanic_df.drop(columns=['Survived'])

In [6]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# The split is stratified on y and uses a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Standardize the features to zero mean and unit standard deviation.
# The scaler learns its statistics from the training split only and then
# applies that same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [7]:
# Fit a logistic regression classifier (a linear model for binary targets such
# as survived / did not survive) on the standardized training features.
model = LogisticRegression(random_state=42).fit(X_train, y_train)

# Predict the survival label for every passenger in the held-out test set.
y_pred = model.predict(X_test)

# Report accuracy: the fraction of test-set predictions that match the true labels.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy: 0.7947761194029851
