# Titanic Survival Prediction using Logistic Regression

## 1. Load and Explore the Data

In [1]:

import pandas as pd

# Read the training set from train.csv in the current working directory.
train_data = pd.read_csv('train.csv')

# Preview the first five rows to see which columns are present.
print(train_data.head())

# Count missing values per column to decide what needs imputing or dropping.
print(train_data.isnull().sum())

# Summary statistics for the numeric columns.
print(train_data.describe())

# Column dtypes and non-null counts. DataFrame.info() writes its report to
# stdout itself and returns None, so it is called directly; wrapping it in
# print() would append a stray "None" line to the output.
train_data.info()


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
Pa

## 2. Data Preprocessing

In [2]:

# Handle missing values: Age gets the column median, Embarked gets its most
# frequent value. The filled frame is assigned back rather than calling
# fillna(inplace=True) on a column selection, because that form operates on an
# intermediate object: it raises a FutureWarning on recent pandas and leaves
# train_data unchanged once Copy-on-Write is enabled.
age_median = train_data['Age'].median()
embarked_mode = train_data['Embarked'].mode()[0]
train_data = train_data.fillna({'Age': age_median, 'Embarked': embarked_mode})

# Cabin is dropped outright instead of being imputed.
train_data = train_data.drop(columns=['Cabin'])

# Convert categorical variables into numerical format using one-hot encoding.
# drop_first=True omits one level per feature, so Sex becomes a single Sex_male
# column and Embarked becomes Embarked_Q / Embarked_S.
train_data = pd.get_dummies(train_data, columns=['Sex', 'Embarked'], drop_first=True)

# Drop the remaining columns that are not used as model features.
train_data = train_data.drop(columns=['Name', 'Ticket', 'PassengerId'])

print(train_data.head())


   Survived  Pclass   Age  SibSp  Parch     Fare  Sex_male  Embarked_Q  \
0         0       3  22.0      1      0   7.2500      True       False   
1         1       1  38.0      1      0  71.2833     False       False   
2         1       3  26.0      0      0   7.9250     False       False   
3         1       1  35.0      1      0  53.1000     False       False   
4         0       3  35.0      0      0   8.0500      True       False   

   Embarked_S  
0        True  
1       False  
2        True  
3        True  
4        True  


## 3. Define Features and Target Variable

In [3]:

# Separate the label from the predictors: y is the Survived column, X is every
# other column of the preprocessed frame.
target_column = 'Survived'
y = train_data[target_column]
X = train_data.drop(columns=[target_column])


## 4. Split the Data

In [4]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## 5. Build and Train the Model

In [5]:

from sklearn.linear_model import LogisticRegression

# Logistic regression classifier with the solver's iteration cap set to 1000.
model = LogisticRegression(max_iter=1000)

# Fit on the training portion only; the test rows stay unseen until evaluation.
model.fit(X=X_train, y=y_train)


## 6. Make Predictions

In [6]:

# Predict a class label for every held-out row.
y_pred = model.predict(X=X_test)


## 7. Evaluate the Model

In [7]:

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Compare the held-out labels with the predictions using three summaries:
# overall accuracy, the confusion matrix, and the per-class report.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report each result under its heading; sep='\n' puts the heading and the
# value on separate lines.
print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:', conf_matrix, sep='\n')
print('Classification Report:', class_report, sep='\n')


Accuracy: 0.81
Confusion Matrix:
[[90 15]
 [19 55]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



## 8. Interpret the Results

In [8]:

# Tabulate the fitted coefficients, one row per feature column of X.
# model.coef_[0] selects the first row of the coefficient array
# (presumably the only row for this two-class target; confirm if classes change).
coefficients = pd.DataFrame({'Coefficient': model.coef_[0]}, index=X.columns)
print(coefficients)

# TODO: Discuss model performance and potential improvements


            Coefficient
Pclass        -0.937706
Age           -0.030593
SibSp         -0.295676
Parch         -0.109326
Fare           0.002588
Sex_male      -2.592511
Embarked_Q    -0.113704
Embarked_S    -0.399716
