In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [2]:
# Load dataset
df = pd.read_csv("/content/titanic.csv")


In [3]:
# Display basic info
print(df.head())
print(df.isnull().sum())

# Drop unnecessary columns
df.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1, inplace=True)

# Fill missing values
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


   PassengerId  Survived  Pclass  \
0          892         0       3   
1          893         1       3   
2          894         0       2   
3          895         0       3   
4          896         1       3   

                                           Name     Sex   Age  SibSp  Parch  \
0                              Kelly, Mr. James    male  34.5      0      0   
1              Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
2                     Myles, Mr. Thomas Francis    male  62.0      0      0   
3                              Wirz, Mr. Albert    male  27.0      0      0   
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   

    Ticket     Fare Cabin Embarked  
0   330911   7.8292   NaN        Q  
1   363272   7.0000   NaN        S  
2   240276   9.6875   NaN        Q  
3   315154   8.6625   NaN        S  
4  3101298  12.2875   NaN        S  
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex       

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


In [11]:
# Encode categorical variables
le_sex = LabelEncoder()
df['Sex'] = le_sex.fit_transform(df['Sex'])  # male=1, female=0

le_embarked = LabelEncoder()
df['Embarked'] = le_embarked.fit_transform(df['Embarked'])  # S=2, C=0, Q=1

In [6]:
# Define features and target
X = df.drop("Survived", axis=1)
y = df["Survived"]

# Check for missing values in X
print("\nMissing values in X after preprocessing:")
print(X.isnull().sum())

# Drop rows with any missing values in X
X = X.dropna()
y = y[X.index]  # Ensure y aligns with the dropped rows in X

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Logistic Regression model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Missing values in X after preprocessing:
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        1
Embarked    0
dtype: int64
Accuracy: 1.0
Confusion Matrix:
 [[50  0]
 [ 0 34]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        34

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84



In [7]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation accuracy
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy: {:.2f}%".format(cv_scores.mean() * 100))


Cross-Validation Accuracy: 100.00%


In [30]:
# Step 5: Prediction Function (Based on your data format)
def predict_passenger():

    pclass = int(input("Pclass (1/2/3): "))
    sex = input("Sex (male/female): ").strip().lower()
    age = float(input("Age (in years): "))
    sibsp = int(input("SibSp (siblings/spouse aboard): "))
    parch = int(input("Parch (parents/children aboard): "))
    fare = float(input("Fare (ticket price): "))
    embarked = input("Embarked (C/Q/S): ").strip().upper()

    # Prepare input in correct order
    input_data = pd.DataFrame([[pclass, sex, age, sibsp, parch, fare, embarked]],
                              columns=["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"])

    # Encode input
    input_data["Sex"] = le_sex.transform(input_data["Sex"])
    input_data["Embarked"] = le_embarked.transform(input_data["Embarked"])

    # Prediction
    pred = model.predict(input_data)[0]
    print("\n🎯 Prediction:", "🟢 Survived" if pred == 1 else "🔴 Not Survived")

# ▶️ Call prediction system
predict_passenger()

Pclass (1/2/3): 3
Sex (male/female): 1
Age (in years): 34.5
SibSp (siblings/spouse aboard): 0
Parch (parents/children aboard): 0
Fare (ticket price): 7.8292
Embarked (C/Q/S): 1

🎯 Prediction: 🔴 Not Survived
