<a href="https://colab.research.google.com/github/rasiq-gulzar/Encryptix/blob/main/titanic_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd              # For data manipulation and analysis
import numpy as np               # For numerical operations
import seaborn as sns            # For statistical data visualization
import matplotlib.pyplot as plt  # For creating plots and visualizations
from sklearn.model_selection import train_test_split  # To split data into training and testing sets
from sklearn.preprocessing import LabelEncoder         # To encode categorical variables into numbers
from sklearn.impute import SimpleImputer               # To handle missing values in the dataset
from sklearn.linear_model import LogisticRegression    # The classification algorithm we'll use
from sklearn.metrics import accuracy_score, classification_report  # For evaluating model performance

# Location of the raw Titanic CSV. This is the Colab upload path used by this
# notebook; adjust it here (and only here) when running in another environment.
DATA_PATH = '/content/Titanic-Dataset (1).csv'

# Columns kept for modelling: the target ('Survived') followed by the six predictors.
# Everything else in the file (e.g. Name, Ticket, Cabin) is dropped.
SELECTED_COLUMNS = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']

# Read the file and keep only the selected columns.
# The explicit .copy() makes `df` an independent frame, so the later column
# assignments (df['Sex'] = ..., df['Age'] = ...) write to `df` itself and do not
# raise pandas' SettingWithCopyWarning about modifying a slice of another frame.
df = pd.read_csv(DATA_PATH)[SELECTED_COLUMNS].copy()

# Convert the categorical 'Sex' column to integer codes.
# LabelEncoder assigns codes in sorted order of the labels it sees, so with the
# labels 'female' and 'male' the result is female -> 0, male -> 1.
encoder = LabelEncoder()
df['Sex'] = encoder.fit_transform(df['Sex'])

# Separate the predictors (X) from the target (y).
X = df.drop("Survived", axis=1)  # Features: every column except Survived
y = df["Survived"]               # Target: the column we want to predict

# Hold out 20% of the rows for testing.
# random_state=0 makes the split reproducible (same rows every run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Work on independent copies so the Age assignments below modify the split
# frames themselves rather than slices derived from X.
X_train = X_train.copy()
X_test = X_test.copy()

# Fill missing Age values with the mean age.
# The imputer is fitted on the TRAINING rows only and then applied unchanged to
# the test rows. Fitting it on the full dataset before splitting (as was done
# previously) lets test-set ages influence the fill value, which leaks
# information from the test set into training and biases the evaluation.
# Note: `df` and `X` intentionally keep their missing Age values; only the
# split frames used for modelling are imputed.
imputer = SimpleImputer(strategy="mean")
X_train['Age'] = imputer.fit_transform(X_train[['Age']])  # learn the mean from train, fill train
X_test['Age'] = imputer.transform(X_test[['Age']])        # reuse the train mean for test

# Fit a logistic-regression classifier on the training split.
# The target is binary (survived / did not survive), which is the case this
# estimator is designed for. fit() returns the fitted estimator, so creating
# and training the model chain into a single statement.
model = LogisticRegression().fit(X_train, y_train)

# Predict a survival label for every row of the held-out test split.
y_pred = model.predict(X_test)

# Overall accuracy: the fraction of test rows whose predicted label matches
# the true label, reported to two decimal places.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class breakdown: precision, recall, f1-score and support for each of
# the two classes (0 = did not survive, 1 = survived).
report = classification_report(y_test, y_pred)
print(report)

# Build a handful of example passengers to show how the trained model is used
# on new data. Column names and order match the training features.
# 'Sex' is written as text and converted with the SAME fitted `encoder` used
# for the training data, instead of hard-coding 0/1. That keeps the demo in
# sync with the training encoding and raises an error on an unknown label
# rather than silently feeding the model a wrong code.
sample_sex = ['male', 'female', 'female', 'male', 'female']
sample_data = pd.DataFrame({
    'Pclass': [3, 1, 3, 1, 2],                # Ticket class (1=1st, 2=2nd, 3=3rd)
    'Sex': encoder.transform(sample_sex),     # Encoded gender (female=0, male=1)
    'Age': [22, 38, 26, 35, 28],              # Age in years
    'SibSp': [1, 1, 0, 1, 0],                 # Number of siblings/spouses aboard
    'Parch': [0, 0, 0, 1, 0],                 # Number of parents/children aboard
    'Fare': [7.25, 71.28, 8.05, 53.1, 13]     # Ticket fare
})

# Predict survival for each sample passenger and print one label per row.
sample_predictions = model.predict(sample_data)
print("Predictions for sample input (1=Survived, 0=Not Survived):", sample_predictions)

Accuracy: 0.80
              precision    recall  f1-score   support

           0       0.82      0.85      0.84       110
           1       0.75      0.71      0.73        69

    accuracy                           0.80       179
   macro avg       0.79      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179

Predictions for sample input (1=Survived, 0=Not Survived): [0 1 1 0 1]
