In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


## LOAD THE DATA

In [None]:
# Read the Titanic passenger table from the Kaggle input directory and preview
# its first rows.
titanic_csv_path = '/kaggle/input/titanic-dataset/Titanic-Dataset.csv'
df = pd.read_csv(titanic_csv_path)
df.head()

In [None]:
# Print the table dimensions first, then end the cell on the summary statistics.
# A notebook only renders a cell's last expression; with describe() placed
# before print() its result was computed and then silently discarded.
print(df.shape)
df.describe()

In [None]:
# Print pandas' concise summary of the frame, used here to inspect the columns
# before preprocessing.
df.info()

In [None]:
# Show the column labels of the freshly loaded table.
df.columns

### Dataset Columns Considered for Titanic Survival Prediction

- **PassengerId**: Unique identifier for each passenger
- **Pclass**: Ticket class (1st, 2nd, or 3rd)
- **Name**: Passenger's name
- **Sex**: Passenger's gender (male or female)
- **Age**: Passenger's age in years 
- **SibSp**: Number of siblings or spouses aboard the Titanic
- **Parch**: Number of parents or children aboard the Titanic
- **Ticket**: Ticket number
- **Fare**: Ticket fare
- **Cabin**: Cabin number
- **Embarked**: Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

## DATA PREPROCESSING

In [None]:
# Count the missing entries in each column (summing the null mask down the rows).
df.isnull().sum(axis=0)

In [None]:
# Missing values are handled by reassignment rather than `inplace=True` on a
# column selection. `df['col'].fillna(..., inplace=True)` acts on an
# intermediate object: pandas flags it as chained assignment, and with
# Copy-on-Write enabled it no longer updates `df` at all.

# Fill missing 'Age' with the median age
df['Age'] = df['Age'].fillna(df['Age'].median())

# Fill missing 'Embarked' with the mode
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Drop the 'Cabin' column as it has too many missing values
df = df.drop(columns=['Cabin'])

In [None]:
# Convert the categorical columns into numeric model inputs.

# 'Sex' becomes a binary code: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}
df['Sex'] = df['Sex'].map(sex_codes)

# 'Embarked' becomes indicator columns; drop_first=True leaves out the first
# category's column.
df = pd.get_dummies(data=df, columns=['Embarked'], drop_first=True)

In [None]:
# Preview the first ten rows after imputation and encoding.
df.head(n=10)

## FEATURE SELECTION

In [None]:
# Model inputs: the numeric and encoded passenger attributes.
features = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
    'Embarked_Q', 'Embarked_S',
]
X = df.loc[:, features]

# Target: the survival label.
y = df.loc[:, 'Survived']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Configure a forest of 100 trees with a fixed seed for reproducible fits.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**forest_params)

# Fit on the training split; kept as the cell's last expression so the notebook
# still displays the fitted estimator.
model.fit(X_train, y_train)


### We used a Random Forest in our Titanic mystery to help us predict who survived based on all the clues (features) we had about each passenger. It's like having a team of detectives with different skills working together to crack the case!

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Score the held-out split with the fitted model.
y_pred = model.predict(X_test)

# Compare predictions against the true labels.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Each heading is followed by its metric on the next line.
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:', conf_matrix, sep='\n')
print('Classification Report:', class_report, sep='\n')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pair each importance score from the fitted forest with its column name.
feature_importances = pd.Series(model.feature_importances_, index=X_train.columns)

# Horizontal bars, ordered by ascending importance.
ax = feature_importances.sort_values().plot.barh()
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importances')
plt.show()


### Based on my observations:

* Fare: The fare paid by the passenger was the most influential feature in predicting survival.
* Sex: Gender (male or female) was the second most important feature.
* Age: Age of the passenger also played a significant role.
* SibSp: The number of siblings/spouses aboard the Titanic had lesser importance compared to other features.

In [None]:
# Histogram of Age distribution
# NOTE: 'Age' was median-filled in the preprocessing cell, so dropna() removes
# nothing here and the imputed values are included in the counts.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['Age'].dropna(), bins=30, kde=True, color='blue', ax=ax)
ax.set_title('Distribution of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.show()

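### The histogram peaks in the 28-30 age range, suggesting that many passengers were young adults. Note that missing ages were filled with the median age during preprocessing, so part of this peak may be an artifact of the imputation rather than the true age distribution.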

In [None]:
# Age histogram on a wider canvas, with guide lines at ages 28 and 30.
df_age_valid = df.dropna(subset=['Age'])
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_age_valid['Age'], bins=30, kde=True, color='blue', ax=ax)

ax.set_title('Distribution of Passengers by Age')
ax.set_xlabel('Age')
ax.set_ylabel('Count')

# vertical lines for age group observation
ax.axvline(x=28, color='red', linestyle='--', linewidth=2, label='Age 28')
ax.axvline(x=30, color='green', linestyle='--', linewidth=2, label='Age 30')

ax.legend()
plt.show()

In [None]:
# Plot from a derived frame that carries readable gender labels. Previously this
# cell overwrote df['Sex'] with the strings 'Male'/'Female', leaving the shared
# frame without the 0/1 encoding that the feature-selection and training cells
# rely on if they are re-run.
sex_labels = {0: 'Male', 1: 'Female'}
survival_plot_df = df.assign(Sex=df['Sex'].replace(sex_labels))

fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Sex', hue='Survived', data=survival_plot_df, palette='Set1', ax=ax)
ax.set_title('Survival by Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
# Assumes the hue levels are drawn in the order 0, 1 so that the labels line up
# with "did not survive" / "survived" -- TODO confirm against the rendered plot.
ax.legend(['Did not survive', 'Survived'])
plt.show()


### This shows that during the Titanic incident women had a higher survival rate than men.

In [None]:
# Box plot of Fare by Passenger class
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x='Pclass', y='Fare', data=df, ax=ax)
ax.set_title('Fare Distribution by Passenger Class')
ax.set_xlabel('Pclass')
ax.set_ylabel('Fare')
plt.show()

#### The narrow width suggests that there is less variability in fares within Pclass 3. Most fares within this class are closer to the median fare, with fewer outliers or extreme values.
#### Possible Reasons:

#### Pclass 3 typically represents lower socioeconomic status passengers on the Titanic, who likely purchased tickets within a narrower price range. 
#### This could reflect a more uniform pricing strategy or a smaller range of accommodations available within this class.


ps: the outlier in the pclass was given during cross validation

In [None]:
import joblib
# Serialise the fitted classifier to disk so that it can be reloaded later.
joblib.dump(value=model, filename='best_model.pkl')

print("Model trained and saved as 'best_model.pkl'.")


**Interactive Prediction:** This part of the script defines functions for preprocessing input data (preprocess_input()) and interactively predicting survival (predict_survival_interactive()).
It asks the user to enter values for each attribute (Pclass, Sex, Age, etc.) then preprocesses the input data, and predicts whether the passenger survived based on the loaded model (best_model).


In [None]:
import pandas as pd
import joblib

def preprocess_input(new_data):
    """Return the input features in the form handed to the model.

    This is currently a pass-through placeholder: no transformation is applied
    and the very same object that was passed in is returned.
    """
    return new_data

# Function to interactively predict survival rate
def predict_survival_interactive(model):
    """Prompt for one passenger's details and print the model's survival prediction.

    The answers are encoded the same way as the training frame: 'Sex' as
    male -> 0 / female -> 1, and the port of embarkation as the two indicator
    columns 'Embarked_Q' and 'Embarked_S' (port 'C' sets both to 0).

    Parameters
    ----------
    model : object
        A fitted estimator exposing ``predict(X)``, trained on the columns
        Pclass, Sex, Age, SibSp, Parch, Fare, Embarked_Q, Embarked_S.

    Raises
    ------
    ValueError
        If a numeric answer cannot be parsed, or if the sex or port answer is
        not one of the accepted values.
    """
    # Must mirror the mapping applied to the training data. The previous
    # `1 if Sex == 'male' else 0` was inverted relative to training, so every
    # prediction was made for the opposite sex.
    sex_codes = {'male': 0, 'female': 1}
    valid_ports = ('C', 'Q', 'S')

    print("Please provide the following information:")

    # User input
    Pclass = int(input("Passenger class (1, 2, 3): "))

    # Normalise case and whitespace so that "Female" or " male " is accepted,
    # and reject anything else instead of silently encoding it as a valid sex.
    Sex = input("Sex (male or female): ").strip().lower()
    if Sex not in sex_codes:
        raise ValueError(f"Sex must be 'male' or 'female', got {Sex!r}")

    Age = float(input("Age (years): "))
    SibSp = int(input("Number of siblings/spouses aboard: "))
    Parch = int(input("Number of parents/children aboard: "))
    Fare = float(input("Fare: "))

    Embarked = input("Port of embarkation (C, Q, S): ").strip().upper()
    if Embarked not in valid_ports:
        raise ValueError(f"Port of embarkation must be one of C, Q, S, got {Embarked!r}")

    # Binary code for 'Sex', identical to the training encoding
    sex_encoded = sex_codes[Sex]

    # Indicator columns for the port of embarkation
    Embarked_Q = 1 if Embarked == 'Q' else 0
    Embarked_S = 1 if Embarked == 'S' else 0

    # Single-row frame with the columns in the order used for training
    new_passenger_data = pd.DataFrame({
        'Pclass': [Pclass],
        'Sex': [sex_encoded],
        'Age': [Age],
        'SibSp': [SibSp],
        'Parch': [Parch],
        'Fare': [Fare],
        'Embarked_Q': [Embarked_Q],
        'Embarked_S': [Embarked_S]
    })

    # Preprocess the input data
    X_new = preprocess_input(new_passenger_data)

    # Make predictions
    predictions = model.predict(X_new)

    # Print the prediction
    if predictions[0] == 1:
        print("Prediction: The passenger likely survived.")
    else:
        print("Prediction: The passenger likely did not survive.")

# Reload the classifier that was written to 'best_model.pkl' earlier in the
# notebook. joblib files are pickles, so only load files you trust.
best_model = joblib.load('best_model.pkl')

# Start the interactive prompt with the reloaded model.
predict_survival_interactive(model=best_model)


In [None]:
# Show the column labels of the frame as it stands at the end of the notebook.
df.columns