# MACHINE LEARNING WORKSHOP

### Titanic Survival Prediction

In [None]:
# STEP 1: Import libraries
# Numerics / data handling, plotting, and the scikit-learn pieces used below
# (splitting, the random-forest model, and evaluation metrics).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
# NOTE: with no category given, this silences every warning raised after this
# point, including deprecation/future warnings from the libraries above.
warnings.filterwarnings('ignore')

In [None]:
# STEP 2: Import dataset
# Read the Titanic CSV straight from its raw GitHub URL into a DataFrame.
file_path = 'https://raw.githubusercontent.com/shobith-s/ML_WORKSHOP/main/Titanic-Dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Report (rows, columns), then preview the first five rows as the cell output.
print(f"Data shape: {data.shape}")
data.head(5)

In [None]:
# STEP 3: Explore data
# Per-column count of missing entries, to see which columns need cleaning.
print("Missing values in each column:")
print(data.isnull().sum())

print("\nData info:")
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called on its own; wrapping it in print() would append a stray "None" line.
data.info()

# Summary statistics for the numeric columns.
print("\nData description:")
print(data.describe())

In [None]:
# Visualize Survival by Sex
# One bar per 'Survived' value, split into side-by-side bars by 'Sex'.
sex_axes = sns.countplot(data=data, x='Survived', hue='Sex')
sex_axes.set_title('Survival Count by Sex')
plt.show()

In [None]:
# Visualize Survival by Passenger Class
# One bar per 'Survived' value, split into side-by-side bars by 'Pclass'.
pclass_axes = sns.countplot(data=data, x='Survived', hue='Pclass')
pclass_axes.set_title('Survival Count by Passenger Class')
plt.show()

In [None]:
# STEP 4: Data Cleaning
# Fill missing ages with the median age and missing embarkation ports with the
# most frequent port.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# data['Age'] / data['Embarked']: the in-place form mutates a chained selection,
# which triggers a FutureWarning on pandas 2.x and does not modify `data` at all
# once Copy-on-Write is in effect, silently leaving the NaNs in place.
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
data.drop('Cabin', axis=1, inplace=True)  # Dropping 'Cabin' due to many missing values

# Confirm no missing values remain
print("Missing values after cleaning:")
print(data.isnull().sum())

In [None]:
# STEP 5: Feature Engineering
# One-hot encode the categorical columns (dropping the first level of each),
# then remove the columns treated as irrelevant for modelling.
data = pd.get_dummies(
    data,
    columns=['Sex', 'Embarked'],
    drop_first=True,
).drop(columns=['PassengerId', 'Name', 'Ticket'])

data.head()

In [None]:
# STEP 6: Train-Test Split
# 'Survived' is the target; every remaining column is a feature.
y = data['Survived']
X = data.drop(columns='Survived')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Training set: {X_train.shape}, Testing set: {X_test.shape}")

In [None]:
# STEP 7: Model Training
# Build a random forest with a fixed seed and fit it on the training split
# (fit() returns the estimator itself, so construction and fitting are chained).
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out test rows.
y_pred = rf.predict(X_test)

In [None]:
# STEP 8: Model Evaluation
# Overall accuracy followed by the per-class precision/recall/F1 report.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix drawn as an annotated heatmap with integer cell labels.
cm = confusion_matrix(y_test, y_pred)
cm_axes = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
cm_axes.set_title('Confusion Matrix')
plt.show()

In [None]:
# STEP 9: Feature Importance
# Label the fitted forest's importances with the feature column names and
# order them from most to least important.
importances = pd.Series(
    rf.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)

# Bar chart of the ranked importances.
importance_axes = importances.plot.bar()
importance_axes.set_title('Feature Importances')
plt.show()