<a href="https://colab.research.google.com/github/sohaib-khan0/cs351-ai-lab-2022551/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load the dataset
# NOTE: `url` points at the Kaggle competition's data page rather than at a CSV
# file, so it is not passed to pd.read_csv. It is kept for reference and is
# quoted in the error message below.
url = 'https://www.kaggle.com/c/titanic/data'
try:
    titanic_data = pd.read_csv('train.csv')
except FileNotFoundError as err:
    # Re-raise the same exception type with instructions: the bare
    # "No such file or directory" gives no hint about where the data comes from.
    raise FileNotFoundError(
        "train.csv was not found in the current working directory. "
        f"Download it from {url} and place it next to this notebook "
        "(on Colab: upload it to the runtime) before running this cell."
    ) from err

# Display the first few rows of the dataset
print(titanic_data.head())

# Exploratory plots of the key features


def _survival_countplot(column, title, xlabel):
    """Draw one bar chart of passenger counts per value of `column`, split by 'Survived'."""
    plt.figure(figsize=(10, 5))
    sns.countplot(data=titanic_data, x=column, hue='Survived')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Count')
    plt.legend(title='Survived', loc='upper right', labels=['No', 'Yes'])
    plt.show()


# 1. Survival counts per ticket class
_survival_countplot('Pclass', 'Survival Count by Ticket Class', 'Passenger Class')

# 2. Histogram (with KDE curve) of the known ages; missing ages are dropped
known_ages = titanic_data['Age'].dropna()
plt.figure(figsize=(10, 5))
sns.histplot(known_ages, bins=30, kde=True)
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()

# 3. Survival counts per gender
_survival_countplot('Sex', 'Survival Count by Gender', 'Gender')

# Report how many values are missing in each column
missing_per_column = titanic_data.isnull().sum()
print(missing_per_column)

# Boxplot of Age, used to look for outliers
age_values = titanic_data['Age']
plt.figure(figsize=(10, 5))
sns.boxplot(x=age_values)
plt.title('Boxplot of Age')
plt.show()

# Data Preprocessing

# Handle missing values.
# The filled column is assigned back instead of calling
# titanic_data[col].fillna(..., inplace=True): that chained in-place form is
# deprecated in pandas 2.x and does not modify the frame under Copy-on-Write.
# Fill missing Age with median
titanic_data['Age'] = titanic_data['Age'].fillna(titanic_data['Age'].median())

# Fill missing Embarked with mode
titanic_data['Embarked'] = titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0])

# Encode categorical variables
titanic_data['Sex'] = titanic_data['Sex'].map({'male': 0, 'female': 1})
titanic_data['Embarked'] = titanic_data['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

# Prepare data for modeling
X = titanic_data[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']]
y = titanic_data['Survived']

# Split the dataset into training and testing sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize numerical features.
# The scaler is fit on the training split only and then applied to the test
# split, so the test rows do not influence the mean/variance used for scaling
# (fitting on the full frame before splitting leaks test-set statistics).
# As a consequence `titanic_data` and `X` keep Age/Fare in their original units;
# only X_train / X_test hold the standardized values.
# NOTE: the Age median and Embarked mode above are still computed on the full
# frame so that `titanic_data` itself stays free of missing values.
numeric_cols = ['Age', 'Fare']
# Work on copies so the assignments below do not write into slices of titanic_data.
X_train = X_train.copy()
X_test = X_test.copy()
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Part 2: Train the k-NN and Decision Tree classifiers

# k-NN with 5 neighbours: fit on the training split, then predict the test split
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Decision tree with a fixed random_state: same fit / predict sequence
dtree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dtree = dtree.predict(X_test)

# Model Evaluation
def evaluate_model(y_true, y_pred, model_name):
    """Print a short performance summary for one classifier.

    Prints a "<model_name> Performance:" header, then accuracy, precision,
    recall and F1 score (each formatted to two decimals), and finally the
    output of ``classification_report``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.
        model_name: Name shown in the header line.

    Returns:
        None. All results are written with ``print``.
    """
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    # Every score is computed before anything is printed, so a metric that
    # raises leaves no partial output behind.
    scores = [(label, scorer(y_true, y_pred)) for label, scorer in scorers]

    print(f"{model_name} Performance:")
    for label, value in scores:
        print(f"{label}: {value:.2f}")
    print(classification_report(y_true, y_pred))

# Report test-set metrics for each trained model, k-NN first
for model_label, test_predictions in (("k-NN", y_pred_knn), ("Decision Tree", y_pred_dtree)):
    evaluate_model(y_test, test_predictions, model_label)

# Part 3: Visualization

# 1. Decision Boundaries Visualization
def plot_decision_boundaries(X, y, model, title):
    """Plot the decision regions of a classifier over the first two columns of `X`.

    The regions are drawn by predicting every point of a dense grid spanning
    the two columns (padded by 1 on each side, step 0.01) and filling the
    predicted classes as contours; the samples are scattered on top, coloured
    by `y`.

    A grid point only has two coordinates, so the estimator doing the
    prediction must have been fit on exactly two features. If `model` was fit
    on a different number of features (or is not fitted), an unfitted clone
    with the same hyper-parameters is fit on the two plotted columns and used
    instead; `model` itself is never modified.

    Args:
        X: DataFrame whose first two columns are the plotted features.
        y: Class labels aligned with the rows of `X`.
        model: scikit-learn style classifier.
        title: Title of the figure.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Local import: this is the only place in the file that needs `clone`.
    from sklearn.base import clone

    features = X.iloc[:, :2]

    if getattr(model, 'n_features_in_', None) == 2:
        viz_model = model
    else:
        # Predicting 2-column grid points with a model fit on another number
        # of features raises a ValueError, so refit a copy on the plotted pair.
        viz_model = clone(model).fit(features, y)

    plt.figure(figsize=(10, 6))
    x_min, x_max = features.iloc[:, 0].min() - 1, features.iloc[:, 0].max() + 1
    y_min, y_max = features.iloc[:, 1].min() - 1, features.iloc[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01),
                         np.arange(y_min, y_max, 0.01))

    grid = np.c_[xx.ravel(), yy.ravel()]
    if hasattr(viz_model, 'feature_names_in_'):
        # Give the grid the column names seen at fit time so predict() does not
        # warn about missing feature names.
        grid = pd.DataFrame(grid, columns=viz_model.feature_names_in_)
    Z = viz_model.predict(grid).reshape(xx.shape)

    plt.contourf(xx, yy, Z, alpha=0.8)
    plt.scatter(features.iloc[:, 0], features.iloc[:, 1], c=y, edgecolors='k', marker='o')
    plt.title(title)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.show()

# Keep only 'Pclass' and 'Sex' so the boundaries can be drawn in two dimensions
X_viz = X[['Pclass', 'Sex']]

# Draw the decision boundaries of each classifier: k-NN first, then the Decision Tree
for classifier, plot_title in (
    (knn, 'k-NN Decision Boundaries'),
    (dtree, 'Decision Tree Decision Boundaries'),
):
    plot_decision_boundaries(X_viz, y, classifier, plot_title)

# 2. Performance Visualization
# Test-set predictions of each model, in the order they appear on the chart
predictions_by_model = {'k-NN': y_pred_knn, 'Decision Tree': y_pred_dtree}
metric_functions = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
}

# One list per metric, holding one score per model
metrics = {'Model': list(predictions_by_model)}
for metric_name, metric_fn in metric_functions.items():
    metrics[metric_name] = [
        metric_fn(y_test, predicted) for predicted in predictions_by_model.values()
    ]

# Rows are models, columns are metrics
metrics_df = pd.DataFrame(metrics).set_index('Model')

# Grouped bar chart comparing the two models on every metric
metrics_df.plot(kind='bar', figsize=(10, 6))
plt.title('Model Performance Comparison')
plt.ylabel('Score')
plt.xticks(rotation=0)
plt.ylim(0, 1)
plt.grid(axis='y')
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'