In [4]:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from scipy import stats

# Load the 'titanic' example dataset through seaborn's dataset loader.
df = sns.load_dataset(name='titanic')

# Preview the first five rows as this cell's displayed output.
df.head(n=5)



Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
# Titanic survival pipeline: impute missing values, encode 'sex', drop
# duplicate rows and fare outliers, then fit and score a decision tree.
#
# NOTE: this cell reassigns `df`, and the outlier filter is recomputed from
# whatever `df` currently holds, so re-running this cell without re-running
# the load cell first will trim additional rows.

# Tunable constants used below.
FARE_Z_THRESHOLD = 3   # rows with |z-score of fare| >= this are dropped
TEST_FRACTION = 0.3    # share of rows held out for evaluation
RANDOM_STATE = 42      # shared seed for the split and the tree

# 1. Handling Missing Data
# Fill missing values in the 'age' column with the mean.
# The result is assigned back rather than using fillna(inplace=True) on the
# df['age'] selection: that chained in-place form is deprecated in pandas 2.x
# and, with Copy-on-Write, modifies a temporary object and leaves `df` as is.
# NOTE(review): the mean is computed over all rows, including rows that later
# land in the test split.
df['age'] = df['age'].fillna(df['age'].mean())

# Fill missing 'embarked' values with the most common value (mode)
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])

# Drop rows where 'fare' is missing, for simplicity
df = df.dropna(subset=['fare'])

print("\nData after filling missing values:")
print(df.isnull().sum())  # Check missing values

# 2. Replacing Values - Convert 'sex' column into numeric values (Male: 0, Female: 1)
# An explicit map + astype(int) avoids the implicit object->int downcast that
# Series.replace performs (deprecated in pandas 2.2); astype(int) also raises
# if any label failed to map. The dtype guard keeps a re-run from turning an
# already-encoded column into NaN.
if not pd.api.types.is_numeric_dtype(df['sex']):
    df['sex'] = df['sex'].map({'male': 0, 'female': 1}).astype(int)

print("\nData after replacing 'sex' values (Male: 0, Female: 1):")
print(df[['sex']].head())

# 3. Removing Duplicates - Check and drop duplicate rows if any
df = df.drop_duplicates()
print("\nData after removing duplicates:")
print(df.duplicated().sum())  # Check for duplicates

# 4. Detecting and Removing Outliers using Z-score method on 'fare' column
z_scores = np.abs(stats.zscore(df['fare']))
df = df[z_scores < FARE_Z_THRESHOLD]  # Keeping only rows where Z-score is less than 3

print("\nData after removing outliers:")
print(df.shape)

# 5. Prepare Data for Decision Tree
# Selecting relevant columns for features (X) and target (y)
X = df[['pclass', 'age', 'fare', 'sex']]  # Features
y = df['survived']  # Target (Survival: 1 = Survived, 0 = Did not survive)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=RANDOM_STATE
)

# Initialize and train a Decision Tree Classifier.
# random_state is fixed so the fitted tree, and therefore the metrics printed
# below, are the same on every run.
clf = DecisionTreeClassifier(random_state=RANDOM_STATE)
clf.fit(X_train, y_train)

# Predictions on the test set
y_pred = clf.predict(X_test)

# 6. Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"\nAccuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")



Data after filling missing values:
survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         0
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

Data after replacing 'sex' values (Male: 0, Female: 1):
   sex
0    0
1    1
2    1
3    1
4    0

Data after removing duplicates:
0

Data after removing outliers:
(764, 15)

Accuracy: 73.48%
Precision: 68.97%
Recall: 63.83%
