<div style="background-color: #add8e6; padding: 10px; height: 70px; border-radius: 15px;">
    <div style="font-family: 'Georgia', serif; font-size: 20px; padding: 10px; text-align: right; position: absolute; right: 20px;">
        Mohammad Idrees Bhat <br>
        <span style="font-family: 'Arial', sans-serif;font-size: 12px; color: #0a0a0a;">Tech Skills Trainer | AI/ML Consultant</span> <!--- Mohammad Idrees Bhat | Tech Skills Trainer | AI/ML Consultant --->
    </div>
</div>

<!--- Mohammad Idrees Bhat | Tech Skills Trainer | AI/ML Consultant --->

<div style="background-color: #002147; padding: 10px; text-align: center; color: white; font-size: 32px; font-family: 'Arial', sans-serif;">
   Supervised Learning Algorithms comparisons <br>
    <h3 style="text-align: center; color: white; font-size: 15px; font-family: 'Arial', sans-serif;">Classification algorithms</h3>
</div>

### Applying Classification Algorithms to the Titanic Dataset 

In [59]:
# Import necessary libraries
import pandas as pd  # For data manipulation
import seaborn as sns  # For loading datasets

# Load the Titanic dataset from seaborn
data = sns.load_dataset('titanic')

In [60]:
# Display the first few rows of the dataset to understand its structure
print("Dataset Preview:")
print(data.head())

Dataset Preview:
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


## Data Preparation

In [61]:
# Drop columns that won't be used for prediction to simplify the dataset
data = data.drop(['embarked', 'class', 'who', 'adult_male', 'deck', 'alive', 'alone'], axis=1)


In [62]:
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town
0,0,3,male,22.0,1,0,7.25,Southampton
1,1,1,female,38.0,1,0,71.2833,Cherbourg
2,1,3,female,26.0,0,0,7.925,Southampton
3,1,1,female,35.0,1,0,53.1,Southampton
4,0,3,male,35.0,0,0,8.05,Southampton


#### Data Cleaning

In [63]:
# Handle missing values in 'age' and 'fare' columns by replacing them with the column mean
from sklearn.impute import SimpleImputer

In [64]:
# Impute missing values in 'age' and 'fare' columns separately

# Impute missing values in the 'age' column
age_imputer = SimpleImputer(strategy='mean')
data['age'] = age_imputer.fit_transform(data[['age']])

# Impute missing values in the 'fare' column
fare_imputer = SimpleImputer(strategy='mean')
data['fare'] = fare_imputer.fit_transform(data[['fare']])


In [65]:
# Drop rows with missing 'sex' data as this column is important for prediction
data = data.dropna(subset=['sex'])

##Data_Encoding##

In [66]:
# Encode 'sex' as a binary variable (0 for male, 1 for female)

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
data['sex'] = encoder.fit_transform(data['sex'])

In [67]:
# Encode 'Embarked' column using LabelEncoder
label_encoder = LabelEncoder()
data["embark_town"] = label_encoder.fit_transform(data["embark_town"])

In [68]:
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [69]:
# another way of encoding data

# get_dummies takes categorical columns and converts each unique category within those columns into separate binary (0 or 1) columns.

data = pd.get_dummies(data, columns=["sex", "embark_town"], drop_first=True)

In [70]:
data.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,sex_1,embark_town_1,embark_town_2,embark_town_3
0,0,3,22.0,1,0,7.25,True,False,True,False
1,1,1,38.0,1,0,71.2833,False,False,False,False
2,1,3,26.0,0,0,7.925,False,False,True,False
3,1,1,35.0,1,0,53.1,False,False,True,False
4,0,3,35.0,0,0,8.05,True,False,True,False


In [71]:
# Separate features (X) from the target variable (y)
X = data.drop(['survived'], axis=1) # drops the survived column
y = data['survived']

In [72]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [None]:
# Function to evaluate models and print accuracy and F1-score
from sklearn.metrics import accuracy_score, f1_score

In [74]:
def evaluate_model(model, X_test, y_test, y_pred):
    """
    Function to evaluate a model by printing accuracy and F1-score.
    """
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"{model} - Accuracy: {accuracy}")
    print(f"{model} - F1 Score: {f1}")
    return accuracy, f1

In [75]:
# Dictionary to store results for comparison at the end
results = {
    "Algorithm": [],
    "Accuracy": [],
    "F1 Score": []
}

In [76]:
results

{'Algorithm': [], 'Accuracy': [], 'F1 Score': []}

### Apply Algorithms ##

In [77]:
# 1. Logistic Regression
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression(max_iter=1000)

log_model.fit(X_train, y_train)

log_y_pred = log_model.predict(X_test)

log_accuracy, log_f1 = evaluate_model("Logistic Regression", X_test, y_test, log_y_pred)


results["Algorithm"].append("Logistic Regression")
results["Accuracy"].append(log_accuracy)
results["F1 Score"].append(log_f1)

Logistic Regression - Accuracy: 0.7988826815642458
Logistic Regression - F1 Score: 0.7391304347826085


In [78]:
# 2. k-Nearest Neighbors (k-NN)
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
knn_y_pred = knn_model.predict(X_test)
knn_accuracy, knn_f1 = evaluate_model("k-NN", X_test, y_test, knn_y_pred)
results["Algorithm"].append("k-NN")
results["Accuracy"].append(knn_accuracy)
results["F1 Score"].append(knn_f1)

k-NN - Accuracy: 0.7374301675977654
k-NN - F1 Score: 0.6239999999999999


In [79]:
# 3. Decision Tree
from sklearn.tree import DecisionTreeClassifier
tree_model = DecisionTreeClassifier(random_state=0)
tree_model.fit(X_train, y_train)
tree_y_pred = tree_model.predict(X_test)
tree_accuracy, tree_f1 = evaluate_model("Decision Tree", X_test, y_test, tree_y_pred)
results["Algorithm"].append("Decision Tree")
results["Accuracy"].append(tree_accuracy)
results["F1 Score"].append(tree_f1)

Decision Tree - Accuracy: 0.7821229050279329
Decision Tree - F1 Score: 0.7022900763358778


In [80]:
# 4. Support Vector Machine (SVM)
from sklearn.svm import SVC
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_y_pred = svm_model.predict(X_test)
svm_accuracy, svm_f1 = evaluate_model("SVM", X_test, y_test, svm_y_pred)
results["Algorithm"].append("SVM")
results["Accuracy"].append(svm_accuracy)
results["F1 Score"].append(svm_f1)

SVM - Accuracy: 0.7206703910614525
SVM - F1 Score: 0.5


In [81]:
# 5. Naive Bayes
from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_y_pred = nb_model.predict(X_test)
nb_accuracy, nb_f1 = evaluate_model("Naive Bayes", X_test, y_test, nb_y_pred)
results["Algorithm"].append("Naive Bayes")
results["Accuracy"].append(nb_accuracy)
results["F1 Score"].append(nb_f1)

Naive Bayes - Accuracy: 0.7094972067039106
Naive Bayes - F1 Score: 0.4090909090909091


In [82]:
# 6. Random Forest
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state=0)
rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)
rf_accuracy, rf_f1 = evaluate_model("Random Forest", X_test, y_test, rf_y_pred)
results["Algorithm"].append("Random Forest")
results["Accuracy"].append(rf_accuracy)
results["F1 Score"].append(rf_f1)

Random Forest - Accuracy: 0.8324022346368715
Random Forest - F1 Score: 0.7692307692307692


In [83]:
# Summarize results in a DataFrame for easy comparison
results_df = pd.DataFrame(results)
print("\nSummary of Model Performance:")
print(results_df)


Summary of Model Performance:
             Algorithm  Accuracy  F1 Score
0  Logistic Regression  0.798883  0.739130
1                 k-NN  0.737430  0.624000
2        Decision Tree  0.782123  0.702290
3                  SVM  0.720670  0.500000
4          Naive Bayes  0.709497  0.409091
5        Random Forest  0.832402  0.769231
