In [None]:
'''Use Naive bayes, K-nearest, and Decision tree classification algorithms to build
classifiers on any two datasets. Pre-process the datasets using techniques specified in
Q2. Compare the Accuracy, Precision, Recall and F1 measure reported for each dataset
using the abovementioned classifiers under the following situations:
i. Using Holdout method (Random sampling):
a) Training set = 80% Test set = 20%
b) Training set = 66.6% (2/3rd of total), Test set = 33.3%
ii. Using Cross-Validation:
a) 10-fold
b) 5-fold'''

In [24]:
#iris dataset

In [25]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Binarizer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Standardize the data
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

# Normalize the data
normalizer = MinMaxScaler()
X_normalized = normalizer.fit_transform(X_standardized)

# Discretize the data
binarizer = Binarizer(threshold=0.5)
X_binarized = binarizer.fit_transform(X_normalized)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_binarized, y, test_size=0.3, random_state=42)

# Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_test)
print("Naive Bayes Accuracy on Iris:", accuracy_score(y_test, nb_pred))

# K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
print("K-Nearest Neighbors Accuracy on Iris:", accuracy_score(y_test, knn_pred))

# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
print("Decision Tree Accuracy on Iris:", accuracy_score(y_test, dt_pred))


Naive Bayes Accuracy on Iris: 0.7555555555555555
K-Nearest Neighbors Accuracy on Iris: 0.6888888888888889
Decision Tree Accuracy on Iris: 0.7555555555555555


In [26]:
# Display metrics for iris
print("metric for iris")
metrics_iris = pd.DataFrame({
    'Algorithm': ['Naive Bayes', 'K-Nearest Neighbors', 'Decision Tree'],
    'Accuracy': [nb_metrics_iris[0], knn_metrics_iris[0], dt_metrics_iris[0]],
    'Precision': [nb_metrics_iris[1], knn_metrics_iris[1], dt_metrics_iris[1]],
    'Recall': [nb_metrics_iris[2], knn_metrics_iris[2], dt_metrics_iris[2]],
    'F1 Score': [nb_metrics_iris[3], knn_metrics_iris[3], dt_metrics_iris[3]]
})



print("Metrics for Iris Dataset:")
print(metrics_iris)

metric for iris
Metrics for Iris Dataset:
             Algorithm  Accuracy  Precision    Recall  F1 Score
0          Naive Bayes  0.755556   0.832371  0.755556  0.689280
1  K-Nearest Neighbors  0.688889   0.743965  0.688889  0.699352
2        Decision Tree  0.755556   0.832371  0.755556  0.689280


In [27]:
#### titanic dataset

In [28]:
import seaborn as sns
from sklearn.impute import SimpleImputer

# Load Titanic dataset
df = sns.load_dataset("titanic")

# Drop rows with missing 'embarked' values
df.dropna(subset=['embarked'], inplace=True)

# Impute missing 'age' values with mean
imputer = SimpleImputer(strategy='mean')
df['age'] = imputer.fit_transform(df[['age']])

# Convert categorical columns to numeric
df['sex'] = df['sex'].map({'male': 0, 'female': 1})
df = pd.get_dummies(df, columns=['embarked'], drop_first=True)

# Select features and target
X = df[['age', 'fare', 'sex', 'embarked_Q', 'embarked_S']]
y = df['survived']

# Standardize the data
X_standardized = scaler.fit_transform(X)

# Normalize the data
X_normalized = normalizer.fit_transform(X_standardized)

# Discretize the data
X_binarized = binarizer.fit_transform(X_normalized)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_binarized, y, test_size=0.3, random_state=42)

# Naive Bayes
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_test)
print("Naive Bayes Accuracy on Titanic:", accuracy_score(y_test, nb_pred))

# K-Nearest Neighbors
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
print("K-Nearest Neighbors Accuracy on Titanic:", accuracy_score(y_test, knn_pred))

# Decision Tree
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
print("Decision Tree Accuracy on Titanic:", accuracy_score(y_test, dt_pred))


Naive Bayes Accuracy on Titanic: 0.7940074906367042
K-Nearest Neighbors Accuracy on Titanic: 0.7940074906367042
Decision Tree Accuracy on Titanic: 0.7940074906367042


In [29]:
# Display metrics for titanic dataset

print("metric for titanic")

# Naive Bayes
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_test)
nb_metrics_titanic = evaluate_model(y_test, nb_pred)

# K-Nearest Neighbors
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
knn_metrics_titanic = evaluate_model(y_test, knn_pred)

# Decision Tree
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
dt_metrics_titanic = evaluate_model(y_test, dt_pred)


# Now display metrics for Titanic dataset
print("\nMetrics for Titanic Dataset:")
metrics_titanic = pd.DataFrame({
    'Algorithm': ['Naive Bayes', 'K-Nearest Neighbors', 'Decision Tree'],
    'Accuracy': [nb_metrics_titanic[0], knn_metrics_titanic[0], dt_metrics_titanic[0]],
    'Precision': [nb_metrics_titanic[1], knn_metrics_titanic[1], dt_metrics_titanic[1]],
    'Recall': [nb_metrics_titanic[2], knn_metrics_titanic[2], dt_metrics_titanic[2]],
    'F1 Score': [nb_metrics_titanic[3], knn_metrics_titanic[3], dt_metrics_titanic[3]]
})
print(metrics_titanic)


metric for titanic

Metrics for Titanic Dataset:
             Algorithm  Accuracy  Precision    Recall  F1 Score
0          Naive Bayes  0.794007   0.794437  0.794007  0.794211
1  K-Nearest Neighbors  0.794007   0.794437  0.794007  0.794211
2        Decision Tree  0.794007   0.794437  0.794007  0.794211


In [None]:
### iris using holdout

In [30]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Binarizer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Standardize the data
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

# Normalize the data
normalizer = MinMaxScaler()
X_normalized = normalizer.fit_transform(X_standardized)

# Discretize the data
binarizer = Binarizer(threshold=0.5)
X_binarized = binarizer.fit_transform(X_normalized)

# Function to train and evaluate models
def evaluate_models(X_train, X_test, y_train, y_test):
    results = {}
    
    # Naive Bayes
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    nb_pred = nb.predict(X_test)
    results['Naive Bayes'] = {
        'Accuracy': accuracy_score(y_test, nb_pred),
        'Precision': precision_score(y_test, nb_pred, average='weighted'),
        'Recall': recall_score(y_test, nb_pred, average='weighted'),
        'F1 Score': f1_score(y_test, nb_pred, average='weighted')
    }
    
    # K-Nearest Neighbors
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)
    knn_pred = knn.predict(X_test)
    results['K-Nearest Neighbors'] = {
        'Accuracy': accuracy_score(y_test, knn_pred),
        'Precision': precision_score(y_test, knn_pred, average='weighted'),
        'Recall': recall_score(y_test, knn_pred, average='weighted'),
        'F1 Score': f1_score(y_test, knn_pred, average='weighted')
    }
    
    # Decision Tree
    dt = DecisionTreeClassifier(random_state=42)
    dt.fit(X_train, y_train)
    dt_pred = dt.predict(X_test)
    results['Decision Tree'] = {
        'Accuracy': accuracy_score(y_test, dt_pred),
        'Precision': precision_score(y_test, dt_pred, average='weighted'),
        'Recall': recall_score(y_test, dt_pred, average='weighted'),
        'F1 Score': f1_score(y_test, dt_pred, average='weighted')
    }
    
    return results

# Using Holdout method (Random sampling):
# a) Training set = 80%, Test set = 20%
X_train, X_test, y_train, y_test = train_test_split(X_binarized, y, test_size=0.2, random_state=42)
results_80_20 = evaluate_models(X_train, X_test, y_train, y_test)

# b) Training set = 66.6%, Test set = 33.3%
X_train, X_test, y_train, y_test = train_test_split(X_binarized, y, test_size=0.333, random_state=42)
results_66_33 = evaluate_models(X_train, X_test, y_train, y_test)

# Print results
print("Results for 80% Training, 20% Test:")
print(pd.DataFrame(results_80_20).transpose())

print("\nResults for 66.6% Training, 33.3% Test:")
print(pd.DataFrame(results_66_33).transpose())


Results for 80% Training, 20% Test:
                     Accuracy  Precision    Recall  F1 Score
Naive Bayes          0.766667   0.825299  0.766667  0.709202
K-Nearest Neighbors  0.700000   0.765385  0.700000  0.699206
Decision Tree        0.766667   0.825299  0.766667  0.709202

Results for 66.6% Training, 33.3% Test:
                     Accuracy  Precision  Recall  F1 Score
Naive Bayes              0.74   0.818713    0.74  0.664154
K-Nearest Neighbors      0.74   0.727246    0.74  0.700257
Decision Tree            0.74   0.818713    0.74  0.664154


In [None]:
### titanci using holdout

In [31]:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Binarizer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load Titanic dataset
df = sns.load_dataset("titanic")

# Drop rows with missing 'embarked' values
df.dropna(subset=['embarked'], inplace=True)

# Impute missing 'age' values with mean
imputer = SimpleImputer(strategy='mean')
df['age'] = imputer.fit_transform(df[['age']])

# Convert categorical columns to numeric
df['sex'] = df['sex'].map({'male': 0, 'female': 1})
df = pd.get_dummies(df, columns=['embarked'], drop_first=True)

# Select features and target
X = df[['age', 'fare', 'sex', 'embarked_Q', 'embarked_S']]
y = df['survived']

# Standardize the data
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

# Normalize the data
normalizer = MinMaxScaler()
X_normalized = normalizer.fit_transform(X_standardized)

# Discretize the data
binarizer = Binarizer(threshold=0.5)
X_binarized = binarizer.fit_transform(X_normalized)

# Function to evaluate model performance
def evaluate_model(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    return accuracy, precision, recall, f1

# Function to train and evaluate models
def evaluate_models(X_train, X_test, y_train, y_test):
    results = {}
    
    # Naive Bayes
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    nb_pred = nb.predict(X_test)
    results['Naive Bayes'] = evaluate_model(y_test, nb_pred)
    
    # K-Nearest Neighbors
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)
    knn_pred = knn.predict(X_test)
    results['K-Nearest Neighbors'] = evaluate_model(y_test, knn_pred)
    
    # Decision Tree
    dt = DecisionTreeClassifier(random_state=42)
    dt.fit(X_train, y_train)
    dt_pred = dt.predict(X_test)
    results['Decision Tree'] = evaluate_model(y_test, dt_pred)
    
    return results

# Using Holdout method (Random sampling):
# a) Training set = 80%, Test set = 20%
X_train, X_test, y_train, y_test = train_test_split(X_binarized, y, test_size=0.2, random_state=42)
results_80_20 = evaluate_models(X_train, X_test, y_train, y_test)

# b) Training set = 66.6%, Test set = 33.3%
X_train, X_test, y_train, y_test = train_test_split(X_binarized, y, test_size=0.333, random_state=42)
results_66_33 = evaluate_models(X_train, X_test, y_train, y_test)

# Print results
print("Results for 80% Training, 20% Test:")
print(pd.DataFrame(results_80_20, index=['Accuracy', 'Precision', 'Recall', 'F1 Score']).transpose())

print("\nResults for 66.6% Training, 33.3% Test:")
print(pd.DataFrame(results_66_33, index=['Accuracy', 'Precision', 'Recall', 'F1 Score']).transpose())


Results for 80% Training, 20% Test:
                     Accuracy  Precision    Recall  F1 Score
Naive Bayes          0.803371   0.806911  0.803371  0.804501
K-Nearest Neighbors  0.696629   0.713271  0.696629  0.658234
Decision Tree        0.803371   0.806911  0.803371  0.804501

Results for 66.6% Training, 33.3% Test:
                     Accuracy  Precision    Recall  F1 Score
Naive Bayes          0.797980   0.797980  0.797980  0.797980
K-Nearest Neighbors  0.525253   0.683961  0.525253  0.495067
Decision Tree        0.797980   0.797980  0.797980  0.797980


In [None]:
### iris using k fold

In [37]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Binarizer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Standardize the data
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

# Normalize the data
normalizer = MinMaxScaler()
X_normalized = normalizer.fit_transform(X_standardized)

# Discretize the data
binarizer = Binarizer(threshold=0.5)
X_binarized = binarizer.fit_transform(X_normalized)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_binarized, y, test_size=0.3, random_state=42)

# Function to perform cross-validation and print results
def cross_validation(clf, X, y, cv_folds):
    scores = cross_val_score(clf, X, y, cv=cv_folds, scoring='accuracy')
    print(f"{clf.__class__.__name__} Accuracy with {cv_folds}-fold Cross-Validation:", scores.mean())

# Naive Bayes
nb = GaussianNB()
cross_validation(nb, X_binarized, y, 10)  # 10-fold cross-validation
cross_validation(nb, X_binarized, y, 5)   # 5-fold cross-validation

# K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5)
cross_validation(knn, X_binarized, y, 10)  # 10-fold cross-validation
cross_validation(knn, X_binarized, y, 5)   # 5-fold cross-validation

# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
cross_validation(dt, X_binarized, y, 10)  # 10-fold cross-validation
cross_validation(dt, X_binarized, y, 5)   # 5-fold cross-validation


GaussianNB Accuracy with 10-fold Cross-Validation: 0.7933333333333333
GaussianNB Accuracy with 5-fold Cross-Validation: 0.7933333333333332
KNeighborsClassifier Accuracy with 10-fold Cross-Validation: 0.64
KNeighborsClassifier Accuracy with 5-fold Cross-Validation: 0.6399999999999999
DecisionTreeClassifier Accuracy with 10-fold Cross-Validation: 0.7866666666666667
DecisionTreeClassifier Accuracy with 5-fold Cross-Validation: 0.7866666666666666


In [None]:
### titanic using k fold

In [38]:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Binarizer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load Titanic dataset
df = sns.load_dataset("titanic")

# Drop rows with missing 'embarked' values
df.dropna(subset=['embarked'], inplace=True)

# Impute missing 'age' values with mean
imputer = SimpleImputer(strategy='mean')
df['age'] = imputer.fit_transform(df[['age']])

# Convert categorical columns to numeric
df['sex'] = df['sex'].map({'male': 0, 'female': 1})
df = pd.get_dummies(df, columns=['embarked'], drop_first=True)

# Select features and target
X = df[['age', 'fare', 'sex', 'embarked_Q', 'embarked_S']]
y = df['survived']

# Standardize the data
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

# Normalize the data
normalizer = MinMaxScaler()
X_normalized = normalizer.fit_transform(X_standardized)

# Discretize the data
binarizer = Binarizer(threshold=0.5)
X_binarized = binarizer.fit_transform(X_normalized)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_binarized, y, test_size=0.3, random_state=42)

# Function to perform cross-validation and print results
def cross_validation(clf, X, y, cv_folds):
    scores = cross_val_score(clf, X, y, cv=cv_folds, scoring='accuracy')
    print(f"{clf.__class__.__name__} Accuracy with {cv_folds}-fold Cross-Validation:", scores.mean())

# Naive Bayes
nb = GaussianNB()
cross_validation(nb, X_binarized, y, 10)  # 10-fold cross-validation
cross_validation(nb, X_binarized, y, 5)   # 5-fold cross-validation

# K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5)
cross_validation(knn, X_binarized, y, 10)  # 10-fold cross-validation
cross_validation(knn, X_binarized, y, 5)   # 5-fold cross-validation

# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
cross_validation(dt, X_binarized, y, 10)  # 10-fold cross-validation
cross_validation(dt, X_binarized, y, 5)   # 5-fold cross-validation

# Existing code to evaluate on single train-test split for comparison
# Naive Bayes
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_test)
print("Naive Bayes Accuracy on Titanic:", accuracy_score(y_test, nb_pred))

# K-Nearest Neighbors
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
print("K-Nearest Neighbors Accuracy on Titanic:", accuracy_score(y_test, knn_pred))

# Decision Tree
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
print("Decision Tree Accuracy on Titanic:", accuracy_score(y_test, dt_pred))


GaussianNB Accuracy with 10-fold Cross-Validation: 0.7862487231869254
GaussianNB Accuracy with 5-fold Cross-Validation: 0.7806640005078398
KNeighborsClassifier Accuracy with 10-fold Cross-Validation: 0.7626021450459652
KNeighborsClassifier Accuracy with 5-fold Cross-Validation: 0.7367866438138767
DecisionTreeClassifier Accuracy with 10-fold Cross-Validation: 0.7873723186925433
DecisionTreeClassifier Accuracy with 5-fold Cross-Validation: 0.7874055735415476
Naive Bayes Accuracy on Titanic: 0.7940074906367042
K-Nearest Neighbors Accuracy on Titanic: 0.7940074906367042
Decision Tree Accuracy on Titanic: 0.7940074906367042
