# 1. Decision Trees and Naive Bayes

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler 
from sklearn.datasets import make_moons, make_circles
from sklearn.inspection import DecisionBoundaryDisplay  
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:

# Noisy two-moons toy problem, split into train and test partitions
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Candidate models: decision trees of increasing depth, followed by
# Gaussian Naive Bayes with increasing variance smoothing
tree_depths = [1, 2, 5, 10]
smoothing_values = [1e-9, 1e-5, 1e-1]
classifiers = [DecisionTreeClassifier(max_depth=d) for d in tree_depths]
classifiers += [GaussianNB(var_smoothing=s) for s in smoothing_values]

# Fit every candidate on the training partition and report its accuracy on
# both partitions, so under- and over-fitting can be compared side by side
separator = '-----------------------------------------------------------'
for clf in classifiers:
    clf.fit(X_train, y_train)
    train_acc = clf.score(X_train, y_train)
    test_acc = clf.score(X_test, y_test)
    print(f'Classifier: {type(clf).__name__}, Parameters: {clf.get_params()}')
    print(f'Train Accuracy: {train_acc}')
    print(f'Test Accuracy: {test_acc}')
    print(separator)


For decision trees, a depth of 1 may lead to underfitting (poor performance) because the model is too simple and can't capture the complexity of the data.

A depth of 10, on the other hand, may lead to overfitting (great performance on the training set, but poor on the test set) because the model is too complex and memorizes the training data, rather than learning from it.

The optimal depth should balance between these extremes.

In [None]:
# Two toy 2-D problems: interleaving half-moons and concentric circles.
# A fixed random_state keeps the shuffled sample order, and therefore the
# later train/test split, identical from run to run.
datasets = [make_moons(random_state=0), make_circles(random_state=0)]

# One display name per classifier, in the same order as `classifiers`.
# The plotting cell pairs the two lists with zip(), which stops at the
# shorter one, so a missing name silently drops a classifier from the figure
# and attaches the remaining labels to the wrong models.
names = [
    'Decision Tree',
    'Tree (max_depth=1)',
    'Tree (min leaf=10)',
    'Naive Bayes',
]
classifiers = [
    tree.DecisionTreeClassifier(),  # Unconstrained decision tree
    tree.DecisionTreeClassifier(max_depth=1),  # Decision stump: a single split
    tree.DecisionTreeClassifier(min_samples_leaf=10),  # At least 10 samples in every leaf
    GaussianNB(),  # Gaussian Naive Bayes classifier
]


In [None]:
# Draw one row of panels per dataset: the raw points first, then the decision
# boundary of every (name, classifier) pair annotated with its test accuracy.
figure = plt.figure(figsize=(10, 5))

# zip() stops at the shorter of names/classifiers, so materialise the pairs
# once and size the grid from what will actually be drawn. Each panel index is
# computed from its (row, column) position, which keeps every dataset on its
# own row even if the two lists differ in length.
models = list(zip(names, classifiers))
n_rows = len(datasets)
n_cols = len(models) + 1  # +1 for the raw-data panel in the first column

# Colormaps do not depend on the dataset, so build them once
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])


def _draw_points(ax, X_train, X_test, y_train, y_test, x_range, y_range):
    """Scatter train (opaque) and test (translucent) points and frame the axes."""
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors='k')
    ax.scatter(
        X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, edgecolors='k'
    )
    ax.set_xlim(*x_range)
    ax.set_ylim(*y_range)
    ax.set_xticks(())
    ax.set_yticks(())


# iterate over datasets (one figure row each)
for ds_count, ds in enumerate(datasets):
    X, y = ds

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=0
    )

    # plotting window: data extent plus a 0.5 margin on every side
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5

    # first column: the raw input data
    ax = plt.subplot(n_rows, n_cols, ds_count * n_cols + 1)
    if ds_count == 0:
        ax.set_title('Input data')
    _draw_points(ax, X_train, X_test, y_train, y_test, (x_min, x_max), (y_min, y_max))

    # remaining columns: one panel per classifier
    for col, (name, clf) in enumerate(models, start=2):
        ax = plt.subplot(n_rows, n_cols, ds_count * n_cols + col)

        # standardise the features before the classifier sees them
        model = make_pipeline(StandardScaler(), clf)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        DecisionBoundaryDisplay.from_estimator(
            model, X, cmap=cm, alpha=0.8, ax=ax, eps=0.5
        )

        _draw_points(ax, X_train, X_test, y_train, y_test, (x_min, x_max), (y_min, y_max))

        # column titles only on the first row
        if ds_count == 0:
            ax.set_title(name)

        # test accuracy in the lower-right corner, without the leading zero
        ax.text(
            x_max - 0.3,
            y_min + 0.3,
            ("%.2f" % score).lstrip("0"),
            size=15,
            horizontalalignment="right",
        )

plt.tight_layout()
plt.show()


Model Assumptions: Different classifiers make different assumptions about the data. For example, Naive Bayes assumes that the features are conditionally independent given the class label, while decision trees make hierarchical decisions based on feature thresholds. These assumptions can lead to different model behaviors and performance depending on the dataset.

Dataset Characteristics: The characteristics of the dataset, such as the number of features, the number of samples, and the complexity of the underlying patterns, can influence the performance of different classifiers. Some classifiers may perform well on linearly separable datasets, while others may handle complex nonlinear relationships better.

Model Complexity: The complexity of the model, controlled by parameters like the maximum depth of a decision tree or the number of neighbors in a k-nearest neighbors classifier, can affect its performance. A more complex model may have higher capacity and can fit the training data more closely, but it may also be prone to overfitting if the dataset is small or noisy.

Feature Scaling and Preprocessing: Different classifiers may have different sensitivity to the scale and distribution of features. Some classifiers, like Gaussian Naive Bayes, assume that each feature follows a Gaussian distribution within each class, while others, like decision trees, split on feature thresholds and are therefore largely insensitive to feature scaling. Preprocessing steps such as standardization or normalization can influence the performance of different classifiers.

Hyperparameter Tuning: The performance of a classifier can be influenced by the choice of hyperparameters. Hyperparameters control the behavior of the model and are not learned from the data. Grid search or other optimization techniques can be used to find the optimal hyperparameter values, which can significantly impact the model's performance.

# 2. Decision Tree Evaluation

In [None]:
# Read the breast-cancer table from the sibling Data directory
csv_path = r'../Data/breast-cancer.csv'
data = pd.read_csv(csv_path)

# Last expression of the cell: the notebook renders the first rows as a preview
data.head()

In [None]:
# DataFrame.info() writes its report directly to stdout and returns None,
# so wrapping it in print() would emit a stray "None" after the report.
data.info()

In [None]:
# Summary statistics of the table, printed as plain text
summary = data.describe()
print(summary)

In [None]:
# Separate features and target variable
features = data.drop('diagnosis', axis=1)
y = data['diagnosis']

# Standardise every feature column.
# NOTE(review): the scaler is fitted on the full table, before any
# train/test split or cross-validation, so held-out rows influence the
# scaling statistics. Threshold-based trees are unaffected by this, but
# confirm it is acceptable before reusing X with scale-sensitive models.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(features)

# Wrap the scaled array back into a DataFrame, keeping the original column
# names and row index so later output stays readable and X remains aligned
# with y.
X = pd.DataFrame(scaled_values, columns=features.columns, index=features.index)

# Check the skewness of the features (last expression: rendered by the notebook)
X.skew()

Next, we fit a shallow decision tree and plot it to understand the most important features in the classification task.

In [None]:

# Modeling: a depth-2 tree is small enough to read as a diagram.
# A fixed random_state makes tie-breaking between equally good splits
# reproducible across runs.
dtc = DecisionTreeClassifier(max_depth=2, random_state=0)
scores = cross_val_score(dtc, X, y, cv=5)

print(f'Cross-validation mean accuracy: {scores.mean()}')

# Training on the full data set (this fitted tree is the one plotted below)
dtc.fit(X, y)

# Evaluation on the same data the tree was trained on
print(f'Training set accuracy: {dtc.score(X, y)}')

# Explanation
# Feature names must line up with the columns of X, i.e. every column of the
# table except the target.
feature_names = [str(col) for col in data.columns if col != 'diagnosis']

# plot_tree expects class names in the order of dtc.classes_, so derive them
# from the fitted tree rather than hard-coding an order.
# NOTE(review): assumes the target is coded 'M'/'B' - confirm against the
# CSV; any other label is shown as-is.
label_names = {'M': 'Malignant', 'B': 'Benign'}
class_names = [label_names.get(label, str(label)) for label in dtc.classes_]

plt.figure(figsize=(10, 7))
# plot_tree is reached through the imported `tree` module; the bare name
# is not imported anywhere in this notebook.
tree.plot_tree(dtc, filled=True, feature_names=feature_names, class_names=class_names)
plt.show()

In [None]:

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit an unconstrained decision tree on the training portion
# (fit() returns the estimator itself, so it can be chained)
clf = DecisionTreeClassifier().fit(X_train, y_train)

# Score the held-out rows: overall accuracy plus the per-class error breakdown
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Report both metrics
print("Accuracy:", accuracy)
print("Confusion Matrix:", cm, sep="\n")

In a decision tree classifier, the algorithm makes sequential choices based on feature values, dividing the data at each node according to a certain feature threshold. This continues until a termination condition, like reaching the set maximum depth or the minimum sample split limit, is fulfilled.

Features closer to the root node, appearing earlier in the tree, usually hold more importance as they contribute to major data splits. This is not a hard rule, but a general trend. By examining the tree structure and the order of feature splits, we can make educated guesses about which features are more crucial to the classification.

Moreover, scikit-learn's decision tree implementation offers a feature_importances_ attribute, which can be used to gain quantitative insights into the importance of each feature. This attribute represents the total reduction in weighted impurity contributed by each feature's splits, normalized so that the importances sum to 1 (also known as Gini importance).

By examining these metrics and the structure of the decision tree, we can gain a deeper understanding of the main drivers behind the classification of cases as malignant or benign in the context of the breast cancer dataset. This understanding could prove valuable in determining potential markers for cancer or figuring out the major contributing factors in cancer diagnosis.

# 3. Naive Bayes

First, the Roommate column shouldn't be included because it's just an identifier for each student and doesn't carry any meaningful information for predicting the test result.

In [None]:

# Create a pandas DataFrame with the given data
data = {
    'shivers': ['Y', 'N', 'Y', 'N', 'N', 'Y', 'Y'],
    'running nose': ['N', 'N', 'Y', 'Y', 'N', 'N', 'Y'],
    'headache': ['No', 'Mild', 'No', 'No', 'Heavy', 'No', 'Mild'],
    'test result': ['Negative', 'Negative', 'Positive', 'Negative', 'Positive', 'Negative', 'Positive']
}
df = pd.DataFrame(data)

# Encode every symptom as ONE integer-coded column (0, 1, 2, ... per category).
# CategoricalNB models each input column as its own categorical feature, so
# one-hot dummies would turn a single symptom into several "independent"
# binary features and count the same evidence more than once.
symptoms = df.drop('test result', axis=1)
df_encoded = symptoms.apply(lambda column: column.astype('category').cat.codes)

# Extract the target variable
y = df['test result']

# Train the Categorical Naive Bayes classifier
clf = CategoricalNB()
clf.fit(df_encoded, y)

In [None]:
# Model-based class probabilities for the 5th observation (row index 4),
# to be compared against the hand calculation.
# Double brackets keep a one-row DataFrame, so the column names seen during
# fit() are still present at prediction time.
observation_5 = df_encoded.iloc[[4]]

# Predict the probabilities for each class
proba = clf.predict_proba(observation_5)

# Columns of predict_proba follow clf.classes_; look each label up there
# instead of assuming a fixed column position.
class_labels = list(clf.classes_)
probability_negative = proba[0, class_labels.index('Negative')]
probability_positive = proba[0, class_labels.index('Positive')]

print("Probability of Negative class:", probability_negative)
print("Probability of Positive class:", probability_positive)


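To manually calculate the prediction for observation 5, we use Bayes' theorem together with the naive independence assumption. For each class $c$ (Positive and Negative) we compute the unnormalized score $P(c)\prod_i P(x_i \mid c)$, i.e. the class prior multiplied by the probability of each observed feature value given that class. After that, we normalize these scores so they sum to 1. The class with the highest probability is the predicted class. This is the method used by Naive Bayes. Note that scikit-learn's CategoricalNB applies additive (Laplace) smoothing with alpha=1.0 by default, so its predict_proba output will not exactly match a hand calculation based on raw frequency counts.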