# 1. Decision Trees and Naive Bayes

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.colors import ListedColormap
from sklearn import tree
from sklearn.datasets import make_moons, make_circles
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

In [None]:

# Noisy two-moons toy problem, split into a training and a test subset
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Decision trees of increasing depth, followed by Gaussian Naive Bayes
# models with increasing variance smoothing
classifiers = [DecisionTreeClassifier(max_depth=depth) for depth in (1, 2, 5, 10)]
classifiers.extend(
    GaussianNB(var_smoothing=var_smoothing) for var_smoothing in (1e-9, 1e-5, 1e-1)
)

# Fit every model on the training subset and report accuracy on both subsets
for clf in classifiers:
    clf.fit(X_train, y_train)
    print(f'Classifier: {clf.__class__.__name__}, Parameters: {clf.get_params()}')
    train_accuracy = clf.score(X_train, y_train)
    print(f'Train Accuracy: {train_accuracy}')
    test_accuracy = clf.score(X_test, y_test)
    print(f'Test Accuracy: {test_accuracy}')
    print('-----------------------------------------------------------')


For decision trees, a depth of 1 may lead to underfitting (poor performance) because the model is too simple and can't capture the complexity of the data.

A depth of 10, on the other hand, may lead to overfitting (great performance on the training set, but poor on the test set) because the model is too complex and memorizes the training data, rather than learning from it.

The optimal depth should balance between these extremes.

In [None]:
# Two toy binary problems generated with scikit-learn's default settings
datasets = [make_moons(), make_circles()]

# Estimators to compare: five decision-tree configurations and one Gaussian
# Naive Bayes model that standardises its inputs first
classifiers = [
    DecisionTreeClassifier(max_depth=1),
    DecisionTreeClassifier(),
    DecisionTreeClassifier(max_depth=10, min_samples_split=10, min_samples_leaf=5),
    DecisionTreeClassifier(min_samples_split=10, min_samples_leaf=5, min_weight_fraction_leaf=0.1),
    DecisionTreeClassifier(min_impurity_decrease=0.2),
    make_pipeline(StandardScaler(), GaussianNB()),
]

# Panel titles, listed in the same order as `classifiers`
classifier_names = [
    'D-Tree (Max Depth 1)',
    'D-Tree (default)',
    'D-Tree (Custom Parameters)',
    'D-Tree (Weighted Leaf)',
    'D-Tree (Impurity Decrease)',
    'GaussianNB',
]

# Colour maps shared by every panel: one for the decision surface, one for
# the two point classes
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])


def draw_points(ax, X_train, X_test, y_train, y_test, x_range, y_range):
    """Scatter the train and test points on `ax` and fix the view.

    Training points are drawn opaque and test points with alpha=0.6; the
    axis limits are set to `x_range` / `y_range` and the ticks are removed.
    """
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors='k')
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, edgecolors='k')
    ax.set_xlim(*x_range)
    ax.set_ylim(*y_range)
    ax.set_xticks(())
    ax.set_yticks(())


# Grid layout: one row per dataset, one column for the raw data plus one
# column per classifier
n_rows = len(datasets)
n_cols = len(classifiers) + 1
figure = plt.figure(figsize=(15, 10))

for ds_count, (X, y) in enumerate(datasets):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

    # Pad the plotted area by 0.5 around the data in both directions
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5

    # Subplot indices are 1-based; this is the first panel of the current row
    row_start = ds_count * n_cols + 1
    is_first_row = ds_count == 0

    # First column: the raw input data
    ax = plt.subplot(n_rows, n_cols, row_start)
    if is_first_row:
        ax.set_title('Input data')
    draw_points(ax, X_train, X_test, y_train, y_test, (x_min, x_max), (y_min, y_max))

    # Remaining columns: one fitted classifier each
    for column, (name, clf) in enumerate(zip(classifier_names, classifiers), start=1):
        ax = plt.subplot(n_rows, n_cols, row_start + column)

        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        DecisionBoundaryDisplay.from_estimator(clf, X, cmap=cm, alpha=0.8, ax=ax, eps=0.5)

        draw_points(ax, X_train, X_test, y_train, y_test, (x_min, x_max), (y_min, y_max))

        if is_first_row:
            ax.set_title(name)

        # Test accuracy in the lower-right corner, without the leading zero
        ax.text(x_max - 0.3, y_min + 0.3, ("%.2f" % score).lstrip("0"), size=15, horizontalalignment="right")

plt.tight_layout()
plt.show()


Model Assumptions: Different classifiers make different assumptions about the data. For example, Naive Bayes assumes that the features are conditionally independent given the class label, while decision trees make hierarchical decisions based on feature thresholds. These assumptions can lead to different model behaviors and performance depending on the dataset.

Dataset Characteristics: The characteristics of the dataset, such as the number of features, the number of samples, and the complexity of the underlying patterns, can influence the performance of different classifiers. Some classifiers may perform well on linearly separable datasets, while others may handle complex nonlinear relationships better.




The changes in the behavior of different models can be attributed to the parameters you set. For instance:

The depth of the Decision Tree affects its complexity and ability to capture intricate patterns.

Different Naive Bayes models make different assumptions about the distribution of data.

Parameters like min_samples_split, min_samples_leaf, and min_impurity_decrease control the tree's structure and stopping criteria.
min_weight_fraction_leaf and min_impurity_decrease allow us to set thresholds for node splitting.

# 2. Decision Tree Evaluation

In [None]:
import yaml

# Read the notebook settings from the YAML file. The encoding is given
# explicitly so the file is decoded the same way on every platform instead of
# depending on the locale's default encoding.
with open("config.yml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

# The CSV location is stored under the `dataset` -> `path` keys
dataset = config["dataset"]["path"]

# Load the dataset and preview its first rows
data = pd.read_csv(dataset)
data.head()

In [None]:
# Number of missing values in each column
data.isna().sum()

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
data.info()

In [None]:
# Descriptive statistics as produced by describe() with its default settings
summary_statistics = data.describe()
print(summary_statistics)

In [None]:
# Separate the feature columns from the target variable
features = data.drop('diagnosis', axis=1)
y = data['diagnosis']

# Standardise the features with StandardScaler.
# NOTE: the scaler is fitted on all rows, i.e. before any train/test split
# performed further down.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(features)

# fit_transform returns a plain array, so the original column names are
# re-attached; otherwise the columns would be labelled 0..n-1 and every later
# use of X.columns (e.g. as tree feature names) would lose the real names.
X = pd.DataFrame(scaled_values, columns=features.columns)

# Check the skewness of the (scaled) features
X.skew()

The next step is to understand which features are most important for the classification task.

In [None]:

# Modeling: a shallow tree (depth 2) that stays easy to read when plotted
dtc = DecisionTreeClassifier(max_depth=2)
scores = cross_val_score(dtc, X, y, cv=5)

print(f'Cross-validation mean accuracy: {scores.mean()}')

# Training on the full data set
dtc.fit(X, y)

# Evaluation (on the same data the tree was trained on)
print(f'Training set accuracy: {dtc.score(X, y)}')

# Explanation
# The names must describe exactly the columns the tree was trained on, so the
# target column is excluded; passing data.columns would include 'diagnosis'
# and could shift the labels of the features that follow it.
feature_names = list(data.drop('diagnosis', axis=1).columns)

# plot_tree expects the class names in the order of dtc.classes_, which is
# sorted ('B' before 'M'). Deriving the names from classes_ keeps the node
# labels correct; unknown label codes fall back to their own string value.
diagnosis_labels = {'B': 'Benign', 'M': 'Malignant'}
class_names = [diagnosis_labels.get(label, str(label)) for label in dtc.classes_]

plt.figure(figsize=(10, 7))
plot_tree(dtc, filled=True, feature_names=feature_names, class_names=class_names)
plt.show()

In [None]:

# Hold out 20% of the samples as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit a decision tree with default settings on the training part
# (fit() returns the estimator itself, so the calls can be chained)
dtc = DecisionTreeClassifier().fit(X_train, y_train)

# Predicted labels for the held-out samples
y_pred = dtc.predict(X_test)

# Compare the predictions with the true test labels
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Report both metrics
print("Accuracy:", accuracy)
print("Confusion Matrix:", cm, sep="\n")

In [None]:
# Split the rows by diagnosis code: 'M' = malignant, 'B' = benign
is_malignant = data['diagnosis'] == 'M'
is_benign = data['diagnosis'] == 'B'
data_malignant = data[is_malignant]
data_benign = data[is_benign]

# One semi-transparent scatter series per diagnosis group
scatter_groups = (
    (data_malignant, 'purple', 'Malignant'),
    (data_benign, 'lime', 'Benign'),
)
for group, colour, legend_label in scatter_groups:
    plt.scatter(group['radius_mean'], group['texture_mean'], color=colour, alpha=0.5, label=legend_label)

# Axis labels, title and legend
plt.xlabel('Radius')
plt.ylabel('Texture')
plt.title(' Average Radius vs Texture')
plt.legend()

plt.show()


In a decision tree classifier, the algorithm makes sequential choices based on feature values, dividing the data at each node according to a certain feature threshold. This continues until a termination condition, like reaching the set maximum depth or the minimum sample split limit, is fulfilled.

Features closer to the root node, appearing earlier in the tree, usually hold more importance as they contribute to major data splits. This is not a hard rule, but a general trend. By examining the tree structure and the order of feature splits, we can make educated guesses about which features are more crucial to the classification.

Moreover, scikit-learn's decision tree implementation offers a feature_importances_ attribute, which can be used to gain quantitative insights into the importance of each feature. This attribute represents the amount each feature decreases the weighted impurity.

By examining these metrics and the structure of the decision tree, we can gain a deeper understanding of the main drivers behind the classification of cases as malignant or benign in the context of the breast cancer dataset. This understanding could prove valuable in determining potential markers for cancer or figuring out the major contributing factors in cancer diagnosis.

In [None]:
# Plot the structure of the most recently fitted decision tree `dtc`
fig, ax = plt.subplots(figsize=(25, 20))

# Take the split labels from the original feature columns (everything except
# the target) and pass them as a plain list. This does not depend on whether
# X still carries column names after scaling, and a list is accepted by all
# scikit-learn versions, unlike a pandas Index.
feature_names = list(data.drop('diagnosis', axis=1).columns)

tree.plot_tree(dtc,
               feature_names=feature_names,
               class_names=['Benign', 'Malignant'],  # order of the sorted labels 'B', 'M'
               filled=True,
               fontsize=15,
               label='root',  # only the root node shows the informative labels
               ax=ax)

# Show the plot
plt.title("Decision Tree for Breast Cancer Classification", fontsize=20)
plt.show()

-----

# 3. Naive Bayes

During the Corona pandemic, seven roommates in a student house took a Corona test. The table below shows the data for these students: whether they experienced shivers, had a running nose, or had a headache. The test result is also shown.

Roommate | shivers | running nose | headache | test result
--|--|--|--|--
1 | Y | N | No | Negative
2 | N | N | Mild | Negative
3 | Y | Y | No | Positive
4 | N | Y | No | Negative
5 | N | N | Heavy | Positive
6 | Y | N | No | Negative
7 | Y | Y | Mild | Positive

First, the Roommate column shouldn't be included because it's just an identifier for each student and doesn't carry any meaningful information for predicting the test result.

Including such an identifier as a feature might mislead the classifier into thinking that different roommates have inherent predictive value, which is not the case.

In [None]:

# Symptom table of the seven roommates (the identifier column is left out)
data = {
    'shivers': ['Y', 'N', 'Y', 'N', 'N', 'Y', 'Y'],
    'running nose': ['N', 'N', 'Y', 'Y', 'N', 'N', 'Y'],
    'headache': ['No', 'Mild', 'No', 'No', 'Heavy', 'No', 'Mild'],
    'test result': ['Negative', 'Negative', 'Positive', 'Negative', 'Positive', 'Negative', 'Positive']
}
df = pd.DataFrame(data)

# Transform the nominal symptom columns into binary indicator columns
# NOTE(review): CategoricalNB treats every indicator column as a separate
# two-category feature; confirm this is intended rather than one
# integer-coded column per symptom.
df_encoded = pd.get_dummies(df.drop('test result', axis=1))

# Extract features and target variable
X = df_encoded.values
y = df['test result']

# Train the Categorical Naive Bayes classifier on the same array form that is
# used for prediction afterwards. Fitting on the DataFrame and predicting on
# plain arrays makes scikit-learn warn about missing feature names.
clf = CategoricalNB()
clf.fit(X, y)

In [None]:
# Predicted test result for each of the seven training observations
predictions = clf.predict(X)

# Share of observations whose prediction matches the recorded test result
accuracy = accuracy_score(y_true=y, y_pred=predictions)
print(f"Accuracy: {accuracy:.2f}")

In [None]:
# Class probabilities the fitted classifier assigns to the 5th observation
observation_5 = df_encoded.iloc[4, :]  # positional index 4 is the 5th row

# predict_proba expects a 2-D input, so the single row is wrapped in a list
proba = clf.predict_proba([observation_5])

# The only output row holds one probability per class, in the classifier's
# sorted label order: Negative first, then Positive
probability_negative, probability_positive = proba[0]

print("Probability of Negative class:", probability_negative)
print("Probability of Positive class:", probability_positive)


The actual result for observation 5 is "Positive," but the classifier predicted "Negative." The classifier made this error because, given the observed symptoms, it assigned a higher probability to the "Negative" class than to the "Positive" class and therefore chose "Negative."

In simpler terms, the classifier thought that observation 5 was more likely to be "Negative," and it chose that option even though it was actually "Positive." This shows that sometimes a higher probability doesn't necessarily mean the prediction is correct.




