# In [None]:
"""
1. What is a Support Vector Machine (SVM)?
   - A supervised ML algorithm that finds the hyperplane separating classes with the largest margin.

2. What is the difference between Hard Margin and Soft Margin SVM?
   - Hard Margin: Perfect separation, no errors allowed (requires linearly separable data).
   - Soft Margin: Allows misclassification for better generalization.

3. What is the mathematical intuition behind SVM?
   - Maximize the margin between classes while minimizing classification error.

4. What is the role of Lagrange Multipliers in SVM?
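   - They turn the constrained margin-maximization problem into its dual form, which is solved for one multiplier per training point; only the support vectors end up with non-zero multipliers.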

5. What are Support Vectors in SVM?
   - Data points closest to the decision boundary that define its position.

6. What is a Support Vector Classifier (SVC)?
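   - The classification variant of SVM: it predicts a discrete class label according to which side of the maximum-margin boundary a point falls on (scikit-learn's `SVC`).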

7. What is a Support Vector Regressor (SVR)?
   - Regression variant of SVM that predicts continuous values using an ε-insensitive margin.

8. What is the Kernel Trick in SVM?
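   - Uses a kernel function to compute inner products in a higher-dimensional feature space directly from the original inputs, so the data can become linearly separable there without the mapping ever being computed explicitly.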

9. Compare Linear Kernel, Polynomial Kernel, and RBF Kernel:
   - Linear: Best for linearly separable data.
   - Polynomial: Captures polynomial relationships.
   - RBF: Captures complex nonlinear patterns via Gaussian similarity.

10. What is the effect of the C parameter in SVM?
    - Controls trade-off between margin size and classification error (low C = wider margin, high C = fewer errors).

11. What is the role of the Gamma parameter in RBF Kernel SVM?
    - Determines influence of a single training point (low gamma = far reach, high gamma = close reach).

12. What is the Naïve Bayes classifier, and why is it called "Naïve"?
    - Probabilistic classifier assuming feature independence; "naïve" due to unrealistic independence assumption.

13. What is Bayes’ Theorem?
    - P(A|B) = [P(B|A) * P(A)] / P(B)

14. Explain the differences between Gaussian, Multinomial, and Bernoulli Naïve Bayes:
    - Gaussian: Continuous, normally distributed features.
    - Multinomial: Discrete counts (e.g., word counts).
    - Bernoulli: Binary/boolean features.

15. When should you use Gaussian Naïve Bayes over other variants?
    - When features are continuous and approximately Gaussian.

16. What are the key assumptions made by Naïve Bayes?
    - Features are conditionally independent given the class.

17. What are the advantages and disadvantages of Naïve Bayes?
    - Advantages: Fast, works with small data, good for high-dimensional text.
    - Disadvantages: Independence assumption often violated.

18. Why is Naïve Bayes a good choice for text classification?
    - Treating word counts as conditionally independent given the class is a workable approximation for text, and the model trains quickly and works well on sparse high-dimensional data.

19. Compare SVM and Naïve Bayes for classification tasks:
    - SVM: Margin-based, good for complex boundaries, slower on very large data.
    - NB: Probabilistic, faster for large text datasets, less effective for complex boundaries.

20. How does Laplace Smoothing help in Naïve Bayes?
    - Adds a constant (usually 1) to counts to prevent zero probability issues.
"""


# In [None]:
# ===============================
# 1 (#H): Write a Python program to train an SVM Classifier on the Iris dataset and evaluate accuracy
# ===============================
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load Iris and hold out 30% of the samples for evaluation.
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
    random_state=42,
)

# SVC with default hyperparameters; fit() hands back the fitted estimator.
clf = SVC().fit(X_train, y_train)
y_pred = clf.predict(X_test)
iris_accuracy = accuracy_score(y_test, y_pred)
print(f"1 (#H) Iris SVM Accuracy: {iris_accuracy:.4f}")

# ===============================
# 2 (##): Train two SVM classifiers with Linear and RBF kernels on Wine dataset, compare accuracies
# ===============================
# Wine data with a 70/30 train/test split shared by both classifiers.
wine = datasets.load_wine()
X_train, X_test, y_train, y_test = train_test_split(
    wine.data,
    wine.target,
    test_size=0.3,
    random_state=42,
)

# One classifier per kernel, both trained on the same split.
clf_linear = SVC(kernel='linear').fit(X_train, y_train)
clf_rbf = SVC(kernel='rbf').fit(X_train, y_train)

# Compare held-out accuracy of the two kernels.
linear_pred = clf_linear.predict(X_test)
rbf_pred = clf_rbf.predict(X_test)
acc_linear = accuracy_score(y_test, linear_pred)
acc_rbf = accuracy_score(y_test, rbf_pred)
print(f"2 (##) Wine SVM Linear Accuracy: {acc_linear:.4f}, RBF Accuracy: {acc_rbf:.4f}")

# ===============================
# 3 (#$): SVM Regressor (SVR) on California housing dataset, evaluate with MSE
# ===============================
from sklearn.datasets import fetch_california_housing
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# The RBF kernel is distance-based, so features measured on very different
# scales would dominate the kernel if left raw. Standardize inside a pipeline
# so the scaler is fitted on the training split only and reused for the test
# split.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

housing = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(
    housing.data, housing.target, test_size=0.3, random_state=42
)

# `svr` is the whole scaler + regressor pipeline; it exposes fit/predict like SVR.
svr = make_pipeline(StandardScaler(), SVR(kernel='rbf'))
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)
print(f"3 (#$) SVR Housing MSE: {mean_squared_error(y_test, y_pred):.4f}")

# ===============================
# 4 (#%): SVM with Polynomial Kernel, visualize decision boundary
# ===============================
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
# Two-feature synthetic classification problem so the boundary can be drawn in 2-D.
X, y = make_classification(
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=42,
)
clf_poly = SVC(kernel='poly', degree=3).fit(X, y)

# Classify every point of a 200x200 grid padded by 1 unit around the data.
x_axis = np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 200)
y_axis = np.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, 200)
xx, yy = np.meshgrid(x_axis, y_axis)
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = clf_poly.predict(grid_points).reshape(xx.shape)

# Shaded predicted regions with the training points drawn on top.
plt.figure()
plt.title("4 (#%) SVM Polynomial Kernel Decision Boundary")
plt.contourf(xx, yy, Z, alpha=0.3)
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolor='k')
plt.show()

# ===============================
# 5 (#!): Gaussian Naïve Bayes on Breast Cancer dataset, evaluate accuracy
# ===============================
from sklearn.naive_bayes import GaussianNB

# Breast-cancer data, 30% held out for evaluation.
cancer = datasets.load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    test_size=0.3,
    random_state=42,
)

gnb = GaussianNB().fit(X_train, y_train)
y_pred = gnb.predict(X_test)
gnb_accuracy = accuracy_score(y_test, y_pred)
print(f"5 (#!) GaussianNB Breast Cancer Accuracy: {gnb_accuracy:.4f}")

# ===============================
# 6 (#;): Multinomial Naïve Bayes for text classification (20 Newsgroups)
# ===============================
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Training subset of 20 Newsgroups with headers, footers and quoted replies removed.
newsgroups = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)

# Bag-of-words counts (English stop words dropped) feed the multinomial model.
vectorizer = CountVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(newsgroups.data)
mnb = MultinomialNB().fit(X_vec, newsgroups.target)

# Scored on the same documents it was fitted on, hence "Training Accuracy".
acc = mnb.score(X_vec, newsgroups.target)
print(f"6 (#;) MultinomialNB 20 Newsgroups Training Accuracy: {acc:.4f}")

# ===============================
# 7 (?=): SVM with different C values, visualize decision boundaries
# ===============================
C_values = [0.1, 1, 10]
X, y = make_classification(
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=42,
)

# The evaluation grid depends only on X, so it is identical for every panel.
x_axis = np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 200)
y_axis = np.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, 200)
xx, yy = np.meshgrid(x_axis, y_axis)
grid_points = np.column_stack((xx.ravel(), yy.ravel()))

# One subplot per C value, left to right.
plt.figure(figsize=(12, 4))
for i, C in enumerate(C_values, 1):
    plt.subplot(1, 3, i)
    clf_c = SVC(C=C, kernel='linear').fit(X, y)
    Z = clf_c.predict(grid_points).reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.3)
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolor='k')
    plt.title(f"C={C}")
plt.suptitle("7 (?=) SVM Decision Boundaries with Different C Values")
plt.show()

# ===============================
# 8 (%=): Bernoulli Naïve Bayes for binary classification
# ===============================
from sklearn.naive_bayes import BernoulliNB

# Draw the synthetic binary data from a locally seeded generator so the printed
# score is reproducible; a local generator leaves NumPy's global random state
# untouched.
rng = np.random.default_rng(42)
X_bin = rng.integers(0, 2, size=(100, 5))  # 100 samples x 5 binary features
y_bin = rng.integers(0, 2, size=(100,))    # binary labels drawn independently of X_bin

bnb = BernoulliNB()
bnb.fit(X_bin, y_bin)
# NOTE: scored on the data it was fitted on, so this is training accuracy.
print(f"8 (%=) BernoulliNB Accuracy: {bnb.score(X_bin, y_bin):.4f}")

# ===============================
# 9 (=): Apply feature scaling before training SVM, compare with unscaled
# ===============================
from sklearn.preprocessing import StandardScaler

# Synthetic 5-feature problem, split once and shared by both models.
X, y = make_classification(n_samples=200, n_features=5, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Baseline: SVC trained on the raw features.
clf_unscaled = SVC().fit(X_train, y_train)
acc_unscaled = clf_unscaled.score(X_test, y_test)

# Scaler statistics come from the training split only and are reused on the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Same model, standardized features.
clf_scaled = SVC().fit(X_train_scaled, y_train)
acc_scaled = clf_scaled.score(X_test_scaled, y_test)

print(f"9 (=) Unscaled Accuracy: {acc_unscaled:.4f}, Scaled Accuracy: {acc_scaled:.4f}")

# ===============================
# 10 (*=): GaussianNB accuracy with small vs. large var_smoothing (GaussianNB takes no Laplace/alpha parameter; var_smoothing is used here in its place)
# ===============================
# Fit two GaussianNB models that differ only in var_smoothing, reusing the
# X_train/X_test split already in scope.
gnb = GaussianNB(var_smoothing=1e-9).fit(X_train, y_train)
gnb_smooth = GaussianNB(var_smoothing=1e-2).fit(X_train, y_train)

# Held-out accuracy for the small and the large smoothing value respectively.
pred_before = gnb.score(X_test, y_test)
pred_after = gnb_smooth.score(X_test, y_test)
print(f"10 (*=) Accuracy before smoothing: {pred_before:.4f}, after smoothing: {pred_after:.4f}")

# ===============================
# 11 (=): SVM + GridSearchCV to tune C, gamma, kernel
# ===============================
from sklearn.model_selection import GridSearchCV

# The linear kernel ignores gamma, so crossing gamma with it only repeats
# identical fits and reports an arbitrary gamma in best_params_. Use one
# sub-grid per kernel so each searches only the parameters it actually uses.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 0.1, 1]},
]

# 3-fold cross-validated search over the standardized training features.
grid = GridSearchCV(SVC(), param_grid, cv=3)
grid.fit(X_train_scaled, y_train)
print(f"11 (=) Best Params: {grid.best_params_}, Best Score: {grid.best_score_:.4f}")

# ===============================
# 12 (=): SVM on imbalanced dataset with class_weight
# ===============================
from sklearn.datasets import make_classification
# Synthetic two-class problem with a 90/10 class imbalance.
X_imb, y_imb = make_classification(n_classes=2, class_sep=2, weights=[0.9, 0.1],
                                   n_informative=3, n_redundant=0, flip_y=0,
                                   n_features=5, n_clusters_per_class=1, n_samples=200, random_state=10)

# Stratify the split so both parts keep the 90/10 class ratio; with so few
# minority samples an unstratified split can leave the test split with almost none.
X_train, X_test, y_train, y_test = train_test_split(
    X_imb, y_imb, random_state=42, stratify=y_imb
)

# Same model with and without class re-weighting.
clf_no_weight = SVC()
clf_weight = SVC(class_weight='balanced')
clf_no_weight.fit(X_train, y_train)
clf_weight.fit(X_train, y_train)
print(f"12 (=) No weight Acc: {clf_no_weight.score(X_test, y_test):.4f}, Weighted Acc: {clf_weight.score(X_test, y_test):.4f}")

# ===============================
# 13 (=): Naïve Bayes spam detection (synthetic example)
# ===============================
from sklearn.feature_extraction.text import CountVectorizer

# Tiny hand-written corpus: the first two messages are spam, the last two are ham.
emails = [
    "Win money now",
    "Cheap meds online",
    "Hi friend how are you",
    "Meeting tomorrow at office",
]
labels = [1, 1, 0, 0]  # 1 = spam, 0 = ham

# Word counts -> multinomial Naive Bayes.
vectorizer = CountVectorizer()
X_vec = vectorizer.fit_transform(emails)
mnb = MultinomialNB().fit(X_vec, labels)

# Classify two unseen messages using the vocabulary learned above.
new_messages = ['Win a free iPhone', 'See you at lunch']
spam_predictions = mnb.predict(vectorizer.transform(new_messages))
print(f"13 (=) Spam Detection Predictions: {spam_predictions}")

# ===============================
# 14 (=): SVM vs Naïve Bayes on same dataset
# ===============================
# One synthetic dataset and one split shared by both classifiers.
X, y = make_classification(n_samples=300, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

svm_model = SVC().fit(X_train, y_train)
nb_model = GaussianNB().fit(X_train, y_train)

# Held-out accuracy of each model on the same test split.
svm_accuracy = svm_model.score(X_test, y_test)
nb_accuracy = nb_model.score(X_test, y_test)
print(f"14 (=) SVM Acc: {svm_accuracy:.4f}, Naïve Bayes Acc: {nb_accuracy:.4f}")


# In [None]:
# ===============================
# 15 (>=): Perform feature selection before Naïve Bayes and compare results
# ===============================
from sklearn.feature_selection import SelectKBest, f_classif

# 20-feature synthetic dataset; the same split is used before and after selection.
X, y = make_classification(n_samples=300, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Baseline: Naive Bayes on all 20 features.
nb_model = GaussianNB().fit(X_train, y_train)
acc_before = nb_model.score(X_test, y_test)

# Keep the 10 features with the highest ANOVA F-score, chosen on the training split only.
selector = SelectKBest(score_func=f_classif, k=10).fit(X_train, y_train)
X_train_sel = selector.transform(X_train)
X_test_sel = selector.transform(X_test)

# Same model on the reduced feature set.
nb_model_sel = GaussianNB().fit(X_train_sel, y_train)
acc_after = nb_model_sel.score(X_test_sel, y_test)
print(f"15 (>=) NB Accuracy before FS: {acc_before:.4f}, after FS: {acc_after:.4f}")

# ===============================
# 16 (<=): SVM OvR vs OvO on Wine dataset
# ===============================
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

# Wine data, split with train_test_split's default test size.
wine = datasets.load_wine()
X_train, X_test, y_train, y_test = train_test_split(
    wine.data, wine.target, random_state=42
)

# Wrap the same base SVC in the two multiclass strategies.
ovr = OneVsRestClassifier(SVC()).fit(X_train, y_train)
ovo = OneVsOneClassifier(SVC()).fit(X_train, y_train)

ovr_accuracy = ovr.score(X_test, y_test)
ovo_accuracy = ovo.score(X_test, y_test)
print(f"16 (<=) OvR Accuracy: {ovr_accuracy:.4f}, OvO Accuracy: {ovo_accuracy:.4f}")

# ===============================
# 17 (?=): SVM Linear, Poly, RBF on Breast Cancer dataset
# ===============================
# Breast-cancer data; one split reused for every kernel.
cancer = datasets.load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=42
)

# Train one SVC per kernel and report its held-out accuracy.
kernels = ['linear', 'poly', 'rbf']
for k in kernels:
    model = SVC(kernel=k).fit(X_train, y_train)
    kernel_accuracy = model.score(X_test, y_test)
    print(f"17 (?=) Kernel={k} Accuracy: {kernel_accuracy:.4f}")

# ===============================
# 18 (%=): SVM with Stratified K-Fold CV
# ===============================
from sklearn.model_selection import StratifiedKFold, cross_val_score

# 5-fold stratified cross-validation of a default SVC on the full
# breast-cancer data held in `cancer`.
skf = StratifiedKFold(n_splits=5)
cv_estimator = SVC()
scores = cross_val_score(cv_estimator, cancer.data, cancer.target, cv=skf)

# Report the mean of the per-fold scores.
mean_cv_accuracy = scores.mean()
print(f"18 (%=) Stratified K-Fold Avg Accuracy: {mean_cv_accuracy:.4f}")

# ===============================
# 19 (=): Naïve Bayes with different priors
# ===============================
# Class-prior pairs to try; each is passed to GaussianNB in place of the priors
# it would otherwise estimate. Uses the X_train/X_test split currently in scope.
priors_list = [[0.3, 0.7], [0.5, 0.5], [0.6, 0.4]]
for priors in priors_list:
    model = GaussianNB(priors=priors).fit(X_train, y_train)
    prior_accuracy = model.score(X_test, y_test)
    print(f"19 (=) Priors={priors}, Accuracy: {prior_accuracy:.4f}")

# ===============================
# 20 (*=): RFE before SVM and compare accuracy
# ===============================
from sklearn.feature_selection import RFE

# 15-feature synthetic dataset; the same split is used before and after RFE.
X, y = make_classification(n_samples=300, n_features=15, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Baseline: linear SVC on every feature.
svc = SVC(kernel='linear').fit(X_train, y_train)
acc_before = svc.score(X_test, y_test)

# Recursive feature elimination down to 8 features, fitted on the training split.
selector = RFE(estimator=svc, n_features_to_select=8)
selector.fit(X_train, y_train)
X_train_sel = selector.transform(X_train)
X_test_sel = selector.transform(X_test)

# Fresh linear SVC trained on the selected features only.
svc_sel = SVC(kernel='linear').fit(X_train_sel, y_train)
acc_after = svc_sel.score(X_test_sel, y_test)
print(f"20 (*=) SVM Accuracy before RFE: {acc_before:.4f}, after RFE: {acc_after:.4f}")

# ===============================
# 21 (=): SVM with Precision, Recall, F1
# ===============================
from sklearn.metrics import precision_score, recall_score, f1_score

# Default SVC on the split currently in scope, scored with three metrics.
model = SVC().fit(X_train, y_train)
y_pred = model.predict(X_test)

prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"21 (=) Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}")

# ===============================
# 22 (=): Naïve Bayes with Log Loss
# ===============================
from sklearn.metrics import log_loss

# Log loss is computed from predicted class probabilities, not hard labels.
model = GaussianNB().fit(X_train, y_train)
y_prob = model.predict_proba(X_test)
nb_log_loss = log_loss(y_test, y_prob)
print(f"22 (=) Log Loss: {nb_log_loss:.4f}")

# ===============================
# 23 (=): SVM Confusion Matrix with seaborn
# ===============================
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Default SVC on the split currently in scope.
model = SVC().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Confusion matrix built from (true labels, predicted labels).
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap with integer cell labels.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("23 (=) SVM Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# ===============================
# 24 (=): SVR with MAE
# ===============================
from sklearn.metrics import mean_absolute_error

# SVR's kernel is distance-based, so features measured on very different scales
# would dominate it if left raw. Standardize inside a pipeline so the scaler is
# fitted on the training split only and reused for the test split.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

housing = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(
    housing.data, housing.target, random_state=42
)

# `svr` is the whole scaler + regressor pipeline; it exposes fit/predict like SVR.
svr = make_pipeline(StandardScaler(), SVR())
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)
print(f"24 (=) SVR MAE: {mean_absolute_error(y_test, y_pred):.4f}")

# ===============================
# 25 (>=): Naïve Bayes with ROC-AUC
# ===============================
from sklearn.metrics import roc_auc_score

# Breast-cancer data, split with train_test_split's default test size.
cancer = datasets.load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=42
)

model = GaussianNB().fit(X_train, y_train)

# ROC-AUC is computed from a score per sample: take the probability column at index 1.
class_probabilities = model.predict_proba(X_test)
y_prob = class_probabilities[:, 1]
nb_roc_auc = roc_auc_score(y_test, y_prob)
print(f"25 (>=) ROC-AUC: {nb_roc_auc:.4f}")

# ===============================
# 26 (<=): SVM Precision-Recall Curve
# ===============================
from sklearn.metrics import precision_recall_curve

# precision_recall_curve only needs a score that ranks the samples, so the
# signed margin from decision_function is sufficient. This avoids
# SVC(probability=True), which runs an extra internal cross-validated
# calibration that is slower and, with no random_state set, not reproducible.
model = SVC()
model.fit(X_train, y_train)
y_score = model.decision_function(X_test)

# The returned thresholds are not needed for the plot.
precision, recall, _ = precision_recall_curve(y_test, y_score)
plt.plot(recall, precision, marker='.')
plt.title("26 (<=) SVM Precision-Recall Curve")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.show()
