<a href="https://colab.research.google.com/github/soumyamulgund/Agentic-AI-Application/blob/main/Classification_Algorithms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification, make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

#Helper: Decision Boundary Plot

In [None]:
def plot_decision_boundary(model, X, y, title):
    """Draw the regions a fitted classifier assigns to each class.

    Args:
        model: object exposing ``predict`` that accepts an ``(n, 2)`` array.
        X: 2-D array; only its first two columns are plotted.
        y: per-sample values used to colour the scatter points.
        title: text shown above the plot.
    """
    first, second = X[:, 0], X[:, 1]

    # 300 x 300 lattice covering the data range, padded by one unit per side.
    axis_a = np.linspace(first.min() - 1, first.max() + 1, 300)
    axis_b = np.linspace(second.min() - 1, second.max() + 1, 300)
    mesh_a, mesh_b = np.meshgrid(axis_a, axis_b)

    # Classify every lattice point, then fold the flat predictions back
    # into the lattice shape so they can be drawn as filled contours.
    lattice_points = np.column_stack((mesh_a.ravel(), mesh_b.ravel()))
    regions = model.predict(lattice_points).reshape(mesh_a.shape)

    plt.contourf(mesh_a, mesh_b, regions, alpha=0.3)
    plt.scatter(first, second, c=y, edgecolors="k")
    plt.title(title)
    plt.show()


#Dummy 2D classification data

#General Purpose (Not Linearly Separable)

In [None]:
# Seeded synthetic dataset: 200 samples, two features, none redundant,
# one cluster per class.
X, y = make_classification(
    n_samples=200, n_features=2, n_redundant=0,
    n_clusters_per_class=1, class_sep=1.5, random_state=42,
)

# X.T unpacks into the two feature columns; points are coloured by label.
plt.scatter(*X.T, c=y, edgecolors='k')
plt.title("Non Linearly Separable Dataset")
plt.show()

#Strictly Linearly Separable

In [None]:
# Seeded dataset of 200 two-feature samples drawn around two centers.
X_lin, y_lin = make_blobs(
    n_samples=200, n_features=2, centers=2,
    cluster_std=1.0, random_state=42,
)

# X_lin.T unpacks into the two feature columns; colour encodes the blob label.
plt.scatter(*X_lin.T, c=y_lin, edgecolors='k')
plt.title("Linearly Separable Dataset (Perceptron & SVM)")
plt.show()


# Logistic Regression

In [None]:
# Default-configured logistic regression, fitted on the general-purpose data.
logreg = LogisticRegression().fit(X, y)

plot_decision_boundary(logreg, X, y, title="Logistic Regression")


#Perceptron — FROM SCRATCH

In [None]:
class PerceptronScratch:
    """Minimal binary perceptron trained with the mistake-driven update rule.

    A label equal to 1 is the positive class; any other label value is
    mapped to -1 internally. Training is incremental: each ``fit`` call
    continues from the weights left by the previous one, so the decision
    boundary can be inspected between passes.

    Attributes:
        lr: step size applied to every corrective update.
        w: weight vector (one entry per feature), ``None`` until ``fit`` runs.
        b: bias term.
    """

    def __init__(self, lr=0.1):
        self.lr = lr
        # The weight vector is created on the first fit() call, once the
        # number of features is known.
        self.w = None
        self.b = 0.0

    def fit(self, X, y, epochs=1):
        """Run ``epochs`` passes of perceptron updates over the samples.

        Args:
            X: array-like of shape (n_samples, n_features).
            y: array-like of n_samples labels; 1 is positive, anything
                else is negative.
            epochs: number of full passes over the data.

        Returns:
            self, so calls can be chained.
        """
        # Accept plain Python sequences as well as ndarrays.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        if self.w is None:
            self.w = np.zeros(X.shape[1])
            self.b = 0.0

        for _ in range(epochs):
            for xi, yi in zip(X, y):
                target = 1 if yi == 1 else -1
                # A non-positive signed score means the sample is
                # misclassified or lies exactly on the boundary.
                if target * (np.dot(xi, self.w) + self.b) <= 0:
                    self.w += self.lr * target * xi
                    self.b += self.lr * target

        return self

    def predict(self, X):
        """Return 1 where the linear score is >= 0 and 0 elsewhere.

        Raises:
            AttributeError: if called before ``fit``.
        """
        if self.w is None:
            # AttributeError matches what an unfitted instance raised before
            # the weights were initialised in __init__.
            raise AttributeError(
                "PerceptronScratch has no weights yet; call fit() first"
            )
        return (np.dot(X, self.w) + self.b >= 0).astype(int)


#Movement of Decision Boundary

In [None]:
perceptron = PerceptronScratch(lr=0.05)

# Train one pass at a time and redraw after each pass; the model keeps its
# weights between fit() calls, so each plot shows the boundary after one
# more pass over the data.
for epoch in range(1, 6):
    perceptron.fit(X_lin, y_lin, epochs=1)
    plot_decision_boundary(
        perceptron, X_lin, y_lin, title=f"Perceptron – Epoch {epoch}"
    )


#SVM (sklearn) + Supporting Hyperplanes

In [None]:
# Linear-kernel support vector classifier on the blob data.
svm = SVC(kernel="linear", C=1.0).fit(X_lin, y_lin)

plot_decision_boundary(
    svm, X_lin, y_lin,
    title="SVM (Linear Kernel, Linearly Separable Data)",
)


#SVM SUPPORTING HYPERPLANES

In [None]:
# Weight vector and bias of the fitted linear SVM.
w, b = svm.coef_[0], svm.intercept_[0]

# Horizontal extent of the lines: the observed range of the first feature.
x_vals = np.linspace(X_lin[:, 0].min(), X_lin[:, 0].max(), 200)

plt.scatter(*X_lin.T, c=y_lin, edgecolors="k")

# Each line solves w[0]*x1 + w[1]*x2 + b = c for x2:
# c = 0 is the decision boundary, c = +1 and c = -1 are the two margins.
plt.plot(x_vals, -(w[0]*x_vals + b) / w[1], 'k', label="Decision Boundary")
plt.plot(x_vals, -(w[0]*x_vals + b - 1) / w[1], 'k--', label="Margin")
plt.plot(x_vals, -(w[0]*x_vals + b + 1) / w[1], 'k--')  # unlabeled second margin

plt.legend()
plt.title("SVM with Supporting Hyperplanes")
plt.show()


#k-NN (sklearn)

In [None]:
# Five-neighbour classifier fitted on the general-purpose data.
knn = KNeighborsClassifier(n_neighbors=5).fit(X, y)

plot_decision_boundary(knn, X, y, title="k-NN (k=5)")


#Decision Tree (sklearn)

In [None]:
# Depth-limited decision tree. The seed is pinned, like the other seeded
# steps in this notebook, so that ties between equally good splits are
# broken the same way on every run and the plots are reproducible.
tree = DecisionTreeClassifier(max_depth=3, random_state=42)
tree.fit(X, y)

plot_decision_boundary(tree, X, y, "Decision Tree")


#Tree Diagram

In [None]:
# Wide canvas so the node boxes of the fitted tree stay legible.
plt.figure(figsize=(12, 6))
plot_tree(
    tree,
    feature_names=["x1", "x2"],
    filled=True,
)
plt.show()


#Random Forest (sklearn)

In [None]:
# Seeded forest of 100 trees, each limited to depth 5.
rf = RandomForestClassifier(
    n_estimators=100, max_depth=5, random_state=42
).fit(X, y)

plot_decision_boundary(rf, X, y, title="Random Forest")


#Gradient Boosted Decision Trees (sklearn)

In [None]:
# 100 boosting stages of depth-3 trees. The seed is pinned, matching the
# random forest cell, so the fitted ensemble and its plot are reproducible.
gbdt = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)
gbdt.fit(X, y)

plot_decision_boundary(gbdt, X, y, "Gradient Boosted Trees")


#XGBoost (xgboost package, scikit-learn-style API)

In [None]:
import xgboost as xgb

# Boosted trees from the xgboost package: 100 estimators, depth 3,
# learning rate 0.1.
xgb_clf = xgb.XGBClassifier(
    n_estimators=100, learning_rate=0.1, max_depth=3
).fit(X, y)

plot_decision_boundary(xgb_clf, X, y, title="XGBoost")

#***Unsupervised Learning***

#PCA DATASET

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global generator so the sample below is repeatable.
np.random.seed(42)

# 300 zero-mean 2-D samples; the off-diagonal covariance of 2 makes the
# two coordinates move together.
X_pca = np.random.multivariate_normal(
    mean=[0, 0], cov=[[3, 2], [2, 2]], size=300
)

plt.scatter(*X_pca.T, alpha=0.5)
plt.xlabel("x1")
plt.ylabel("x2")
plt.title("Original 2D Data (Correlated)")
plt.show()


#Applying PCA

In [None]:
# Put both features on a common scale before extracting components.
X_pca_scaled = StandardScaler().fit_transform(X_pca)

# Keep a single component and project the scaled data onto it.
pca = PCA(n_components=1)
X_reduced = pca.fit_transform(X_pca_scaled)

# Share of the total variance captured by the retained component.
print("Explained variance ratio:", pca.explained_variance_ratio_)


#Visualize Principal Components

In [None]:
# Direction of the first (and only) fitted component.
pc = pca.components_[0]

plt.scatter(*X_pca_scaled.T, alpha=0.3)

# Arrow from the origin along the component, stretched threefold so it
# is visible against the point cloud.
plt.arrow(0, 0, *(pc * 3), color='r', width=0.02)

plt.xlabel("x1")
plt.ylabel("x2")
plt.title("PCA: Principal Component Direction")
plt.show()


#PCA (2D → 1D → BACK TO 2D)

In [None]:
# Compress the scaled data down to one coordinate per sample ...
pca = PCA(n_components=1)
X_pca_1d = pca.fit_transform(X_pca_scaled)

# ... then map those coordinates back into the original 2-D space.
X_reconstructed = pca.inverse_transform(X_pca_1d)


In [None]:
# Overlay the scaled input and its one-component reconstruction so the
# two point sets can be compared directly.
plt.scatter(*X_pca_scaled.T, alpha=0.3, label="Original")
plt.scatter(*X_reconstructed.T, alpha=0.6, label="Reconstructed")

plt.legend()
plt.title("PCA Reconstruction (1D → 2D)")
plt.show()


In [None]:


# 500 samples with 20 features: 8 informative and 12 redundant.
# The labels are discarded; only the feature matrix is needed here.
X_hd, _ = make_classification(
    n_samples=500, n_features=20,
    n_informative=8, n_redundant=12,
    random_state=42,
)

# Bring all features to a common scale.
X_hd_scaled = StandardScaler().fit_transform(X_hd)


In [None]:
# Fit without a component limit so every component's variance share is available.
pca = PCA().fit(X_hd_scaled)

# Per-component variance fraction and its running total.
explained_var = pca.explained_variance_ratio_
cumulative_var = explained_var.cumsum()


In [None]:
plt.figure(figsize=(8, 5))

# Running variance total against the (1-based) number of components kept.
component_counts = range(1, len(cumulative_var) + 1)
plt.plot(component_counts, cumulative_var, marker='o', label="Cumulative Variance")

# Reference lines at the two variance targets.
plt.axhline(0.85, color='r', linestyle='--', label='85% threshold')
plt.axhline(0.95, color='g', linestyle='--', label='95% threshold')

plt.title("Choosing the Number of PCA Components")
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Variance Explained")

plt.grid(True)
plt.legend()
plt.show()


In [None]:
# argmax on a boolean array yields the index of its first True entry;
# adding one turns that index into a component count.
n_85 = (cumulative_var >= 0.85).argmax() + 1
n_95 = (cumulative_var >= 0.95).argmax() + 1

print(f"Components for 85% variance: {n_85}")
print(f"Components for 95% variance: {n_95}")


#Clustering Dataset

In [None]:
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# 300 seeded points around three centers; the generated labels are
# dropped because the data is used for clustering.
X_cluster, _ = make_blobs(
    n_samples=300, centers=3, cluster_std=1.0, random_state=42
)

plt.scatter(*X_cluster.T)
plt.title("Unlabeled Data for Clustering")
plt.show()


#Apply K-Means

In [None]:
# n_init is set explicitly because its default differs between
# scikit-learn releases; pinning it keeps the clustering the same across
# versions and avoids the deprecation warning some releases emit.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# Fit the model and get the cluster index assigned to every sample.
labels = kmeans.fit_predict(X_cluster)


#VISUALIZE CLUSTERS & CENTROIDS

In [None]:
# Samples coloured by their assigned cluster index.
plt.scatter(*X_cluster.T, c=labels, cmap='viridis')

# Fitted cluster centers drawn on top as large red crosses.
plt.scatter(*kmeans.cluster_centers_.T, s=200, c='red', marker='X')

plt.title("K-Means Clustering (k=3)")
plt.show()


#Elbow Method

In [None]:
K_range = range(1, 10)

# Fit one throwaway model per candidate k and record its inertia.
# The models are deliberately not bound to the name `kmeans`, so the
# k=3 model fitted earlier stays paired with its `labels`.
# n_init is set explicitly because its default differs between
# scikit-learn releases.
inertia = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_cluster).inertia_
    for k in K_range
]

plt.plot(K_range, inertia, marker='o')
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Inertia (Within-cluster SSE)")
plt.title("Elbow Method for K-Means")
plt.show()
