In [2]:
# This is the first stage of any ML project — loading the data.
# It can come from CSV, Excel, JSON, database, or an API.
# For now, we're using a built-in dataset just to keep things simple.

from sklearn.datasets import load_iris

# Load the built-in Iris dataset once; the later cells reuse these names.
iris = load_iris()

# Feature matrix and the matching label vector.
x, y = iris.data, iris.target

# Human-readable names for the feature columns and the target classes.
feature_names, target_names = iris.feature_names, iris.target_names

print("Feature names:", feature_names)
print("Target names:", target_names)
print(x[:5])  # peek at the first five rows of the feature matrix

# As you can see in the output, each row represents one flower sample, described by its four measured features.


Feature names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Target names: ['setosa' 'versicolor' 'virginica']
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]


In [6]:
# Now we split the data into training and testing sets.
# We've set 40% of the data for testing, so the remaining 60% is used for training.
# If you're unsure what ratio to use, holding out 20–30% for testing is a common starting point;
# cross-validation gives a more reliable performance estimate than any single split.

from sklearn.model_selection import train_test_split

# Hold out 40% of the samples for testing; random_state pins the shuffle so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=1,
)

# Report the shape of each piece of the split, one line per array.
for label, part in (
    ("x_train", x_train),
    ("x_test", x_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label} shape:", part.shape)

# In the output, you'll see:
# - x_train has 90 rows and 4 columns. This means we're using 90 samples (each with 4 features) for training.
# - x_test has 60 rows and 4 columns — so 60 samples (each with 4 features) for testing.
# - y_train is a 1-D array of 90 values (shape (90,)). These are the actual labels or targets the model should learn from.
# - y_test is a 1-D array of 60 values (shape (60,)) — the actual target values the model will be tested on.
# Since `y` represents the output (or target), it doesn't need any features — it’s simply one value per sample, like a price, a category, or a score.


x_train shape: (90, 4)
x_test shape: (60, 4)
y_train shape: (90,)
y_test shape: (60,)


In [13]:
# 📌 Classification Tutorial — Building & Understanding Models
# ----------------------------------------------------------
# This section shows how to train a simple classifier using Logistic Regression.
# Note: Iris has three classes (setosa, versicolor, virginica), so this is a multi-class problem rather than a binary one.
# Later, we’ll go over various models, when to use each, and real-world scenarios.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train
# -----
# Logistic Regression is a widely used classification algorithm
# (typical uses: spam detection, disease prediction, etc.).
# It assumes a linear relationship between the features and the log-odds of the output.
# The Iris target used here has three classes, so this is a multi-class fit.
model = LogisticRegression().fit(x_train, y_train)   # create the model and train it on the training split

# Predict
# -------
y_pred = model.predict(x_test)                       # predicted labels for the held-out test samples

# Evaluate
# --------
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)   # fraction of test labels predicted correctly
print("The accuracy:", accuracy)

# 🎯 If accuracy is ~0.97 (or 97%), it means that 58 of the 60 test samples were correctly classified.
# That’s a strong result for a simple three-class classifier.


The accuracy: 0.9666666666666667


In [None]:
# 🔍 Choosing the Right Machine Learning Algorithm
# Each algorithm has a core intuition — a "way of thinking" about data.
# This guide captures the essence of each method and when it naturally fits.

# 🧠 Logistic Regression
# Essence: Finds the simplest boundary (a line or hyperplane) that separates classes.
# Best For: Linearly separable classes where speed and interpretability matter.
# Intuition: If the weighted sum of features crosses a threshold, it's class 1; else class 0.

# 🦉 Naive Bayes
# Essence: Uses probabilities assuming features are independent.
# Best For: Text classification like spam detection or sentiment analysis.
# Intuition: Multiply the probability of each feature given a class, pick the class with the highest product.

# 🌳 Decision Tree
# Essence: Asks a sequence of yes/no questions to split data based on most informative features.
# Best For: Problems with clear logical rules or non-linear relationships.
# Intuition: Recursively split the data using the best feature until pure or max depth.

# 🌲 Random Forest
# Essence: Combines many decision trees trained on different parts of the data and features.
# Best For: Tabular data with noise or potential overfitting.
# Intuition: Let many decorrelated trees vote; their combined prediction is more stable than any single tree's.

# 🌀 Support Vector Machine (SVM)
# Essence: Finds the widest possible margin between classes using only critical support vectors.
# Best For: High-dimensional data with clear margins — text, bioinformatics, etc.
# Intuition: Find the cleanest line (or hyperplane) that separates the classes with maximum margin.

# 👟 K-Nearest Neighbors (KNN)
# Essence: Classifies a point based on the majority label of its nearest neighbors.
# Best For: Small datasets or low-dimensional spaces where similarity matters.
# Intuition: No training. Just store the data and look around when a new point comes.

# 🧠 Neural Networks
# Essence: Learns layered representations of input by passing data through multiple transformations.
# Best For: Complex tasks like image recognition, speech, or language modeling on large datasets.
# Intuition: Each layer extracts higher-level features; like how brains learn abstract patterns.

# 💡 Tip:
# Start simple (like Logistic Regression or Naive Bayes).
# Move to more complex models (SVM, Random Forest, Neural Nets) only if performance demands it.
# Always validate using test data to avoid overfitting and ensure generalization.


In [20]:
# 🧠 Note:
# Although we have already trained and tested the model,
# here we are revisiting an earlier pipeline step (encoding) 
# to better understand how categorical data is converted into numeric form.
# This explanation is for learning purposes.

# 🔁 This is the second stage of the pipeline — encoding.
# In this step, we convert categorical data (like 'dog', 'cat', 'bird') into numerical form.
# LabelEncoder assigns integers in sorted (alphabetical) label order, like: bird=0, cat=1, dog=2.
# But this imposes an artificial ordering (dog > cat > bird), which may not make sense for many models.
# Note: LabelEncoder is designed for target labels (y); for input features use OrdinalEncoder
# when an order is meaningful, or OneHotEncoder otherwise — which we'll see after this.

from sklearn.preprocessing import LabelEncoder

# Small toy list of category labels to encode.
categorical_data = ['dog', 'cat', 'cat', 'dog', 'bird']

# Learn the label -> integer mapping first, then apply it to the same list.
encoder = LabelEncoder().fit(categorical_data)
encoded = encoder.transform(categorical_data)

print(encoded)  # Output: [2 1 1 2 0]


[2 1 1 2 0]


In [23]:
# 🧠 Continuing from the previous step:
# LabelEncoder gave us integer values, but those integers implied an order (e.g., dog > cat > bird),
# which isn't appropriate for most machine learning models.
# So instead, we use OneHotEncoder to represent categories *without* implying any order.

# 📌 OneHotEncoder creates a binary column for each category:
# For example:
# 'dog'  → [0. 0. 1.]
# 'cat'  → [0. 1. 0.]
# 'bird' → [1. 0. 0.]

from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Category labels as a single-column 2D array, the layout OneHotEncoder requires.
categorical_data = np.array(['dog', 'cat', 'cat', 'dog', 'bird'])[:, np.newaxis]

# sparse_output=False makes the encoder return a dense NumPy array.
encoder = OneHotEncoder(sparse_output=False)

# Learn the categories first, then encode the same data.
encoder.fit(categorical_data)
encoded_features = encoder.transform(categorical_data)

# Show the one-hot matrix: one row per sample, one column per category.
print("One-hot encoded features:\n", encoded_features)


One-hot encoded features:
 [[0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]


In [27]:
## 📌 Conclusion

# In this notebook, we built a simple multi-class classification pipeline using Logistic Regression on the Iris dataset. We covered model evaluation using accuracy and, as standalone demonstrations, essential preprocessing steps like label encoding and one-hot encoding.

#Thanks for reading! 😊  
#Feel free to ⭐️ the repo or leave feedback.


In [28]:
# NOTE(review): leftover scratch cell — nothing else in the visible notebook uses this value.
# Its saved output exposes an absolute local path; consider removing the cell or clearing
# the output before sharing the notebook.
import os
os.getcwd()


'C:\\Users\\bowjo'

In [29]:
# NOTE(review): a bare `dir` only displays the built-in function object (see the saved output).
# Presumably a call such as `dir()` or a directory listing was intended — confirm, or remove this cell.
dir


<function dir>