# Hands-on Machine Learning with Python: Part-3

> **Iris Dataset**


In [None]:
import warnings

# Suppress every warning for the rest of the session so that library notices
# do not clutter cell output.
# NOTE(review): this also hides warnings that may point at real problems.
warnings.filterwarnings("ignore")

# Import necessary libraries


In [None]:
# Essentials - Numerical calculation and data manipulation libraries
import numpy as np
import pandas as pd

# Data Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sb

# Data preprocessing libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.decomposition import PCA

# ML Models libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

# Ensemble Models  libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import make_pipeline

# Model Evaluation libraries
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_curve, auc

# Load dataset


In [None]:
# Read the Iris table from the CSV file stored next to the notebook.
iris_csv_path = "datasets/iris.csv"
data = pd.read_csv(iris_csv_path)

# EDA (Exploratory Data Analysis)


In [None]:
# Preview the first two records of the table.
print("Head of the dataset:")
data.head(n=2)

In [None]:
# (rows, columns) of the loaded table.
n_rows, n_cols = data.shape
print((n_rows, n_cols))

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line.
data.info()

In [None]:
# Distinct class labels as a plain list (only the last expression of a cell is
# displayed, so a single expression is enough).
data["species"].unique().tolist()

In [None]:
# Summary statistics of the numeric columns.
summary_stats = data.describe()
summary_stats

In [None]:
# Number of missing values per column.
data.isna().sum()

# Data Visualization


In [None]:
# Bar chart of how many rows belong to each species.
sb.countplot(data=data, x="species")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Take wedge sizes AND labels from the same value_counts() result so each
# label is guaranteed to sit next to its own count (unique() returns labels in
# order of appearance, which need not match the count-sorted order).
species_counts = data["species"].value_counts()
y = species_counts.tolist()
mylabels = species_counts.index.tolist()
# One small offset per wedge, however many classes there are.
myexplode = [0.01] * len(y)

plt.pie(y, labels=mylabels, autopct="%1.2f%%", explode=myexplode)
plt.legend(title="Species:")
plt.title("Distribution of Species in the Iris Dataset")

plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter plot of petal length vs. petal width; both the colour and the marker
# size encode the species.
sns.scatterplot(
    x="petal_length",
    y="petal_width",
    hue="species",
    size="species",
    data=data,
    palette="Set2",
    sizes=(20, 200),
)

plt.title(
    "Scatter Plot of Petal length, Petal width, and Species (Size represents Species)"
)
plt.show()

In [None]:
# Spread of petal length within each species.
sb.boxplot(data=data, x="species", y="petal_length")

In [None]:
# Spread of sepal length within each species.
sb.boxplot(data=data, x="species", y="sepal_length")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# The four measurement columns plus the class label used for colouring.
measurement_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
columns_to_visualize = [*measurement_columns, "species"]

# Pairwise scatter plots of the selected columns, coloured by species.
pairplot_frame = data[columns_to_visualize]
sns.pairplot(pairplot_frame, hue="species", palette="Set1")
plt.suptitle("Pair Plot of sepal_length, sepal_width, petal_length, and petal_width")
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Numeric measurement columns only.
columns_to_visualize = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
]

# Pairwise correlations between the selected columns.
numeric_frame = data[columns_to_visualize]
correlation_matrix = numeric_frame.corr()

# Annotated heatmap of the correlation matrix.
heatmap_title = (
    "Correlation Heatmap of sepal_length, sepal_width, petal_length, and petal_width"
)
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title(heatmap_title)
plt.show()

# Data Pre-processing


> **Dividing into Feature Matrix and Target Vector**


In [None]:
# Feature matrix: every column except the class label.
x = data.drop(columns="species")
# Target vector: the class label column.
y = data["species"]

for preview in (x.head(), y.head()):
    print(preview)

> **Feature Engineering**


In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each species name with an integer class code; `le` keeps the mapping.
le = LabelEncoder()
y = le.fit_transform(data["species"])
y

> **Data Transformation or Normalization**

> _Feature Scaling: Min-Max Normalization_


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Rescale every feature column with min-max normalization.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so statistics of the test rows leak into the features; consider
# fitting it on the training split only.
scaler = MinMaxScaler()
x = scaler.fit_transform(x)
# Show a small sample instead of dumping the whole array.
print(x[:5, :])
print(x.shape)
print(type(x))

> **Preprocessing: Train-test split**


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(x, y, test_size=0.2, random_state=123)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Shapes of the four split arrays, in the order train/test features then
# train/test targets.
for split_part in (x_train, x_test, y_train, y_test):
    print(split_part.shape)

# Model building


In [None]:
# ML Models libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Ensemble Models  libraries
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Model Building
# Linear-kernel SVM behind a standardization step, with probability estimates
# enabled, and a 100-tree random forest with a fixed seed.
svm_steps = (StandardScaler(), SVC(kernel="linear", probability=True))
svm_model = make_pipeline(*svm_steps)
random_forest_model = RandomForestClassifier(random_state=42, n_estimators=100)

In [None]:
# Model Training
svm_model.fit(x_train, y_train)
random_forest_model.fit(x_train, y_train)

# SVM: predict on the held-out split and report a labelled accuracy.
# accuracy_score expects (y_true, y_pred).
y_pred = svm_model.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print("SVM accuracy:", acc)

# Random Forest: `y_pred` and `acc` are left holding the Random Forest results,
# which the evaluation cells further down read.
y_pred = random_forest_model.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print("Random Forest accuracy:", acc)

# Model Evaluation

> **Confusion matrix, accuracy, precision, recall, F1-score, ROC curve etc**


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

> **The Confusion Matrix**


In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted classes, matching the axis labels of the heatmap.
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Create a Seaborn heatmap for visualization.
# Tick labels come from the fitted LabelEncoder so that they follow the same
# order as the integer class codes indexing the matrix (unique() returns labels
# in order of appearance, which is not guaranteed to match).
class_names = le.classes_.tolist()

sns.set(font_scale=1.2)
sns.heatmap(
    cm,
    annot=True,
    fmt="g",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.show()

> **Classification Report**


In [None]:
# classification_report expects (y_true, y_pred); swapping them exchanges the
# precision and recall columns.
cr = classification_report(y_test, y_pred)
print(cr)

> **Accuracy Score**


In [None]:
# Arguments in the (y_true, y_pred) order used by the scikit-learn metrics API.
acc = accuracy_score(y_test, y_pred)
print(acc)

> **ROC Curve**


In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
from sklearn.multiclass import OneVsRestClassifier

# One-vs-rest ROC curves for the Random Forest model.
# A ROC curve needs continuous scores: binarized hard predictions give only a
# single operating point per class, so the class probabilities are used.
class_labels = random_forest_model.classes_
y_test_bin = label_binarize(y_test, classes=class_labels)
# Columns of predict_proba follow the order of `classes_`.
y_score = random_forest_model.predict_proba(x_test)

# Compute ROC curve and ROC area for each class
fpr = {}
tpr = {}
roc_auc = {}
n_classes = y_test_bin.shape[1]

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC curve for each class
plt.figure(figsize=(8, 6))
for i in range(n_classes):
    plt.plot(
        fpr[i], tpr[i], lw=2, label=f"ROC curve for class {i} (AUC = {roc_auc[i]:.2f})"
    )

# Diagonal reference line of a random classifier.
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--", label="Random")
plt.xlabel("False Positive Rate (1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity)")
plt.title(
    "Receiver Operating Characteristic (ROC) Curve for Multi-Class Classification"
)
plt.legend(loc="lower right")
plt.show()

# Improving Model Performance


# Improving Model Accuracy


**Ensemble - VotingClassifier**


In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Create individual classifiers
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
logistic_classifier = LogisticRegression(random_state=42)
svm_classifier = SVC(probability=True, random_state=42)
knn_classifier = KNeighborsClassifier()

# (name, estimator) pairs in the form VotingClassifier expects.
classifiers = [
    ("Random Forest", rf_classifier),
    ("Logistic Regression", logistic_classifier),
    ("SVM", svm_classifier),
    ("KNN", knn_classifier),
]

# Majority ("hard") voting over the predicted labels; use voting="soft" to
# vote on predicted probabilities instead.
voting_classifier = VotingClassifier(estimators=classifiers, voting="hard")

# Fit the ensemble and score ITS predictions (not the earlier `y_pred`, which
# holds another model's output).
voting_classifier.fit(x_train, y_train)
predictions = voting_classifier.predict(x_test)
print(accuracy_score(y_test, predictions))

# Improving Model Reliability


**Simple 5-fold Cross-Validation**

> _Cross-validation is a resampling procedure used to evaluate machine learning models by partitioning the dataset into multiple subsets, training the model on some subsets, and assessing its performance on the remaining subsets to obtain robust performance estimates._


In [None]:
# 5-fold cross-validation of both models on the training split.
svm_cv_scores = cross_val_score(svm_model, x_train, y_train, cv=5)
rf_cv_scores = cross_val_score(random_forest_model, x_train, y_train, cv=5)

print("\nCross-validation scores:")

# Per-fold scores followed by their mean, one model at a time.
for model_label, fold_scores in (
    ("SVM", svm_cv_scores),
    ("Random Forest", rf_cv_scores),
):
    print(f"{model_label}: ", fold_scores)
    print(f"Mean accuracy - {model_label}: {fold_scores.mean():.2f}")

**Stratified K-Fold (Stratified 10-fold Cross-Validation)**

> **Hyper Parameter (cv) Tuning**

> Hyperparameters are external configuration settings that are not learned from the data but are set before the training process begins.

> Examples of hyperparameters include the learning rate, the number of hidden layers in a neural network, the number of trees in a random forest, or the regularization parameter in a support vector machine.

> Hyperparameter tuning is the process of finding the optimal set of hyperparameters for a machine learning model.

> They can significantly impact the performance of the model.


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Estimator under evaluation (swap in any other classifier here).
classifier = RandomForestClassifier(random_state=42, n_estimators=100)

# Ten shuffled, class-balanced folds with a fixed seed.
stratified_kfold = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

# Accuracy of the classifier on each stratified fold.
cv_results = cross_val_score(
    classifier,
    x,
    y,
    scoring="accuracy",
    cv=stratified_kfold,
)

# Per-fold scores and their mean.
mean_accuracy = cv_results.mean()
print("Cross-validation results:")
print(cv_results)
print(f"Mean accuracy: {mean_accuracy:.2f}")

# Improving Model Efficiency


> **PCA (Principal Component Analysis)**

> _Achieve Computational Efficiency through Feature Reduction_


In [None]:
# Reduce the features to two principal components. The projection is learned
# from the training split and then re-used, unchanged, on the test split.
n_pca_components = 2
pca = PCA(n_components=n_pca_components)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

In [None]:
# Model building after PCA
svm_pca_model = SVC(kernel="linear", probability=True)
random_forest_pca_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit models on the PCA-reduced training features
svm_pca_model.fit(x_train_pca, y_train)
random_forest_pca_model.fit(x_train_pca, y_train)

# Predict with the PCA models on the PCA-reduced test features (the earlier
# svm_model / random_forest_model were trained without PCA and would only
# reproduce the pre-PCA results).
y_pred_svm = svm_pca_model.predict(x_test_pca)
y_pred_rf = random_forest_pca_model.predict(x_test_pca)

# Accuracy Score, with arguments in (y_true, y_pred) order
acc_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy {acc_svm:.2f}")
acc_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Model Accuracy", acc_rf)

In [None]:
# Accuracy Score, with arguments in (y_true, y_pred) order
acc_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy {acc_svm:.2f}")
acc_rf = accuracy_score(y_test, y_pred_rf)
# This value belongs to the Random Forest predictions, so label it as such.
print("Random Forest Model Accuracy", acc_rf)

In [None]:
# classification_report expects (y_true, y_pred); swapping them exchanges the
# precision and recall columns.
cr_svm = classification_report(y_test, y_pred_svm)
print(cr_svm)

In [None]:
# classification_report expects (y_true, y_pred); swapping them exchanges the
# precision and recall columns.
cr_rfm = classification_report(y_test, y_pred_rf)
print(cr_rfm)

In [None]:
# Confusion Matrix
def plot_confusion_matrix(model, x_test, y_test, title, class_names=None):
    """Plot the confusion matrix of ``model`` on a test set as a heatmap.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    x_test : features passed to ``model.predict``.
    y_test : true labels for ``x_test``.
    title : str, title of the figure.
    class_names : sequence of str, optional
        Tick labels in the order of the matrix rows/columns. When omitted, the
        sorted distinct values of ``data["species"]`` are used, which matches
        the sorted class order the LabelEncoder assigns to the integer codes.
    """
    y_pred = model.predict(x_test)
    cm = confusion_matrix(y_test, y_pred)

    if class_names is None:
        class_names = sorted(data["species"].unique().tolist())

    plt.figure(figsize=(6, 4))
    # Hand the labels to heatmap() so they are centred on the cells; placing
    # ticks at 0..n-1 afterwards would put them on the cell edges instead.
    sns.heatmap(
        cm,
        annot=True,
        fmt="g",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
    )

    plt.title(title)

    # Rotate the existing tick labels without moving them.
    plt.xticks(rotation=45)
    plt.yticks(rotation=45)

    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.show()


# One confusion matrix per PCA-trained model, SVM first and then Random Forest.
pca_models_to_plot = (
    (svm_pca_model, "Confusion Matrix - SVM after PCA"),
    (random_forest_pca_model, "Confusion Matrix - Random Forest after PCA"),
)
for fitted_model, figure_title in pca_models_to_plot:
    plot_confusion_matrix(fitted_model, x_test_pca, y_test, figure_title)

> **Cross Validation with PCA**


In [None]:
# Cross-validation of the PCA models.
# PCA is wrapped together with each classifier in a pipeline so the projection
# is re-fitted on the training part of every fold; projecting once up front
# would let each validation fold influence the components it is scored on.
# cross_val_score clones the estimators, so the fitted models are not modified.
svm_pca_pipeline = make_pipeline(PCA(n_components=pca.n_components), svm_pca_model)
rf_pca_pipeline = make_pipeline(
    PCA(n_components=pca.n_components), random_forest_pca_model
)

svm_cv_scores = cross_val_score(svm_pca_pipeline, x_train, y_train, cv=5)
rf_cv_scores = cross_val_score(rf_pca_pipeline, x_train, y_train, cv=5)

print("\nCross-validation scores:")

print("SVM: ", svm_cv_scores)
print(f"Mean accuracy - SVM: {svm_cv_scores.mean():.2f}")

print("Random Forest: ", rf_cv_scores)
print(f"Mean accuracy - Random Forest: {rf_cv_scores.mean():.2f}")