# Hands-on Machine Learning with Python: Part-2

> **Presented by - Engr. Mohammad Mamun Hossain,**
> Assistant Professor, Dept. of CSE, BAUST.
> Email:mhossain@baust.edu.bd, Emergency Contact: 01717 690847


# Exploratory Data Analysis (EDA) on Titanic Dataset


## Ignore warnings


In [None]:
import warnings

# Suppress every warning category for the rest of the session so that library
# notices do not clutter the notebook output.
warnings.filterwarnings(action="ignore")

## Import libraries


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.decomposition import PCA

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier


from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier


# Additional models

from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_curve

## Loading and Importing dataset


In [None]:
# Read the Titanic CSV (path is relative to the notebook's working directory)
# into the DataFrame used by every following cell.
titanic_data = pd.read_csv("datasets/titanic.csv")

## EDA : Exploratory Data Analysis


In [None]:
# Structural overview: dimensions, row index, column labels and column count,
# followed by a preview of the first rows.
for summary in (
    titanic_data.shape,
    titanic_data.index,
    titanic_data.columns,
    len(titanic_data.columns),
):
    print(summary)
titanic_data.head()

## General info


In [None]:
# Per-column dtype and non-null count summary.
titanic_data.info()

## Statistical info


In [None]:
# Descriptive statistics of the columns.
titanic_data.describe()

## Specific column info


In [None]:
# Descriptive statistics restricted to the Age column.
age_summary = titanic_data.Age.describe()
print(age_summary)

## Finding null values


In [None]:
# Boolean mask with True wherever a cell is missing (isna is an alias of isnull).
titanic_data.isna()

In [None]:
# Number of missing values per column.
titanic_data.isna().sum()

## Counting


In [None]:
# Frequency of each value in the Sex column.
titanic_data.Sex.value_counts()

In [None]:
# Distinct Sex values, then the frequency counts, as plain lists.
sex_column = titanic_data["Sex"]
print(sex_column.unique().tolist())
print(sex_column.value_counts().tolist())

In [None]:
# Frequency of each value in the Embarked column.
titanic_data.Embarked.value_counts()

In [None]:
# Distinct Embarked values, then their frequency counts, as plain lists.
# Both lines read the Embarked column; the second previously printed the
# counts of the Sex column by mistake.
embarked_column = titanic_data["Embarked"]
print(embarked_column.unique().tolist())
print(embarked_column.value_counts().tolist())

# Data Visualization

with Matplotlib and Seaborn


## Bar Chart


In [None]:
# Bar chart of passenger counts per Survived value.
sb.countplot(data=titanic_data, x="Survived")

In [None]:
# Bar chart of passenger counts per Sex value.
sb.countplot(data=titanic_data, x="Sex")

In [None]:
# Survived counts, with each bar split by Sex.
sb.countplot(data=titanic_data, x="Survived", hue="Sex")

### Error Handled
This error typically arises when a string method (such as startswith()) is called on a numeric value (such as numpy.int64), for example when numeric values are treated as strings.

To resolve this issue, make sure that the 'Sex' and 'Survived' columns are of string or categorical data type. You can convert them to string explicitly using the astype() method if needed:

In [None]:
# Cast the columns used as plot categories to strings so seaborn treats them
# as categorical rather than numeric.
for category_column in ("Sex", "Survived", "Pclass"):
    titanic_data[category_column] = titanic_data[category_column].astype(str)

In [None]:
# Sex counts, with each bar split by Survived, on a 6x4 inch figure.
plt.figure(figsize=(6, 4))
sb.countplot(data=titanic_data, x="Sex", hue="Survived")

In [None]:
# Bar chart of passenger counts per Pclass value.
sb.countplot(data=titanic_data, x="Pclass")

In [None]:
# Survived counts, with each bar split by Pclass, on a 6x4 inch figure.
plt.figure(figsize=(6, 4))
sb.countplot(data=titanic_data, x="Survived", hue="Pclass")

In [None]:
# Pclass counts, with each bar split by Survived.
sb.countplot(data=titanic_data, x="Pclass", hue="Survived")

In [None]:
# Pclass counts, with each bar split by Sex, on a 6x4 inch figure.
# The plot call is no longer wrapped in print(): printing the returned Axes
# only emitted its repr as a stray text line above the chart.
plt.figure(figsize=(6, 4))
sb.countplot(x="Pclass", hue="Sex", data=titanic_data)
plt.show()

In [None]:
# Embarked counts, with each bar split by Survived.
sb.countplot(data=titanic_data, x="Embarked", hue="Survived")

## Scatter plot

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Age against Fare; marker color encodes Survived and marker size encodes Pclass.
scatter_options = {"x": "Age", "y": "Fare", "hue": "Survived", "size": "Pclass"}
sns.scatterplot(data=titanic_data, **scatter_options)
plt.title("Scatter Plot of Age, Fare, and Survival (Size represents Pclass)")
plt.show()

## Count plot

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# One bar per distinct Age value found in titanic_data.
age_axes = sns.countplot(data=titanic_data, x="Age")

# Axis labels and title are set on the Axes returned by seaborn.
age_axes.set_xlabel("Age")
age_axes.set_ylabel("Count")
age_axes.set_title("Countplot of Age in Titanic Data")

plt.show()

In [None]:
# Bar chart of passenger counts per Survived value (default theme and figure size).
sb.countplot(data=titanic_data, x="Survived")

## Box plot

In [None]:
# Distribution of Age for each Sex value, drawn as box plots.
sb.boxplot(data=titanic_data, x="Sex", y="Age")

## Violin plot

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Age distribution per Pclass; each violin is split into its two Survived halves.
violin_options = {
    "x": "Pclass",
    "y": "Age",
    "hue": "Survived",
    "split": True,
    "palette": "muted",
}
sns.violinplot(data=titanic_data, **violin_options)
plt.title("Violin Plot of Age across Pclass and Survived")
plt.show()

## Pie Plot


In [None]:
# Frequency counts of the Sex column, shown as a plain list (only the last
# expression of a cell is displayed).
sex_counts = titanic_data["Sex"].value_counts()
sex_counts.tolist()

In [None]:
# Distinct values of the Sex column as a plain list.
titanic_data.Sex.unique().tolist()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Wedge sizes and labels are taken from the SAME value_counts() result.
# value_counts() orders by frequency while unique() orders by first appearance,
# so pairing the two can attach a label to the wrong wedge.
sex_counts = titanic_data["Sex"].value_counts()
y = sex_counts.tolist()
mylabels = sex_counts.index.tolist()
# Pull out only the first (largest) wedge; one offset is required per wedge.
myexplode = [0.2] + [0] * (len(y) - 1)

plt.pie(y, labels=mylabels, autopct="%1.2f%%", explode=myexplode)
plt.legend(title="Sex:")
plt.title("Distribution of Passenger Sex in Titanic")

plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Wedge sizes and labels are taken from the SAME value_counts() result.
# value_counts() orders by frequency while unique() orders by first appearance,
# so pairing the two can attach a label to the wrong wedge.
survived_counts = titanic_data["Survived"].value_counts()
y = survived_counts.tolist()
mylabels = survived_counts.index.tolist()

plt.pie(y, labels=mylabels, autopct="%1.2f%%")
plt.legend(title="Survived:")
plt.title("Distribution of Passenger Survival in Titanic")
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Wedge sizes and labels are taken from the SAME value_counts() result.
# value_counts() orders by frequency while unique() orders by first appearance,
# so pairing the two can attach a label to the wrong wedge.
pclass_counts = titanic_data["Pclass"].value_counts()
y = pclass_counts.tolist()
mylabels = pclass_counts.index.tolist()

plt.pie(y, labels=mylabels, autopct="%1.2f%%")
plt.legend(title="Pclass:")
plt.title("Distribution of Passenger Classes in Titanic")
plt.show()

## Pair Plot


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Select relevant columns
columns_to_visualize = ["Age", "Fare", "Pclass", "Survived"]

# Work on a copy so the shared DataFrame is left untouched. Pclass is cast to
# str in an earlier cell, and pairplot only draws numeric variables, so it is
# converted back to numbers here; otherwise it would be missing from the grid.
pair_data = titanic_data[columns_to_visualize].copy()
pair_data["Pclass"] = pd.to_numeric(pair_data["Pclass"])

# Pair plot of selected columns, colored by Survived
sns.pairplot(pair_data, hue="Survived", palette="Set1")
# y > 1 lifts the figure title above the top row of panels.
plt.suptitle("Pair Plot of Age, Fare, Pclass, and Survived", y=1.02)
plt.show()

## Heatmap


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Select relevant columns
columns_to_visualize = ["Age", "Fare", "Pclass", "Survived"]

# Survived and Pclass are cast to str in an earlier cell. Convert every selected
# column to a numeric dtype first so that all four take part in the correlation
# regardless of how the installed pandas version treats object columns.
numeric_view = titanic_data[columns_to_visualize].apply(pd.to_numeric)

# Calculate correlation matrix
correlation_matrix = numeric_view.corr()

# Heatmap of the correlation matrix, with the coefficient written in each cell
sns.heatmap(correlation_matrix, annot=True)
plt.title("Correlation Heatmap of Age, Fare, Pclass, and Survived")
plt.show()

# Data PreProcessing

## Examining and Handling Missing Values


In [None]:
import pandas as pd
# Re-read the CSV so preprocessing starts from the raw data; this rebinds
# titanic_data and discards the astype(str) conversions made for plotting.
titanic_data = pd.read_csv("datasets/titanic.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
titanic_data.head()

In [None]:
# Number of missing values per column before any cleaning.
titanic_data.isna().sum()

## Dropping unneeded columns

`inplace=True` modifies the original DataFrame instead of returning a new one.


In [None]:
# Remove the Name, Ticket and Cabin columns from the frame in place.
unused_columns = ["Name", "Ticket", "Cabin"]
titanic_data.drop(columns=unused_columns, inplace=True)

In [None]:
# Structural overview after the drop: dimensions, row index, column labels and
# column count, followed by a preview of the first rows.
for summary in (
    titanic_data.shape,
    titanic_data.index,
    titanic_data.columns,
    len(titanic_data.columns),
):
    print(summary)
titanic_data.head()

In [None]:
# Number of missing values per remaining column.
titanic_data.isna().sum()

## Filling Missing values


### Mean


In [None]:
# Average of the Age column; used below as the fill value for missing ages.
mean = titanic_data.Age.mean()
mean

In [None]:
# Replace missing Age entries with the mean computed above, then re-count the
# missing values per column.
filled_age = titanic_data["Age"].fillna(value=mean)
titanic_data["Age"] = filled_age
titanic_data.isna().sum()

### Mode


In [None]:
# Most frequent Embarked value (first entry of mode(), which can return ties).
embarked_modes = titanic_data["Embarked"].mode()
mod = embarked_modes.iloc[0]
mod

In [None]:
# Replace missing Embarked entries with the mode computed above, then re-count
# the missing values per column.
filled_embarked = titanic_data["Embarked"].fillna(value=mod)
titanic_data["Embarked"] = filled_embarked

titanic_data.isna().sum()

## Feature Engineering


### Converting string to integer
le = LabelEncoder():
we create an instance of the LabelEncoder class and store it in the variable le. This instance will be used to transform the categorical variables into numerical ones.

titanic_data.Sex = le.fit_transform(titanic_data.Sex):
This line encodes the "Sex" column of the DataFrame titanic_data. The fit_transform() method of the LabelEncoder instance le fits the encoder to the unique values in the "Sex" column and then transforms those values into numerical representations. The transformed values are assigned back to the "Sex" column of the DataFrame.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the Sex strings to integer codes. Embarked is not label-encoded here; it
# is one-hot encoded in the following cells instead.
le = LabelEncoder()
sex_codes = le.fit_transform(titanic_data["Sex"])
titanic_data["Sex"] = sex_codes

titanic_data.head()

### One-Hot encoding
titanic_data = pd.get_dummies(titanic_data, columns=cols)
This line applies one-hot encoding to the specified columns in the titanic_data DataFrame using pd.get_dummies(). It creates new binary (0 or 1) columns for each unique value in the specified column(s). The original "Embarked" column is replaced with these new columns representing each category present in the original column.

In [None]:
# Replace the Embarked column with one indicator column per category.
one_hot_columns = ["Embarked"]
titanic_data = pd.get_dummies(data=titanic_data, columns=one_hot_columns)
titanic_data.head()

In [None]:
# Turn the Embarked_C / Embarked_Q indicator columns into plain 0/1 integers.
# A direct integer cast is used instead of LabelEncoder: refitting `le` here
# would discard the Sex mapping it learned earlier, and LabelEncoder would map
# a column containing only one value to 0 whatever that value is.
# Embarked_S is left untouched, as before.
for dummy_column in ("Embarked_C", "Embarked_Q"):
    titanic_data[dummy_column] = titanic_data[dummy_column].astype(int)

titanic_data.head()

## Feature Selection
With Domain Expertise


In [None]:
# Remove the Embarked_S and PassengerId columns from the frame in place.
redundant_columns = ["Embarked_S", "PassengerId"]
titanic_data.drop(columns=redundant_columns, inplace=True)
titanic_data.head()

## Train Test Split


In [None]:
# Target vector: the Survived column that the models will predict.
y = titanic_data.Survived
print(y)
print(y.shape)

In [None]:
# Feature matrix: every column except the Survived target.
x = titanic_data.drop(columns="Survived")
print(x)
print(x.shape)

## Standardization/Normalization


### Data Transformation

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature; x is rebound to the array returned by the scaler.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics influence the scaling. Later cells index `x`
# positionally, so any change here needs to keep `x` an array.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(x)
x = scaled_features
print(x)
print(x.shape)

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split

holdout_split = train_test_split(x, y, random_state=0, test_size=0.2)
x_train, x_test, y_train, y_test = holdout_split

In [None]:
# Dimensions of each part of the split, in the order train X, test X, train y, test y.
for split_part in (x_train, x_test, y_train, y_test):
    print(split_part.shape)

# Model Building


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier

# Additional models
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier

## Prediction

In [None]:
# Fit a random forest on the training split and score it on the held-out split.
# A fixed random_state makes the reported accuracy repeatable between runs.
model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
# scikit-learn metrics take (y_true, y_pred) in that order.
print(accuracy_score(y_test, y_pred))

# Model Evaluation


## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# confusion_matrix expects (y_true, y_pred): rows are the true labels and
# columns the predicted labels. Passing them the other way round transposes
# the matrix and swaps false positives with false negatives.
cm = confusion_matrix(y_test, y_pred)
cm

## Heatmap

In [None]:
# Draw `cm` as an annotated heatmap with integer-formatted cell values.
# NOTE(review): the tick labels assume rows of `cm` are true labels and
# columns are predictions - confirm against how `cm` was built.
column_ticks = ["Predicted 0", "Predicted 1"]
row_ticks = ["Actual 0", "Actual 1"]
sns.heatmap(
    cm,
    cmap="Blues",
    fmt="g",
    annot=True,
    yticklabels=row_ticks,
    xticklabels=column_ticks,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.show()

## Report

In [None]:
# classification_report expects (y_true, y_pred); with the arguments reversed
# the precision and recall columns are exchanged.
cr = classification_report(y_test, y_pred)
print(cr)

## AUC ROC

In [None]:
from sklearn.metrics import roc_curve, auc

# A ROC curve needs a continuous score to sweep a threshold over. Hard 0/1
# predictions give only a single operating point, so the predicted probability
# of the positive class (second column of predict_proba) is used instead.
y_score = model.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)

# Calculate the Area Under the Curve (AUC)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve together with the chance diagonal; the legend is needed
# for the AUC label to be shown.
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--", label="Random")
plt.xlabel("False Positive Rate (1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity)")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()

# Model Improvement


## Cross Validation CV
When adjusting models, we aim to increase overall model performance on unseen data. Hyperparameter tuning can lead to much better performance on the test set. However, optimizing parameters against the test set leaks information from it into the model, causing the model to perform worse on truly unseen data. To correct for this, we can perform cross-validation.

Performing cross-validation on a dataset involves splitting the dataset into multiple subsets (folds), training the model on a subset of the data, and evaluating its performance on the remaining subset. This process is repeated multiple times, with different subsets used for training and evaluation each time.

<br>

Initialize Stratified K-Fold Cross-Validation:

stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

Here, StratifiedKFold is initialized with 10 folds (n_splits=10). Setting shuffle=True shuffles the data before splitting, and random_state=42 ensures reproducibility.

Perform Stratified Cross-Validation:

cv_results = cross_val_score(classifier, X, y, cv=stratified_kfold, scoring="accuracy")

This line executes stratified cross-validation using cross_val_score. It takes the classifier (classifier), feature matrix (X), target vector (y), cross-validation strategy (cv=stratified_kfold), and scoring metric (scoring="accuracy").

In [None]:
from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [None]:
# Estimator under evaluation (swap in any other classifier here).
classifier = RandomForestClassifier(random_state=42, n_estimators=100)

# 10 stratified folds, shuffled with a fixed seed so the folds are repeatable.
stratified_kfold = StratifiedKFold(random_state=42, shuffle=True, n_splits=10)

# One accuracy score per fold.
cv_results = cross_val_score(
    estimator=classifier, X=x, y=y, scoring="accuracy", cv=stratified_kfold
)

# Per-fold scores followed by their mean.
print("Cross-validation results:")
print(cv_results)
print(f"Mean accuracy: {cv_results.mean():.2f}")

In [None]:
# Accumulator for the summed per-fold confusion matrices (integer counts).
overall_conf_matrix = np.zeros((2, 2), dtype=int)

# Refit the classifier on each stratified fold and add up the confusion matrices.
for train_index, test_index in stratified_kfold.split(x, y):
    # Fold-local names keep the earlier hold-out split (x_train, x_test,
    # y_train, y_test, y_pred) intact for the cells that follow.
    fold_x_train, fold_x_test = x[train_index], x[test_index]
    # split() yields positional indices, so the Series must be indexed with
    # .iloc; plain y[...] is label-based and only works by coincidence when the
    # index happens to be 0..n-1.
    fold_y_train, fold_y_test = y.iloc[train_index], y.iloc[test_index]

    classifier.fit(fold_x_train, fold_y_train)
    fold_pred = classifier.predict(fold_x_test)

    # labels=[0, 1] pins the matrix to 2x2 even if a fold lacks one class.
    overall_conf_matrix += confusion_matrix(fold_y_test, fold_pred, labels=[0, 1])

# Print the overall confusion matrix
print("Overall Confusion Matrix:")
print(overall_conf_matrix)

In [None]:
from sklearn.metrics import roc_curve, auc

# Pooled out-of-fold true labels and positive-class probabilities.
true_labels = []
predicted_probabilities = []

# Refit the classifier on each stratified fold and collect its probabilities
# for the rows it did not see during fitting.
for train_index, test_index in stratified_kfold.split(x, y):
    # Fold-local names keep the earlier hold-out split (x_train, x_test,
    # y_train, y_test) intact for the cells that follow.
    fold_x_train, fold_x_test = x[train_index], x[test_index]
    # split() yields positional indices, so the Series must be indexed with
    # .iloc; plain y[...] is label-based and only works by coincidence when the
    # index happens to be 0..n-1.
    fold_y_train, fold_y_test = y.iloc[train_index], y.iloc[test_index]

    classifier.fit(fold_x_train, fold_y_train)
    # Second column of predict_proba is the probability of the positive class.
    fold_prob = classifier.predict_proba(fold_x_test)[:, 1]

    true_labels.extend(fold_y_test)
    predicted_probabilities.extend(fold_prob)

# Calculate the ROC curve over the pooled out-of-fold predictions
fpr, tpr, thresholds = roc_curve(true_labels, predicted_probabilities)

# Calculate the Area Under the Curve (AUC)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve together with the chance diagonal
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--", label="Random")
plt.xlabel("False Positive Rate (1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity)")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()

## Ensemble Method

Ensemble methods are techniques that aim at improving the accuracy of results in models by combining multiple models instead of using a single model. The combined models increase the accuracy of the results significantly. This has boosted the popularity of ensemble methods in machine learning.

> Note

1. Bagging
Bagging, the short form for bootstrap aggregating, is mainly applied in classification and regression. It increases the accuracy of models, typically decision trees, by reducing variance to a large extent. The reduction of variance increases accuracy and mitigates overfitting, which is a challenge to many predictive models.

Bagging consists of two steps: bootstrapping and aggregation. Bootstrapping is a sampling technique where samples are drawn from the whole population (set) with replacement. Sampling with replacement helps randomize the selection procedure. The base learning algorithm is then run on each sample to complete the procedure.

Aggregation in bagging is done to incorporate all possible outcomes of the prediction and randomize the outcome. Without aggregation, predictions will not be accurate because all outcomes are not put into consideration. Therefore, the aggregation is based on the probability bootstrapping procedures or on the basis of all outcomes of the predictive models.

Bagging is advantageous since weak base learners are combined to form a single strong learner that is more stable than single learners. It also reduces variance, thereby reducing the overfitting of models. One limitation of bagging is that it is computationally expensive. It can also lead to more bias in models when the proper procedure of bagging is ignored.

2. Boosting
Boosting is an ensemble technique that learns from previous predictor mistakes to make better predictions in the future. The technique combines several weak base learners to form one strong learner, thus significantly improving the predictability of models. Boosting works by arranging weak learners in a sequence, such that each learner learns from the mistakes of the previous learner in the sequence to create better predictive models.

Boosting takes many forms, including gradient boosting, Adaptive Boosting (AdaBoost), and XGBoost (Extreme Gradient Boosting). AdaBoost uses weak learners in the form of decision trees, which mostly include one split that is popularly known as decision stumps. AdaBoost’s main decision stump comprises observations carrying similar weights.

Gradient boosting adds predictors sequentially to the ensemble, where each new predictor corrects the errors of its predecessors, thereby increasing the model’s accuracy. New predictors are fit to counter the effects of errors in the previous predictors. The gradient of the loss function helps the gradient booster identify problems in learners’ predictions and counter them accordingly.

XGBoost makes use of decision trees with boosted gradient, providing improved speed and performance. It relies heavily on the computational speed and the performance of the target model. Model training should follow a sequence, thus making the implementation of gradient boosted machines slow.

In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

> Note

VotingClassifier: This is a class in scikit-learn's ensemble module used for combining the predictions of multiple classifiers. It takes several base classifiers (estimators) and combines their individual predictions to make a final prediction.

estimators: This parameter specifies a list of (name, estimator) tuples, where each tuple contains the name of the classifier and the classifier instance. These classifiers are the individual models that will be combined by the voting classifier.

voting: This parameter specifies the type of voting strategy to use. When set to "hard", the predicted class labels are based on the majority vote among the classifiers. Alternatively, you can set it to "soft", where the predicted class labels are based on the average of predicted probabilities from the classifiers.

In [None]:
# Create individual classifiers
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
logistic_classifier = LogisticRegression(random_state=42)
svm_classifier = SVC(probability=True, random_state=42)
knn_classifier = KNeighborsClassifier()

# Create a list of tuples, where each tuple contains a name for the classifier and the classifier instance
classifiers = [
    ("Random Forest", rf_classifier),
    ("Logistic Regression", logistic_classifier),
    ("SVM", svm_classifier),
    ("KNN", knn_classifier),
]

# Create a VotingClassifier that takes the majority vote of the predicted labels
voting_classifier = VotingClassifier(
    estimators=classifiers, voting="hard"
)  # You can use 'soft' for probabilities

# Now you can fit and use the voting_classifier as a regular classifier
voting_classifier.fit(x_train, y_train)
predictions = voting_classifier.predict(x_test)
# Score the ensemble's own predictions; `y_pred` belongs to an earlier model.
print(accuracy_score(y_test, predictions))

## Principal Component Analysis PCA


In [None]:
from sklearn.decomposition import PCA

# n_components=None (the default) keeps every component; pass an integer such
# as 1 or 2 to reduce the dimensionality instead.
pca = PCA(n_components=None)
# The projection is learned from the training rows only and then applied
# unchanged to the test rows. Both variables are overwritten.
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)
print(x_train)

In [None]:
# Fit a fresh random forest on the PCA-transformed features and report its
# accuracy on the transformed test rows.
model = RandomForestClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
pca_accuracy = accuracy_score(y_pred, y_test)
print(pca_accuracy)