# Exercise 4: Probability densities and data visualization

Objective: Upon completing this exercise it is expected that you:

• Understand the univariate and multivariate normal distribution.

• Get an understanding of the many ways data can be visualized, including histograms, boxplots, and scatter plots.

### 4.1 Understanding the Univariate and Multivariate Normal Distribution

#### 4.1.1 Generation of random numbers following standard distribution

In [None]:
# exercise 4.1.1

import numpy as np
from matplotlib.pyplot import figure, hist, plot, show, subplot, title

# Number of samples
N = 200

# Mean
mu = 17

# Standard deviation
s = 2

# Number of bins in histogram
nbins = 40

# Generate samples from the Normal distribution
# (equivalently: X = np.random.randn(N) * s + mu)
X = np.random.normal(mu, s, N)

# Plot the samples and histogram side by side.
# The heading is set on the figure itself: calling title() before any
# subplot exists would attach it to an implicit axes instead of the panels.
# Both panels use the same 1x2 grid.
fig = figure(figsize=(12, 4))
fig.suptitle("Normal distribution")
subplot(1, 2, 1)
plot(X, ".")
subplot(1, 2, 2)
hist(X, bins=nbins)
show()

print("Ran Exercise 4.1.1")

#### 4.1.2 Comparison of theoretical to actual statistical measures

In [None]:
# exercise 4.1.2

import numpy as np
from matplotlib.pyplot import figure, hist, plot, show, subplot, title

# Number of samples
N = 200

# Mean
mu = 17

# Standard deviation
s = 2

# Number of bins in histogram
nbins = 20

# Generate samples from the Normal distribution
# (equivalently: X = np.random.randn(N) * s + mu)
X = np.random.normal(mu, s, N)

# Plot the samples and histogram side by side.
# The heading is set on the figure itself: calling title() before any
# subplot exists would attach it to an implicit axes instead of the panels.
fig = figure()
fig.suptitle("Normal distribution")
subplot(1, 2, 1)
plot(X, "x")
subplot(1, 2, 2)
hist(X, bins=nbins)

# Compute empirical mean and standard deviation
# (ddof=1 selects the N-1 normalisation of the sample estimate)
mu_ = X.mean()
s_ = X.std(ddof=1)

print("Theoretical mean: ", mu)
print("Theoretical std.dev.: ", s)
print("Empirical mean: ", mu_)
print("Empirical std.dev.: ", s_)

show()

print("Ran Exercise 4.1.2")

#### 4.1.3 Theoretical probability density function

In [None]:
# exercise 4.1.3

import numpy as np
from matplotlib.pyplot import figure, hist, plot, show, subplot, title
from scipy import stats

# Number of samples
N = 500

# Mean
mu = 17

# Standard deviation
s = 2

# Number of bins in histogram
nbins = 20

# Generate samples from the Normal distribution
# (equivalently: X = np.random.randn(N) * s + mu)
X = np.random.normal(mu, s, N)

# Plot the histogram, normalised so that it integrates to one and can be
# compared directly with a probability density
f = figure()
title("Normal distribution")
hist(X, bins=nbins, density=True)

# Over the histogram, plot the theoretical probability density function.
# It is parameterised by mu and s so that it always matches the distribution
# the samples were drawn from.
x = np.linspace(X.min(), X.max(), 1000)
pdf = stats.norm.pdf(x, loc=mu, scale=s)
plot(x, pdf, ".", color="red")

# Compute empirical mean and standard deviation
mu_ = X.mean()
s_ = X.std(ddof=1)

print("Theoretical mean: ", mu)
print("Theoretical std.dev.: ", s)
print("Empirical mean: ", mu_)
print("Empirical std.dev.: ", s_)

show()

print("Ran Exercise 4.1.3")

![image.png](attachment:image.png)

![image.png](attachment:image.png)

#### 4.1.4 

In [None]:
# exercise 4.1.4

import numpy as np

# How many observations to draw
N = 1000

# Parameters of the 2-D normal distribution:
# mean vector and (symmetric) covariance matrix
mu = np.array([13, 17])
S = np.array(
    [
        [4, 3],
        [3, 9],
    ]
)

# Draw the observations; each row of X is one 2-D sample
X = np.random.multivariate_normal(mean=mu, cov=S, size=N)

print("Ran Exercise 4.1.4")

In [None]:
# Display the dimensions of X as left by the most recently run cell
# (presumably the samples from exercise 4.1.4 -- verify the execution order).
X.shape

#### 4.1.5 Correlation

##### Correlation between x1 and x2 = 0.5

In [None]:
# exercise 4.1.5

import numpy as np
from matplotlib.pyplot import (
    cm,
    colorbar,
    figure,
    hist,
    imshow,
    plot,
    show,
    subplot,
    suptitle,
    title,
    xlabel,
    xticks,
    ylabel,
    yticks,
)

# Number of samples
N = 1000

# Standard deviation of x1
s1 = 2

# Standard deviation of x2
s2 = 3

# Correlation between x1 and x2
corr = 0.5

# Covariance matrix built from the standard deviations and the correlation.
# A plain ndarray is used; NumPy no longer recommends the np.matrix class.
S = np.array([[s1 * s1, corr * s1 * s2], [corr * s1 * s2, s2 * s2]])

# Mean
mu = np.array([13, 17])

# Number of bins (per axis) in the 2-D histogram
nbins = 20

# Generate samples from multivariate normal distribution
X = np.random.multivariate_normal(mu, S, N)


# Left panel: scatter plot of the data
figure(figsize=(12, 8))
suptitle("2-D Normal distribution")

subplot(1, 2, 1)
plot(X[:, 0], X[:, 1], "x")
xlabel("x1")
ylabel("x2")
title("Scatter plot of data")

# Right panel: 2-D histogram shown as an image.
# np.histogram2d bins its first argument along the first array axis, whereas
# imshow draws the first array axis vertically. The counts are therefore
# transposed so that x1 runs horizontally, as the axis labels state.
subplot(1, 2, 2)
x = np.histogram2d(X[:, 0], X[:, 1], nbins)
imshow(x[0].T, cmap=cm.gray_r, interpolation="None", origin="lower")
colorbar()
xlabel("x1")
ylabel("x2")
xticks([])
yticks([])
title("2D histogram")

show()

print("Ran Exercise 4.1.5")

##### Correlation between x1 and x2 = 0

In [None]:
# exercise 4.1.5

import numpy as np
from matplotlib.pyplot import (
    cm,
    colorbar,
    figure,
    hist,
    imshow,
    plot,
    show,
    subplot,
    suptitle,
    title,
    xlabel,
    xticks,
    ylabel,
    yticks,
)

# Number of samples
N = 1000

# Standard deviation of x1
s1 = 2

# Standard deviation of x2
s2 = 3

# Correlation between x1 and x2
corr = 0

# Covariance matrix built from the standard deviations and the correlation.
# A plain ndarray is used; NumPy no longer recommends the np.matrix class.
S = np.array([[s1 * s1, corr * s1 * s2], [corr * s1 * s2, s2 * s2]])

# Mean
mu = np.array([13, 17])

# Number of bins (per axis) in the 2-D histogram
nbins = 20

# Generate samples from multivariate normal distribution
X = np.random.multivariate_normal(mu, S, N)


# Left panel: scatter plot of the data
figure(figsize=(12, 8))
suptitle("2-D Normal distribution")

subplot(1, 2, 1)
plot(X[:, 0], X[:, 1], "x")
xlabel("x1")
ylabel("x2")
title("Scatter plot of data")

# Right panel: 2-D histogram shown as an image.
# np.histogram2d bins its first argument along the first array axis, whereas
# imshow draws the first array axis vertically. The counts are therefore
# transposed so that x1 runs horizontally, as the axis labels state.
subplot(1, 2, 2)
x = np.histogram2d(X[:, 0], X[:, 1], nbins)
imshow(x[0].T, cmap=cm.gray_r, interpolation="None", origin="lower")
colorbar()
xlabel("x1")
ylabel("x2")
xticks([])
yticks([])
title("2D histogram")

show()

print("Ran Exercise 4.1.5")

Explanation:

When the correlation between x1 and x2 is zero, the two variables are independent, because they are jointly normally distributed (for other distributions, zero correlation alone does not guarantee independence). In a scatter plot, the points form an axis-aligned elliptical cloud: each variable varies around its own mean, but there is no linear relationship between them.

-> Zero correlation implies that x1 and x2 do not co-vary, so their joint distribution is symmetric about the vertical and horizontal lines through the mean (13, 17), leading to an axis-aligned ellipse in the scatter plot.

##### Correlation between x1 and x2 = 1

In [None]:
# exercise 4.1.5

import numpy as np
from matplotlib.pyplot import (
    cm,
    colorbar,
    figure,
    hist,
    imshow,
    plot,
    show,
    subplot,
    suptitle,
    title,
    xlabel,
    xticks,
    ylabel,
    yticks,
)

# Number of samples
N = 1000

# Standard deviation of x1
s1 = 2

# Standard deviation of x2
s2 = 3

# Correlation between x1 and x2
corr = 1

# Covariance matrix built from the standard deviations and the correlation.
# A plain ndarray is used; NumPy no longer recommends the np.matrix class.
# With corr = 1 the two rows are proportional, i.e. the matrix is singular.
S = np.array([[s1 * s1, corr * s1 * s2], [corr * s1 * s2, s2 * s2]])

# Mean
mu = np.array([13, 17])

# Number of bins (per axis) in the 2-D histogram
nbins = 20

# Generate samples from multivariate normal distribution
X = np.random.multivariate_normal(mu, S, N)


# Left panel: scatter plot of the data
figure(figsize=(12, 8))
suptitle("2-D Normal distribution")

subplot(1, 2, 1)
plot(X[:, 0], X[:, 1], "x")
xlabel("x1")
ylabel("x2")
title("Scatter plot of data")

# Right panel: 2-D histogram shown as an image.
# np.histogram2d bins its first argument along the first array axis, whereas
# imshow draws the first array axis vertically. The counts are therefore
# transposed so that x1 runs horizontally, as the axis labels state.
subplot(1, 2, 2)
x = np.histogram2d(X[:, 0], X[:, 1], nbins)
imshow(x[0].T, cmap=cm.gray_r, interpolation="None", origin="lower")
colorbar()
xlabel("x1")
ylabel("x2")
xticks([])
yticks([])
title("2D histogram")

show()

print("Ran Exercise 4.1.5")

Explanation:

When the correlation between x1 and x2 is one, there is a perfect linear relationship between the two variables. This means that x2 can be perfectly predicted by x1.

-> A correlation of one indicates that all the points lie on a straight line, as any change in x1 results in a proportional and exact change in x2. This relationship is represented by points forming a diagonal line on the scatter plot.

#### 4.1.6 Mean vs standard deviation

In [None]:
# exercise 4.1.6

import importlib_resources
import numpy as np
import scipy.linalg as linalg
from matplotlib.pyplot import cm, figure, imshow, show, subplot, title, xticks, yticks
from scipy.io import loadmat

# Path of the digit data file shipped inside the dtuimldmtools package
filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat")

# Digits to keep in the analysis (use n = range(10) to keep all of them)
n = [3]

# Read the Matlab file into a dict and pull out the training data.
# Column 0 is used as the class label, the remaining columns as attributes.
traindata = loadmat(filename)["traindata"]
y = traindata[:, 0]
X = traindata[:, 1:]
N, M = X.shape
C = len(n)

# Keep only the observations whose label is one of the requested digits
class_mask = np.isin(y, n)
X = X[class_mask, :]
y = y[class_mask]
N = X.shape[0]

# Per-attribute mean and standard deviation, plus the full covariance matrix
mu = X.mean(axis=0)
s = X.std(ddof=1, axis=0)
S = np.cov(X, rowvar=0, ddof=1)

# Show both statistics as 16x16 images next to each other
figure()
panels = [(mu, "Mean"), (s, "Standard deviation")]
for position, (values, heading) in enumerate(panels, start=1):
    subplot(1, 2, position)
    I = np.reshape(values, (16, 16))
    imshow(I, cmap=cm.gray_r)
    title(heading)
    xticks([])
    yticks([])

show()

print("Ran Exercise 4.1.6")

### Explanation:

1. **Mean Image**:
   - The mean image represents the average intensity of each pixel across all images of the selected digit (`n = [3]` in the code above, i.e. the digit "3"). As expected, it shows a digit-like shape that is blurred, as it is an aggregate of many handwritten samples. Since every instance of the digit is slightly different (due to variations in handwriting), the average image is a smoothed version of a typical "3".

2. **Standard Deviation Image**:
   - The standard deviation image shows the variability of the pixel intensities across different instances of the digit "3". The edges of the digit have the highest standard deviation because this is where handwriting variation is most pronounced. People write digits differently, causing the shape's boundaries to shift slightly, leading to higher variability along the edges.
   - The center of the digit tends to have lower standard deviation because this part of the digit is more consistently filled (or blank) across different samples.

### Why This Happens:
- **Higher variation along edges**: The edges of the digit have higher standard deviation because the exact boundaries of where the digit starts and ends vary from sample to sample. Some people write more slanted or curved digits, which shifts the edge positions.
- **Lower variation in the middle**: The middle of the digit often has either a solid fill or consistent absence of ink (depending on the digit), so there’s less variability in the pixel values in these areas.


#### 4.1.7

In [None]:
# exercise 4.1.7

import importlib_resources
import numpy as np
from matplotlib.pyplot import cm, figure, imshow, show, subplot, title, xticks, yticks
from scipy.io import loadmat

# Path of the digit data file shipped inside the dtuimldmtools package
filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat")

# Digits to keep in the analysis (use n = range(10) to keep all of them)
n = [7]

# How many artificial digits to draw from each model
ngen = 10

# Read the Matlab file into a dict and pull out the training data.
# Column 0 is used as the class label, the remaining columns as attributes.
traindata = loadmat(filename)["traindata"]
y = traindata[:, 0]
X = traindata[:, 1:]
N, M = X.shape
C = len(n)

# Keep only the observations whose label is one of the requested digits
class_mask = np.isin(y, n)
X = X[class_mask, :]
y = y[class_mask]
N = X.shape[0]

# Per-attribute mean and standard deviation, plus the full covariance matrix
mu = X.mean(axis=0)
s = X.std(ddof=1, axis=0)
S = np.cov(X, rowvar=0, ddof=1)

# Model 1: every pixel drawn independently from its own 1-D normal.
# Standard-normal noise is scaled and shifted row-wise via broadcasting.
Xgen = np.random.randn(ngen, 256) * s + mu

# Model 2: all pixels drawn jointly from one multivariate normal.
# Note if you are investigating a single class, then you may get:
# """RuntimeWarning: covariance is not positive-semidefinite."""
# which in general is troublesome, but here is due to numerical imprecision.
Xmvgen = np.random.multivariate_normal(mu, S, ngen)

# One figure per model, two rows of images each; the heading is placed on
# the second image of the figure
n_cols = int(np.ceil(ngen / 2.0))
for samples, heading in (
    (Xgen, "Digits: 1-D Normal"),
    (Xmvgen, "Digits: Multivariate Normal"),
):
    figure()
    for k, pixels in enumerate(samples):
        subplot(2, n_cols, k + 1)
        I = np.reshape(pixels, (16, 16))
        imshow(I, cmap=cm.gray_r)
        xticks([])
        yticks([])
        if k == 1:
            title(heading)

show()

print("Ran Exercise 4.1.7")

### Why the Multivariate Normal Model Results in Better Images:

- **Correlated Pixels**: The multivariate normal distribution captures the correlations between pixels, which is essential for maintaining the structure of the digit. Neighboring pixels are often part of the same line or curve, and by accounting for these correlations, the generated images appear more coherent.

- **Global Structure**: The 1-D normal distribution treats each pixel independently, leading to noisy and disjointed images. In contrast, the multivariate normal model preserves the overall structure of the digit by modeling pixel relationships, resulting in clearer, more realistic images.

Thus, the multivariate normal model produces better results because it captures the inherent relationships between pixels, making the generated digits more recognizable.

#### 4.1.8 Varying the Number of Observations

In [None]:
import importlib_resources
import numpy as np
from matplotlib.pyplot import cm, figure, imshow, show, subplot, title, xticks, yticks
from scipy.io import loadmat

# Load Matlab data file to python dict structure and extract variables of interest
filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat")
traindata = loadmat(filename)["traindata"]
X = traindata[:, 1:]
y = traindata[:, 0]
N, M = np.shape(X)
n = [7]  # Digits to include in analysis (you can change this)
ngen = 10  # Number of digits to generate from normal distributions

# Remove digits that are not to be inspected
class_mask = np.isin(y, n)
X = X[class_mask, :]
y = y[class_mask]
N = np.shape(X)[0]

# Number of observations requested for estimating mean and covariance.
# Change this value to try different numbers of observations.
n_requested = 1000

# A slice past the end of the data is silently truncated, so the request is
# capped at the number of rows actually available. This keeps the count
# reported at the end of the cell truthful.
n_observations = min(n_requested, N)
if n_observations < n_requested:
    print("Only", N, "observations available; requested", n_requested)
X_subsample = X[:n_observations, :]

# Estimate mean and covariance from the subset of observations
mu = X_subsample.mean(axis=0)
s = X_subsample.std(ddof=1, axis=0)
S = np.cov(X_subsample, rowvar=0, ddof=1)

# Generate samples from independent 1-D normal distributions (one per
# attribute); M is the number of attributes read from the data
Xgen = np.random.randn(ngen, M) * s + mu

# Plot images for 1-D normal
figure()
for k in range(ngen):
    subplot(2, int(np.ceil(ngen / 2.0)), k + 1)
    I = np.reshape(Xgen[k, :], (16, 16))
    imshow(I, cmap=cm.gray_r)
    xticks([])
    yticks([])
    if k == 1:
        title("Digits: 1-D Normal")

# Generate samples from the multivariate normal distribution
Xmvgen = np.random.multivariate_normal(mu, S, ngen)

# Plot images for multivariate normal
figure()
for k in range(ngen):
    subplot(2, int(np.ceil(ngen / 2.0)), k + 1)
    I = np.reshape(Xmvgen[k, :], (16, 16))
    imshow(I, cmap=cm.gray_r)
    xticks([])
    yticks([])
    if k == 1:
        title("Digits: Multivariate Normal")

show()

print("Exercise 4.1.8 with", n_observations, "observations used")

In [None]:
import importlib_resources
import numpy as np
from matplotlib.pyplot import cm, figure, imshow, show, subplot, title, xticks, yticks
from scipy.io import loadmat

# Load Matlab data file to python dict structure and extract variables of interest
filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat")
traindata = loadmat(filename)["traindata"]
X = traindata[:, 1:]
y = traindata[:, 0]
N, M = np.shape(X)
n = [7]  # Digits to include in analysis (you can change this)
ngen = 10  # Number of digits to generate from normal distributions

# Remove digits that are not to be inspected
class_mask = np.isin(y, n)
X = X[class_mask, :]
y = y[class_mask]
N = np.shape(X)[0]

# Number of observations requested for estimating mean and covariance.
# Change this value to try different numbers of observations.
n_requested = 200

# A slice past the end of the data is silently truncated, so the request is
# capped at the number of rows actually available. This keeps the count
# reported at the end of the cell truthful.
n_observations = min(n_requested, N)
if n_observations < n_requested:
    print("Only", N, "observations available; requested", n_requested)
X_subsample = X[:n_observations, :]

# Estimate mean and covariance from the subset of observations
mu = X_subsample.mean(axis=0)
s = X_subsample.std(ddof=1, axis=0)
S = np.cov(X_subsample, rowvar=0, ddof=1)

# Generate samples from independent 1-D normal distributions (one per
# attribute); M is the number of attributes read from the data
Xgen = np.random.randn(ngen, M) * s + mu

# Plot images for 1-D normal
figure()
for k in range(ngen):
    subplot(2, int(np.ceil(ngen / 2.0)), k + 1)
    I = np.reshape(Xgen[k, :], (16, 16))
    imshow(I, cmap=cm.gray_r)
    xticks([])
    yticks([])
    if k == 1:
        title("Digits: 1-D Normal")

# Generate samples from the multivariate normal distribution
Xmvgen = np.random.multivariate_normal(mu, S, ngen)

# Plot images for multivariate normal
figure()
for k in range(ngen):
    subplot(2, int(np.ceil(ngen / 2.0)), k + 1)
    I = np.reshape(Xmvgen[k, :], (16, 16))
    imshow(I, cmap=cm.gray_r)
    xticks([])
    yticks([])
    if k == 1:
        title("Digits: Multivariate Normal")

show()

print("Exercise 4.1.8 with", n_observations, "observations used")

By varying the number of observations used to estimate the mean and covariance, we can observe changes in the quality of the generated digits. When using a smaller number of observations (e.g., 50), the mean and covariance may not fully capture the variability and structure of the digit, leading to lower-quality generated images. 

As the number of observations increases, the estimates become more accurate, resulting in better representations of the digits in both the 1-D and multivariate normal cases.

In particular:
- **Few observations**: The generated images may appear more noisy or distorted due to insufficient data to accurately estimate pixel relationships.
- **More observations**: The quality improves as the mean and covariance estimates become more reliable, capturing the inherent structure of the digits more accurately.


### 4.2 Visualizing Fisher’s Iris data

In [None]:
# exercise 4.2.1

import importlib_resources
import numpy as np
import xlrd

# Path of the Iris spreadsheet shipped inside the dtuimldmtools package
filename = importlib_resources.files("dtuimldmtools").joinpath("data/iris.xls")

# Open the workbook and work on its first sheet
workbook = xlrd.open_workbook(filename)
doc = workbook.sheet_by_index(0)

# Attribute names: the first four cells of the header row
attributeNames = doc.row_values(0, 0, 4)

# Class labels: fifth column, rows 1 to 150. Every distinct label is mapped
# to an integer code following the alphabetical order of the names.
classLabels = doc.col_values(4, 1, 151)
classNames = sorted(set(classLabels))
classDict = {name: code for code, name in enumerate(classNames)}

# Class index vector y: one integer code per observation
y = np.array([classDict[label] for label in classLabels])

# Data matrix X: one row per observation, one column per attribute
X = np.empty((150, 4))
for column in range(4):
    X[:, column] = doc.col_values(column, 1, 151)

# Number of observations (N), attributes (M) and classes (C)
N, M, C = len(y), len(attributeNames), len(classNames)

print("Ran Exercise 4.2.1")

In [None]:
# Exercise 4.2.2

import numpy as np


from matplotlib.pyplot import figure, hist, show, subplot, xlabel, ylim

# Histogram of every attribute, laid out on a near-square grid.
# Uses X, M, N and attributeNames left behind by an earlier cell.
u = np.floor(np.sqrt(M))
v = np.ceil(float(M) / u)
n_rows, n_cols = int(u), int(v)

figure(figsize=(8, 7))
for i in range(M):
    # Shift the green channel so that each attribute gets its own colour
    bar_color = (0.2, 0.8 - i * 0.2, 0.4)
    subplot(n_rows, n_cols, i + 1)
    hist(X[:, i], color=bar_color)
    xlabel(attributeNames[i])
    # Same y-range in all panels so that they can be compared
    ylim(0, N / 2)

show()

print("Ran Exercise 4.2.2")

In [None]:
# Exercise 4.2.3

# requires data from exercise 4.2.1
from matplotlib.pyplot import boxplot, show, title, xticks, ylabel

# One box per column (attribute) of X
boxplot(X)
# Boxes are drawn at positions 1, 2, ...; the tick positions are derived from
# the number of attribute names rather than a hard-coded count
xticks(range(1, len(attributeNames) + 1), attributeNames)
ylabel("cm")
title("Fisher's Iris data set - boxplot")
show()

print("Ran Exercise 4.2.3")

In [None]:
# Exercise 4.2.4
# requires data from exercise 4.2.1
from matplotlib.pyplot import boxplot, figure, show, subplot, title, xticks, ylim

# Shared y-range for all panels: the overall data range padded by 10 % of
# its width at both ends
y_up = X.max() + (X.max() - X.min()) * 0.1
y_down = X.min() - (X.max() - X.min()) * 0.1

# Tick positions (boxes sit at 1, 2, ...) and attribute names cut to 7 characters
tick_positions = range(1, len(attributeNames) + 1)
tick_labels = [a[:7] for a in attributeNames]

# One panel of box plots per class
figure(figsize=(14, 7))
for c in range(C):
    subplot(1, C, c + 1)

    # Boolean mask selecting the observations that belong to class c
    class_mask = y == c

    boxplot(X[class_mask, :])
    title("Class: " + classNames[c])
    xticks(tick_positions, tick_labels, rotation=45)
    ylim(y_down, y_up)

show()

print("Ran Exercise 4.2.4")


In [None]:
# Exercise 4.2.5

from matplotlib.pyplot import (
    figure,
    legend,
    plot,
    show,
    subplot,
    xlabel,
    xticks,
    ylabel,
    yticks,
)

# Scatter-plot matrix: panel (m1, m2) shows attribute m1 against attribute m2,
# with one colour per class. Uses X, y, M, C, attributeNames and classNames
# left behind by an earlier cell.
figure(figsize=(12, 10))
for m1 in range(M):
    for m2 in range(M):
        subplot(M, M, m1 * M + m2 + 1)
        for c in range(C):
            class_mask = y == c
            plot(X[class_mask, m2], X[class_mask, m1], ".")

        # Axis decoration is done once per panel, not once per class:
        # only the bottom row shows x labels and only the left column
        # shows y labels; all other ticks are hidden.
        if m1 == M - 1:
            xlabel(attributeNames[m2])
        else:
            xticks([])
        if m2 == 0:
            ylabel(attributeNames[m1])
        else:
            yticks([])

# The legend is attached to the panel that was drawn last
legend(classNames)

show()

print("Ran Exercise 4.2.5")

In [None]:
# Exercise 4.2.6

# requires data from exercise 4.2.1
from matplotlib.pyplot import figure, show
from mpl_toolkits.mplot3d import Axes3D

# Indices of the variables to plot
ind = [0, 1, 2]
colors = ["blue", "green", "red"]

f = figure()
ax = f.add_subplot(111, projection="3d")  # Here the mpl_toolkits is used
for c in range(C):
    class_mask = y == c
    s = ax.scatter(
        X[class_mask, ind[0]], X[class_mask, ind[1]], X[class_mask, ind[2]], c=colors[c]
    )

ax.view_init(30, 220)
ax.set_xlabel(attributeNames[ind[0]])
ax.set_ylabel(attributeNames[ind[1]])
ax.set_zlabel(attributeNames[ind[2]])

show()

print("Ran Exercise 4.2.6")


In [None]:
# Exercise 4.2.7

from matplotlib.pyplot import (
    cm,
    colorbar,
    figure,
    imshow,
    show,
    title,
    xlabel,
    xticks,
    ylabel,
)
from scipy.stats import zscore

# Standardize every column of X (ddof=1 selects the N-1 normalisation)
X_standarized = zscore(X, ddof=1)

# Show the standardized data matrix as a grey-scale image. The aspect ratio
# 4/N stretches the tall, narrow matrix so that it fills the figure.
cell_aspect = 4.0 / N
figure(figsize=(12, 6))
imshow(X_standarized, interpolation="none", aspect=cell_aspect, cmap=cm.gray)
title("Fisher's Iris data matrix")
xlabel("Attributes")
ylabel("Data objects")
xticks(range(4), attributeNames)
colorbar()

show()

print("Ran Exercise 4.2.7")

### 4.3 Visualizing Wine Data

In [None]:
# exercise 4.3.1

import importlib_resources
import numpy as np
from matplotlib.pyplot import (
    boxplot,
    figure,
    hist,
    show,
    subplot,
    title,
    xlabel,
    xticks,
    ylim,
    yticks,
)
from scipy.io import loadmat
from scipy.stats import zscore

filename = importlib_resources.files("dtuimldmtools").joinpath("data/wine.mat")

# Load Matlab data file and extract variables of interest
mat_data = loadmat(filename)
X = mat_data["X"]
y = mat_data["y"].squeeze()
C = mat_data["C"][0, 0]
M = mat_data["M"][0, 0]
N = mat_data["N"][0, 0]
attributeNames = [name[0][0] for name in mat_data["attributeNames"]]
classNames = [cls[0][0] for cls in mat_data["classNames"]]

# We start with a box plot of each attribute
figure()
title("Wine: Boxplot")
boxplot(X)
xticks(range(1, M + 1), attributeNames, rotation=45)

# From this it is clear that there are some outliers in the Alcohol
# attribute (10x10^14 is clearly not a proper value for alcohol content).
# However, it is impossible to see the distribution of the data, because
# the axis is dominated by these extreme outliers. To avoid this, we plot a
# box plot of standardized data (using the zscore function).
# Only the data is passed to boxplot: a second positional argument would be
# interpreted as the `notch` option, not as labels. The attribute names are
# set through xticks instead.
figure(figsize=(12, 6))
title("Wine: Boxplot (standarized)")
boxplot(zscore(X, ddof=1))
xticks(range(1, M + 1), attributeNames, rotation=45)

# This plot reveals that there are clearly some outliers in the Volatile
# acidity, Density, and Alcohol attributes, i.e. attribute number 2, 8,
# and 11.

# Next, we plot histograms of all attributes on a near-square grid.
figure(figsize=(14, 9))
u = np.floor(np.sqrt(M))
v = np.ceil(float(M) / u)
for i in range(M):
    subplot(int(u), int(v), i + 1)
    hist(X[:, i])
    xlabel(attributeNames[i])
    ylim(0, N)  # Make the y-axes equal for improved readability
    if i % v != 0:
        yticks([])  # Only the first panel of every row keeps its y ticks
    if i == 0:
        title("Wine: Histogram")


# This confirms our belief about outliers in attributes 2, 8, and 11.
# To take a closer look at this, we next plot histograms of the
# attributes we suspect contain outliers (zero-based column indices below).
figure(figsize=(14, 9))
m = [1, 7, 10]
for i in range(len(m)):
    subplot(1, len(m), i + 1)
    hist(X[:, m[i]], 50)
    xlabel(attributeNames[m[i]])
    ylim(0, N)  # Make the y-axes equal for improved readability
    if i > 0:
        yticks([])
    if i == 0:
        title("Wine: Histogram (selected attributes)")


# The histograms show that there are a few very extreme values in these
# three attributes. To identify these values as outliers, we must use our
# knowledge about the data set and the attributes. Say we expect volatile
# acidity to be around 0-2 g/dm^3, density to be close to 1 g/cm^3, and
# alcohol percentage to be somewhere between 5-20 % vol. Then we can safely
# identify the following outliers, which are a factor of 10 greater than
# the largest we expect.
outlier_mask = (X[:, 1] > 20) | (X[:, 7] > 10) | (X[:, 10] > 200)
valid_mask = np.logical_not(outlier_mask)

# Finally we will remove these from the data set
X = X[valid_mask, :]
y = y[valid_mask]
N = len(y)


# Now, we can repeat the process to see if there are any more outliers
# present in the data. We take a look at a histogram of all attributes:
figure(figsize=(14, 9))
u = np.floor(np.sqrt(M))
v = np.ceil(float(M) / u)
for i in range(M):
    subplot(int(u), int(v), i + 1)
    hist(X[:, i])
    xlabel(attributeNames[i])
    ylim(0, N)  # Make the y-axes equal for improved readability
    if i % v != 0:
        yticks([])
    if i == 0:
        title("Wine: Histogram (after outlier detection)")

# This reveals no further outliers, and we conclude that all outliers have
# been detected and removed.

show()

print("Ran Exercise 4.3.1")

In [None]:
# exercise 4.3.2

import importlib_resources
import numpy as np
from matplotlib.pyplot import (
    figure,
    legend,
    plot,
    show,
    subplot,
    xlabel,
    xticks,
    ylabel,
    yticks,
)
from scipy.io import loadmat
from scipy.stats import zscore

filename = importlib_resources.files("dtuimldmtools").joinpath("data/wine.mat")

# Load Matlab data file and extract variables of interest
mat_data = loadmat(filename)
X = mat_data["X"]
y = np.squeeze(mat_data["y"])
C = mat_data["C"][0, 0]
M = mat_data["M"][0, 0]
N = mat_data["N"][0, 0]

attributeNames = [name[0][0] for name in mat_data["attributeNames"]]
# Flatten the class-name cell array before unpacking so that every class is
# kept whether the names are stored as a column or as a row (the layout in
# the file is not visible here -- TODO confirm). Indexing only the first row
# would keep a single name if they are stored as a column.
classNames = [cls[0] for cls in np.ravel(mat_data["classNames"])]

# The histograms of exercise 4.3.1 show that there are a few very extreme
# values in three attributes. To identify these values as outliers, we must
# use our knowledge about the data set and the attributes. Say we expect
# volatile acidity to be around 0-2 g/dm^3, density to be close to 1 g/cm^3,
# and alcohol percentage to be somewhere between 5-20 % vol. Then we can
# safely identify the following outliers, which are a factor of 10 greater
# than the largest we expect.
outlier_mask = (X[:, 1] > 20) | (X[:, 7] > 10) | (X[:, 10] > 200)
valid_mask = np.logical_not(outlier_mask)

# Finally we will remove these from the data set
X = X[valid_mask, :]
y = y[valid_mask]
N = len(y)
Xnorm = zscore(X, ddof=1)

# Next we plot a number of attributes against each other
# (zero-based column indices of X)
Attributes = [1, 4, 5, 6]
NumAtr = len(Attributes)

figure(figsize=(12, 12))
for m1 in range(NumAtr):
    for m2 in range(NumAtr):
        subplot(NumAtr, NumAtr, m1 * NumAtr + m2 + 1)
        for c in range(C):
            class_mask = y == c
            plot(X[class_mask, Attributes[m2]], X[class_mask, Attributes[m1]], ".")

        # Axis decoration is done once per panel, not once per class:
        # only the bottom row shows x labels and only the left column
        # shows y labels; all other ticks are hidden.
        if m1 == NumAtr - 1:
            xlabel(attributeNames[Attributes[m2]])
        else:
            xticks([])
        if m2 == 0:
            ylabel(attributeNames[Attributes[m1]])
        else:
            yticks([])

# The legend is attached to the panel that was drawn last
legend(classNames)
show()

print("Ran Exercise 4.3.2")