# Exercise 1: NumPy array Indexing/Slicing

**Ex 1.1:** Load the "iris.csv" using the appropriate method for this file type (use the new functions from the package)

In [1]:
import pandas as pd

iris_df = pd.read_csv(r'C:\Users\35191\Documents\GitHub\si\datasets\iris\iris.csv')
print(iris_df.head())

   sepal_length  sepal_width  petal_length  petal_width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


**Ex 1.2:** Select the penultimate independent variable. What is the dimension of the resulting array?

In [2]:
penultimate_variable = iris_df.iloc[:, -2]
print("Dimension of the resulting array:", penultimate_variable.shape)

Dimension of the resulting array: (150,)


**Ex 1.3:** Select the last 10 samples from the iris dataset. What is the mean of the last 10 samples for each
independent variable/feature?

In [3]:
last10 = iris_df.iloc[-10:, :-1]
mean_last10 = last10.mean()
print("Mean of the last 10 samples for each independent variable/feature:\n", mean_last10)

Mean of the last 10 samples for each independent variable/feature:
 sepal_length    6.45
sepal_width     3.03
petal_length    5.33
petal_width     2.17
dtype: float64


**Ex 1.4:** Select all samples from the dataset with values less than or equal to 6 for all independent variables/features. How many samples do you obtain?

In [4]:
filtered_samples = iris_df[(iris_df.iloc[:, :-1] <= 6).all(axis=1)]
num_samples = filtered_samples.shape[0]
print("Number of samples with values less than or equal to 6 for all independent variables/features:", num_samples)

Number of samples with values less than or equal to 6 for all independent variables/features: 89


**Ex 1.5:** Select all samples with a class/label different from 'Iris-setosa'. How many samples do you obtain?

In [5]:
samples = iris_df[iris_df['class'] != 'Iris-setosa']
num_samples = samples = iris_df[iris_df['class'] != 'Iris-setosa'].shape[0]
print("Number of samples with a class/label different from 'Iris-setosa':", num_samples)

Number of samples with a class/label different from 'Iris-setosa': 100


# **Exercise 2:** 

Examples of how to use the fillna, dropna and remove_by_index methods

In [6]:
import numpy as np
from si.data.dataset import Dataset

In [7]:
#Turning the iris dataset into a Dataset object and adding a row with NaN values
X = last10.values
new_row = np.array([np.nan, 3., 5.6, np.nan])
X = np.vstack([X, new_row])
y = iris_df.iloc[-10:, -1].values
new_y = np.array(['Iris-setosa'])
y = np.append(y, new_y)
dataset = Dataset(X, y)

print("Original Dataset:")
print("X:", dataset.X)
print("y:", dataset.y)

Original Dataset:
X: [[6.7 3.1 5.6 2.4]
 [6.9 3.1 5.1 2.3]
 [5.8 2.7 5.1 1.9]
 [6.8 3.2 5.9 2.3]
 [6.7 3.3 5.7 2.5]
 [6.7 3.  5.2 2.3]
 [6.3 2.5 5.  1.9]
 [6.5 3.  5.2 2. ]
 [6.2 3.4 5.4 2.3]
 [5.9 3.  5.1 1.8]
 [nan 3.  5.6 nan]]
y: ['Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-setosa']


In [8]:
if np.isnan(dataset.X).any() :
    print("The dataset contains missing values.")

The dataset contains missing values.


In [9]:
#Making a copy of the dataset
X_copy = np.copy(dataset.X)
y_copy = np.copy(dataset.y) 
dataset_copy = Dataset(X_copy, y_copy)

print("Dataset copy: \n", dataset_copy.X)
print(dataset_copy.y)



# Removing samples with missing values using the dropna method
cleaned_dataset = dataset_copy.dropna()
print("\nSize of X before dropna:", len(dataset_copy.X))
print("Size of y before dropna:", len(dataset_copy.y))

print("\nSize of X after dropna:\n", len(cleaned_dataset.X))
print("Size of y after dropna:\n", len(cleaned_dataset.y))

print("\nX after dropna:\n", cleaned_dataset.X)
print("y after dropna:", cleaned_dataset.y)

Dataset copy: 
 [[6.7 3.1 5.6 2.4]
 [6.9 3.1 5.1 2.3]
 [5.8 2.7 5.1 1.9]
 [6.8 3.2 5.9 2.3]
 [6.7 3.3 5.7 2.5]
 [6.7 3.  5.2 2.3]
 [6.3 2.5 5.  1.9]
 [6.5 3.  5.2 2. ]
 [6.2 3.4 5.4 2.3]
 [5.9 3.  5.1 1.8]
 [nan 3.  5.6 nan]]
['Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-setosa']

Size of X before dropna: 10
Size of y before dropna: 10

Size of X after dropna:
 10
Size of y after dropna:
 10

X after dropna:
 [[6.7 3.1 5.6 2.4]
 [6.9 3.1 5.1 2.3]
 [5.8 2.7 5.1 1.9]
 [6.8 3.2 5.9 2.3]
 [6.7 3.3 5.7 2.5]
 [6.7 3.  5.2 2.3]
 [6.3 2.5 5.  1.9]
 [6.5 3.  5.2 2. ]
 [6.2 3.4 5.4 2.3]
 [5.9 3.  5.1 1.8]]
y after dropna: ['Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica']


In [10]:
# Filling missing values with the mean of the corresponding feature using the fillna method
filled_dataset = dataset.fillna(1111.0)
print("X after fillna:\n", filled_dataset.X)
print("y after fillna:\n", filled_dataset.y)


X after fillna:
 [[   6.7    3.1    5.6    2.4]
 [   6.9    3.1    5.1    2.3]
 [   5.8    2.7    5.1    1.9]
 [   6.8    3.2    5.9    2.3]
 [   6.7    3.3    5.7    2.5]
 [   6.7    3.     5.2    2.3]
 [   6.3    2.5    5.     1.9]
 [   6.5    3.     5.2    2. ]
 [   6.2    3.4    5.4    2.3]
 [   5.9    3.     5.1    1.8]
 [1111.     3.     5.6 1111. ]]
y after fillna:
 ['Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-setosa']


In [11]:
# Removing the last sample using the remove_by_index method
removed_dataset = filled_dataset.remove_by_index(-1)
print("Size of the modified dataset:", dataset.X.shape)
print("X after remove_by_index:\n", removed_dataset.X)
print("y after remove_by_index:\n", removed_dataset.y)


Size of the modified dataset: (10, 4)
X after remove_by_index:
 [[6.7 3.1 5.6 2.4]
 [6.9 3.1 5.1 2.3]
 [5.8 2.7 5.1 1.9]
 [6.8 3.2 5.9 2.3]
 [6.7 3.3 5.7 2.5]
 [6.7 3.  5.2 2.3]
 [6.3 2.5 5.  1.9]
 [6.5 3.  5.2 2. ]
 [6.2 3.4 5.4 2.3]
 [5.9 3.  5.1 1.8]]
y after remove_by_index:
 ['Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica']


# **Exercise 3:** Implementing SelectPercentile

Testing the SelectPercentile class using the "iris.csv" dataset 

- This class allows for the selection of features according to a given percentile, therefore we will see if the sizes of the datasets and features of the new dataframe correspond to the expected.
- Expected results: in a dataset with a total number of 4 features such as the iris.cvs, with a percentile of 25% and 50% there should be a selection of 1 and 2 features respectively. Additionally the number of samples must not change and the selection should be done considering the highest values of F.

In [12]:
from si.io.csv_file import read_csv
from si.statistics.f_classification import f_classification
from si.feature_selection.select_percentile import SelectPercentile

In [13]:
iris_dataset = read_csv('../datasets/iris/iris.csv', features=True, label=True)

selectors = [ SelectPercentile(percentile=25), SelectPercentile(percentile=50.0), SelectPercentile(percentile=100) ]

print("Original dataset shape:", iris_dataset.X.shape)
print("Original number of features:", len(iris_dataset.features))
for selector in selectors:
    selector.fit(iris_dataset)
    transformed_dataset = selector.transform(iris_dataset)
    print("\nSelector percentile:", selector.percentile)
    print("Transformed dataset shape:", transformed_dataset.X.shape)
    print("Selected features:", transformed_dataset.features)


Original dataset shape: (150, 4)
Original number of features: 4

Selector percentile: 25
Transformed dataset shape: (150, 1)
Selected features: ['petal_length']

Selector percentile: 50.0
Transformed dataset shape: (150, 2)
Selected features: ['petal_width', 'petal_length']

Selector percentile: 100
Transformed dataset shape: (150, 4)
Selected features: ['sepal_width', 'sepal_length', 'petal_width', 'petal_length']


In [14]:
print("Original features:", iris_dataset.features)
for selector in selectors:
    selector.fit(iris_dataset)
    transformed_dataset = selector.transform(iris_dataset)
    print("Selector percentile:", selector.percentile)
    print("F values:", selector.F)
    print("p values:", selector.p)

Original features: Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], dtype='object')
Selector percentile: 25
F values: [ 119.26450218   47.3644614  1179.0343277   959.32440573]
p values: [1.66966919e-31 1.32791652e-16 3.05197580e-91 4.37695696e-85]
Selector percentile: 50.0
F values: [ 119.26450218   47.3644614  1179.0343277   959.32440573]
p values: [1.66966919e-31 1.32791652e-16 3.05197580e-91 4.37695696e-85]
Selector percentile: 100
F values: [ 119.26450218   47.3644614  1179.0343277   959.32440573]
p values: [1.66966919e-31 1.32791652e-16 3.05197580e-91 4.37695696e-85]


As expected the features with the highest F values (smallest p values) were the ones selected in the selectors with the different percentiles.


# **Exercise 4:** Cosine Distance

Comparison with the already existent cosine_distances from the sklearn

In [15]:
from si.statistics.cosine_distance import cosine_distance
x = np.array([1, 2, 3])
y = np.array([[1, 2, 3], [4, 5, 6]])
distance = cosine_distance(x, y)

from sklearn.metrics.pairwise import cosine_distances
sklearn_distance = cosine_distances(x.reshape(1, -1), y)

print("Cosine implementation:", distance)
print("Sklearn function:", sklearn_distance)

Cosine implementation: [0.         0.02536815]
Sklearn function: [[0.         0.02536815]]


# **Exercício 5:** PCA

Testing the PCA class in a jupyter notebook using the iris.csv dataset.

In [17]:
from si.decomposition.pca import PCA
from sklearn.decomposition import PCA as PCA_sklearn

In [18]:
iris_dataset = read_csv('../datasets/iris/iris.csv', features=True, label=True)

In [19]:
# PCA with 2 components using PCA class
np.random.seed(5)
pca = PCA(n_components=2)
pca._fit(iris_dataset.X)
X_transformed = pca._transform(iris_dataset.X)


In [20]:
# PCA with 2 components using PCA class from scikit-learn
np.random.seed(5)
pca_sklearn = PCA_sklearn(n_components=2)
pca_sklearn.fit(iris_dataset.X)
X_transformed_sklearn = pca_sklearn.transform(iris_dataset.X)

In [21]:
print("Explained variance:", pca.explained_variance)
print("Explained variance by PCA from scikit-learn:", pca_sklearn.explained_variance_)

print("\nTransformed data structure:", X_transformed.shape)
print("Transformed data structure by PCA from scikit-learn:", X_transformed_sklearn.shape)

print ("\nFirst five lines of the Transformed data:\n", X_transformed[:5])
print("First five lines of the Transformed data by PCA from scikit-learn:\n", X_transformed_sklearn[:5])

Explained variance: [0.92461621 0.05301557]
Explained variance by PCA from scikit-learn: [4.22484077 0.24224357]

Transformed data structure: (150, 2)
Transformed data structure by PCA from scikit-learn: (150, 2)

First five lines of the Transformed data:
 [[-2.68420713 -0.32660731]
 [-2.71539062  0.16955685]
 [-2.88981954  0.13734561]
 [-2.7464372   0.31112432]
 [-2.72859298 -0.33392456]]
First five lines of the Transformed data by PCA from scikit-learn:
 [[-2.68420713  0.32660731]
 [-2.71539062 -0.16955685]
 [-2.88981954 -0.13734561]
 [-2.7464372  -0.31112432]
 [-2.72859298  0.33392456]]


In [22]:
# PCA with 2 components using PCA class and normalizing the data
pca_norm = PCA(n_components=2)
pca_norm._fit(iris_dataset.X, normalization=True)
X_transformed_norm = pca_norm._transform(iris_dataset.X, normalization=True)

In [23]:
# PCA with 2 components using PCA class and normalizing the data using the StandardScaler
from sklearn.preprocessing import StandardScaler 
np.random.seed(5)
X_scaled = StandardScaler().fit_transform(iris_dataset.X)
pca_scaled = PCA(n_components=2)
pca_scaled._fit(X_scaled)
X_transformed_scaled = pca_scaled._transform(X_scaled)

In [24]:
print("Explained variance:", pca_norm.explained_variance)
print("Explained variance using the StandardScaler:", pca_scaled.explained_variance)

print("\nTransformed data structure (normalized):", X_transformed_norm.shape)
print("Transformed data structure using the StandardScaler:", X_transformed_scaled.shape)

print ("\nFirst five lines of the Transformed data (normalized):\n", X_transformed_norm[:5])
print("First five lines of the Transformed data using the StandardScaler:\n", X_transformed_scaled[:5])

Explained variance: [0.72770452 0.23030523]
Explained variance using the StandardScaler: [0.72770452 0.23030523]

Transformed data structure (normalized): (150, 2)
Transformed data structure using the StandardScaler: (150, 2)

First five lines of the Transformed data (normalized):
 [[-2.44159388 -0.02095745]
 [-2.41439075  0.51628447]
 [-2.62966145  0.40774632]
 [-2.53931232  0.53331485]
 [-2.52016653 -0.07628126]]
First five lines of the Transformed data using the StandardScaler:
 [[-2.26454173 -0.5057039 ]
 [-2.0864255   0.65540473]
 [-2.36795045  0.31847731]
 [-2.30419716  0.57536771]
 [-2.38877749 -0.6747674 ]]


# **Exercise 6:** Implementing stratified splitting

Test the "stratified_train_test_split" function with the iris dataset

In [25]:
from si.model_selection.split import train_test_split, stratified_train_test_split

In [26]:
train, test = train_test_split(iris_dataset)
train_strat, test_strat = stratified_train_test_split(iris_dataset)
train_strat_50, test_strat_50 = stratified_train_test_split(iris_dataset, test_size=0.5)
train_strat_seed, test_strat_seed= stratified_train_test_split(iris_dataset, random_state=5)

In [27]:
print("Train and test dataset sizes:", train.X.shape, test.X.shape)
print("Train and Test dataset sizes (stratified):", train_strat.X.shape, test_strat.X.shape)
print("Train and Test dataset sizes (stratified), with half used for training:", train_strat_50.X.shape, test_strat_50.X.shape)

Train and test dataset sizes: (120, 4) (30, 4)
Train and Test dataset sizes (stratified): (120, 4) (30, 4)
Train and Test dataset sizes (stratified), with half used for training: (75, 4) (75, 4)


In [28]:
print("Train dataset size, with default seed:\n", train.X[:5])
print("Test dataset size, with default seed:\n", test.X[:5])

print("\nTrain dataset size (stratified), with default seed:\n", train_strat.X[:5])
print("Test dataset size (stratified), with default seed:\n", test_strat.X[:5])

print("\nTrain adataset size (stratified), with seed=5:\n", train_strat_seed.X[:5])
print("Test dataset size (stratified), with seed=5:\n", test_strat_seed.X[:5])

Train dataset size, with default seed:
 [[4.6 3.6 1.  0.2]
 [5.7 4.4 1.5 0.4]
 [6.7 3.1 4.4 1.4]
 [4.8 3.4 1.6 0.2]
 [4.4 3.2 1.3 0.2]]
Test dataset size, with default seed:
 [[6.1 2.8 4.7 1.2]
 [5.7 3.8 1.7 0.3]
 [7.7 2.6 6.9 2.3]
 [6.  2.9 4.5 1.5]
 [6.8 2.8 4.8 1.4]]

Train dataset size (stratified), with default seed:
 [[4.8 3.  1.4 0.1]
 [5.  3.6 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [4.4 2.9 1.4 0.2]
 [4.6 3.1 1.5 0.2]]
Test dataset size (stratified), with default seed:
 [[4.3 3.  1.1 0.1]
 [5.1 3.4 1.5 0.2]
 [4.8 3.1 1.6 0.2]
 [4.8 3.  1.4 0.3]
 [5.1 3.5 1.4 0.3]]

Train adataset size (stratified), with seed=5:
 [[5.4 3.4 1.5 0.4]
 [5.  3.6 1.4 0.2]
 [5.2 4.1 1.5 0.1]
 [5.1 3.5 1.4 0.2]
 [5.1 3.3 1.7 0.5]]
Test dataset size (stratified), with seed=5:
 [[4.4 3.2 1.3 0.2]
 [4.7 3.2 1.6 0.2]
 [4.6 3.4 1.4 0.3]
 [5.1 3.8 1.5 0.3]
 [5.2 3.4 1.4 0.2]]


# **Exercise 7:** mplementing the KNNRegressor with RMSE

7.3. Test the "KNNRegressor" class using the "cpu.csv" dataset (regression).

In [29]:
from si.metrics.rmse import rmse
from si.models.knn_regressor import KNNRegressor

In [30]:
cpu = read_csv('../datasets/cpu/cpu.csv', features=True, label=True)

cpu.y

array([ 198,  269,  220,  172,  132,  318,  367,  489,  636, 1144,   38,
         40,   92,  138,   10,   35,   19,   28,   31,  120,   30,   33,
         61,   76,   23,   69,   33,   27,   77,   27,  274,  368,   32,
         63,  106,  208,   20,   29,   71,   26,   36,   40,   52,   60,
         72,   72,   18,   20,   40,   62,   24,   24,  138,   36,   26,
         60,   71,   12,   14,   20,   16,   22,   36,  144,  144,  259,
         17,   26,   32,   32,   62,   64,   22,   36,   44,   50,   45,
         53,   36,   84,   16,   38,   38,   16,   22,   29,   40,   35,
        134,   66,  141,  189,   22,  132,  237,  465,  465,  277,  185,
          6,   24,   45,    7,   13,   16,   32,   32,   11,   11,   18,
         22,   37,   40,   34,   50,   76,   66,   24,   49,   66,  100,
        133,   12,   18,   20,   27,   45,   56,   70,   80,  136,   16,
         26,   32,   45,   54,   65,   30,   50,   40,   62,   60,   50,
         66,   86,   74,   93,  110,  143,  105,  2

In [31]:
train_cpu, test_cpu = stratified_train_test_split(cpu, test_size=0.3, random_state=5)
print("Train and test dataset sizes:", train_cpu.X.shape, test_cpu.X.shape)
print("Train dataset:\n", train_cpu.X[:5])
print("Test dataset:\n", test_cpu.X[:5])

Train and test dataset sizes: (192, 6) (17, 6)
Train dataset:
 [[ 480   96  512    0    1    1]
 [ 240  512 1000    8    1    3]
 [1100  512 1500    0    1    1]
 [ 112 1000 1000    0    1    4]
 [ 350   64   64    0    1    4]]
Test dataset:
 [[ 180  262 4000    0    1    3]
 [ 330 1000 2000    0    1    2]
 [ 900 1000 4000    4    1    2]
 [ 800  256 8000    0    1    4]
 [ 330 1000 4000    0    3    6]]


In [32]:
# knn regressor with k=1 + stratified split
kmeans = KNNRegressor()
kmeans.fit(train_cpu)
predictions = kmeans.predict(test_cpu)
predictions

array([18., 24., 22., 14., 38., 56., 32., 24., 34., 34., 44., 16., 25.,
       52., 25., 52., 70.])

In [33]:
kmeans.score(test_cpu)

13.37029191462748

In [34]:
# knn regressor with k=5 + stratified split
kmeans = KNNRegressor(k=5)
kmeans.fit(train_cpu)
predictions = kmeans.predict(test_cpu)
predictions

array([ 21.6,  21.6,  33.4,  17.6,  34.2,  67.8,  41.6,  31.6,  33. ,
        61.2, 145.8, 145.8,  23.4,  56.2,  24. ,  59.2,  67.8])

In [35]:
kmeans.score(test_cpu)

40.79434793607335

# **Testing RidgeRegression**

1. Use thedataset cpu.csv
2. Divide the dataset into train and test sets
3. Train the model. Which score do you get? And the cost?

In [36]:
from si.models.linear_regression import RidgeRegression

cpu_dataset = read_csv('../datasets/cpu/cpu.csv', features=True, label=True)

cpu_train, cpu_test = train_test_split(cpu_dataset, test_size=0.2, random_state=5)

ridge_model = RidgeRegression()
ridge_model.fit(cpu_train)
predictions = ridge_model.predict(cpu_test)
test_score = ridge_model.score(cpu_test)
test_cost = ridge_model.cost(cpu_test)

print("Score:", test_score)
print("Cost:", test_cost)

Score: 12721.613689859541
Cost: 6429.521624869436


In [37]:
cpu_train_strat, cpu_test_strat = stratified_train_test_split(cpu_dataset, test_size=0.2, random_state=5)

ridge_model2 = RidgeRegression()
ridge_model2.fit(cpu_train_strat)
predictions_strat = ridge_model2.predict(cpu_test_strat)
test_score_strat = ridge_model2.score(cpu_test_strat)
test_cost_strat = ridge_model2.cost(cpu_test_strat)

print("Score:", test_score_strat)
print("Cost:", test_cost_strat)

Score: 2962.5932159158297
Cost: 1889.033227942686


In [38]:
ridge_model2 = RidgeRegression(alpha=0.1)
ridge_model2.fit(cpu_train_strat)
predictions_strat = ridge_model2.predict(cpu_test_strat)
test_score_strat = ridge_model2.score(cpu_test_strat)
test_cost_strat = ridge_model2.cost(cpu_test_strat)

print("Score:", test_score_strat)
print("Cost:", test_cost_strat)

Score: 1107.5728118060001
Cost: 1199.0057277113979


# **TestingRidgeRegression and LogisticRegression** using the breast-bin.csv

In [39]:
from si.models.logistic_regression import LogisticRegression

bb_dataset = read_csv(r'C:\Users\35191\Documents\GitHub\si\datasets\breast_bin\breast-bin.csv', features=True, label=True)

bb_train, bb_test = stratified_train_test_split(bb_dataset, test_size=0.3, random_state=5)

# Ridge regression model
ridge_model = RidgeRegression(alpha=0.1)
ridge_model.fit(bb_train)
predictions_R = ridge_model.predict(bb_test)
test_score_R = ridge_model.score(bb_test)
test_cost_R = ridge_model.cost(bb_test)

# Logistic regression model
logistic_model = LogisticRegression(alpha=0.1)
logistic_model.fit(bb_train)
predictions_L = logistic_model.predict(bb_test)
test_score_L = logistic_model.score(bb_test)
test_cost_L = logistic_model.cost(bb_test)


In [40]:
print("Score for the Ridge Regression:", test_score_R)
print("Cost for the Ridge Regression:", test_cost_R)

print("\nScore for the Logistic Regression:", test_score_L)
print("Cost for the Logistic Regression:", test_cost_L)

Score for the Ridge Regression: 0.03588477223004671
Cost for the Ridge Regression: 0.018055436565692178

Score for the Logistic Regression: 0.9760765550239234
Cost for the Logistic Regression: 1.445038601820536


# **Exercise 9.2:** Testing Random Forest
1.Use the iris.csv dataset

2.Split the data into train and test sets

3.Create the RandomForestClassifier model

4.Train the model. What is the score of the model on the test set?

In [41]:
from si.models.random_forest_classifier import RandomForestClassifier

In [42]:
iris_dataset = read_csv('../datasets/iris/iris.csv', features=True, label=True)
train_iris, test_iris = stratified_train_test_split(iris_dataset, test_size=0.2, random_state=5)

rf_model = RandomForestClassifier(seed=5)
rf_model.fit(train_iris)

<si.models.random_forest_classifier.RandomForestClassifier at 0x123e597f020>

In [43]:
predictions_rf = rf_model.predict(test_iris)
test_score_rf = rf_model.score(test_iris)

print("Score:", test_score_rf)

Score: 0.9


# **Exercise 10.2:** Testing the StackingClassifier model

1.Use the breast-bin.csv dataset

2.Split the data into train and test sets

In [44]:
dataset_10 = read_csv(r'C:\Users\35191\Documents\GitHub\si\datasets\breast_bin\breast-bin.csv', features=True, label=True)

train_10, test_10 = stratified_train_test_split(dataset_10, test_size=0.25, random_state=5)

3.Create a KNNClassifier model

4.Create a LogisticRegression model

5.Create a DecisionTree model

6.Create a second KNNClassifier model (final model)



In [45]:
from si.models.knn_classifier import KNNClassifier
from si.models.decision_tree_classifier import DecisionTreeClassifier

knn_model = KNNClassifier()
log_reg_model = LogisticRegression()
decision_tree_model = DecisionTreeClassifier(max_depth=5)
final_model = KNNClassifier()

7.Create a StackingClassifier model using the previous classifiers. The second KNNClassifier model must be used as the final model.

8.Train the StackingClassifier model. What is the score of the model on the test set?

In [46]:
from si.ensemble.stacking_classifier import StackingClassifier

stacking_model = StackingClassifier(models=[knn_model, log_reg_model, decision_tree_model], final_model=final_model)
stacking_model.fit(train_10)
test_score_stacking = stacking_model.score(test_10)

print("Score:", test_score_stacking)

Score: 0.9827586206896551


# **Exercise 11:** Implementing the randomized_search_cv function
1. Use the breast-bin.csv dataset

In [47]:
breast_bin = read_csv(r'C:\Users\35191\Documents\GitHub\si\datasets\breast_bin\breast-bin.csv', features=True, label=True)

type(breast_bin)

si.data.dataset.Dataset

2. Create a LogisticRegression model

3. Perform a randomized search with the following hyperparameter distributions:
* l2_penalty: distribution between 1 and 10 with 10 equal intervals (e.g., np.linspace(1, 10, 10))
* alpha: distribution between 0.001 and 0.0001 with 100 equal intervals (e.g., np.linspace(0.001, 0.0001, 100))
* max_iter: distribution between 1000 and 2000 with 200 equal intervals (e.g., np.linspace(1000, 2000, 200))

In [48]:
from si.models.logistic_regression import LogisticRegression
from si.model_selection.randomized_search import randomized_search_cv
from si.model_selection.cross_validate import k_fold_cross_validation 

model_log = LogisticRegression()

hyperparameter_grid = {'l2_penalty': np.linspace(1, 10, 10) ,'alpha': np.linspace(0.001, 0.0001, 100), 
                       'max_iter': np.linspace(1000, 2000, 200) }

4. Use n_iter=10 and cv=3 folds for the cross validation.

In [49]:
dict_results = randomized_search_cv(model=model_log, 
                                    dataset=breast_bin, 
                                    hyperparameter_grid=hyperparameter_grid, 
                                    n_iter=10, cv=3)


5. Which scores do you obtain? What are the best score and best hyperparameters?

In [50]:
print("Scores:")
for score in dict_results['scores']:
    print (score)
print("\nBest hyperparameters:", dict_results['best_hyperparameters'])
print("Best score:", dict_results['best_score'])

Scores:
0.9669540229885057
0.9655172413793104
0.9669540229885057
0.9655172413793104
0.9669540229885057
0.9669540229885057
0.9669540229885057
0.9655172413793104
0.9669540229885057
0.9669540229885057

Best hyperparameters: {'l2_penalty': np.float64(3.0), 'alpha': np.float64(0.00016363636363636363), 'max_iter': np.float64(1824.1206030150754)}
Best score: 0.9669540229885057


# **Exercise 12:** Testing the Dropout layer

12.2. Test the layer with a random input and check if the output shows the desired behaviour.

In [51]:
from si.neural_networks.layers import Dropout

dropout_layer = Dropout(probability=0.5)
input_data = np.random.rand(5, 10)
dropout_layer.set_input_shape(input_data.shape)  

# Forward propagation during training --> expected to have some zeros due to the mask
output_train = dropout_layer.forward_propagation(input_data, training=True)
print("Output during training:")
print(output_train)

Output during training:
[[1.71681479 1.76990811 0.         1.01098369 0.86416605 0.
  0.57001427 1.72527462 0.         0.27492492]
 [0.         0.         0.         1.03489279 0.         0.
  1.66741543 1.27854196 0.         0.        ]
 [1.6331077  0.         1.6788027  0.         0.85633386 0.
  0.         0.         0.         0.        ]
 [0.21595473 0.21243207 0.41268677 0.02426719 0.         0.72948484
  0.         1.14853787 1.26995664 1.92199609]
 [0.         0.         1.74917661 0.8936013  1.78355493 0.
  0.         0.25720193 0.         1.64398588]]


In [52]:
# Forward propagation during inference --> shoulb the same as the input
output_inference = dropout_layer.forward_propagation(input_data, training=False)
if (output_inference == input_data).all():
    print("Output during inference is the same as the input.")

Output during inference is the same as the input.


In [53]:
# Backward propagation --> by using a dummy error for testing, we should see some zeros in the input error
error = np.ones_like(input_data) 
input_error = dropout_layer.backward_propagation(error)
print("Input error during backpropagation:")
print(input_error)

Input error during backpropagation:
[[1. 1. 0. 1. 1. 0. 1. 1. 0. 1.]
 [0. 0. 0. 1. 0. 0. 1. 1. 0. 0.]
 [1. 0. 1. 0. 1. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 0. 1. 0. 1. 1. 1.]
 [0. 0. 1. 1. 1. 0. 0. 1. 0. 1.]]


In [54]:
# Output shape that shpuld be the same as the input shape
if dropout_layer.output_shape() == input_data.shape:
    print("Output shape is correct")
else:
    print("Something is wrong: Output shape is incorrect")


Output shape is correct


In [55]:
# Parameters of the dropout layer - should be 0
if dropout_layer.parameters() == 0:
    print("Parameters: 0")
else:
    print("Something is wrong: dropout layers do not have learnable parameters")

Parameters: 0
