Exercise 1: NumPy array Indexing/Slicing

1.1) In this exercise, we will use the iris dataset. Load
the "iris.csv" using the appropriate method for this file
type (use the new functions from the package).

In [None]:
import numpy as np
from si.io.csv_file import read_csv

iris= read_csv("../datasets/iris/iris.csv", features=True, label=True)

print(iris.shape())
iris.has_label()
iris.features

(150, 4)


Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], dtype='object')


1.2) Select the penultimate independent variable.
What is the dimension of the resulting array?

In [None]:
pen_variable = iris.X[:, -2]

# Determine the size of the resulting array
n_dim = pen_variable.ndim

print(f"The dimension of the rusinting array is of {n_dim}D")

The dimension of the rusinting array is of 1
(150,)


1.3) Select the last 10 samples from the iris dataset.
What is the mean of the last 10 samples for each
independent variable/feature?


In [None]:
last10=iris.X[-10:,]

print(last10)

#Axis 0 allows me to calculate mean per rwo which means per feature 
print(f" The mean of the last 10 samples for each independent variable/feature is : \
      \n {np.mean(last10, axis=0)}")

print(np.mean(last10)) # mean of all features 



[[6.7 3.1 5.6 2.4]
 [6.9 3.1 5.1 2.3]
 [5.8 2.7 5.1 1.9]
 [6.8 3.2 5.9 2.3]
 [6.7 3.3 5.7 2.5]
 [6.7 3.  5.2 2.3]
 [6.3 2.5 5.  1.9]
 [6.5 3.  5.2 2. ]
 [6.2 3.4 5.4 2.3]
 [5.9 3.  5.1 1.8]]
 the mean of the last 10 samples for each independent variable/feature is :       
 [6.45 3.03 5.33 2.17]
4.245


1.4) Select all samples from the dataset with values
less than or equal to 6 for all independent
variables/features. How many samples do you obtain?

In [None]:
mask= iris.X[:,] <= 6

filter=np.all(mask, axis=1)

print(f"There are {iris.X[filter].shape[0]} samples from the dataset with values <=6")

There are 89 samples from the dataset with values <=6


1.5) Select all samples with a class/label different
from 'Iris-setosa'. How many samples do you obtain?

In [10]:
mask= iris.y != 'Iris-setosa'

print(f"There are {iris.X[mask].shape[0]} samples with a class/label different from 'Iris-setosa' ")

There are 100 samples with a class/label different from 'Iris-setosa' 


Exercise 2: NumPy array Indexing/Slicing

Add examples of how to use these methods to the
script/notebook of Exercise 1

In [None]:
path = "../datasets/iris/iris_missing_data.csv"
dataset = read_csv(filename=path, features=True, label=True)

print("Dataset shape before dropna")
print("dataset.X shape:", dataset.X.shape)
print("dataset.y shape:", dataset.y.shape)

dataset.dropna()
print("Dataset shape after dropna")
print("dataset.X shape:", dataset.X.shape)
print("dataset.y shape:", dataset.y.shape)

Dataset shape before dropna
dataset.X shape: (150, 4)
dataset.y shape: (150,)
Dataset shape after dropna
dataset.X shape: (134, 4)
dataset.y shape: (134,)
dataset.X has all NaNs filled: True


In [21]:
path = "../datasets/iris/iris_missing_data.csv"
dataset = read_csv(filename=path, features=True, label=True)
print(dataset.X[2,:])
print("dataset.X has all NaNs filled:", np.all(np.isnan(dataset.X) == False))

dataset.fillna(value=5.0)
print(dataset.X[2,:])
print("dataset.X has all NaNs filled:", np.all(np.isnan(dataset.X) == False))

dataset = read_csv(filename=path, features=True, label=True)
dataset.fillna(value='mean')
print(dataset.X[2,:])
print("dataset.X has all NaNs filled:", np.all(np.isnan(dataset.X) == False))

dataset = read_csv(filename=path, features=True, label=True)
dataset.fillna(value='median')
print(dataset.X[2,:])
print("dataset.X has all NaNs filled:", np.all(np.isnan(dataset.X) == False))

[4.7 nan 1.3 0.2]
dataset.X has all NaNs filled: False
[4.7 5.  1.3 0.2]
dataset.X has all NaNs filled: True
[4.7        3.04965517 1.3        0.2       ]
dataset.X has all NaNs filled: True
[4.7 3.  1.3 0.2]
dataset.X has all NaNs filled: True


In [23]:
path = "../datasets/iris/iris.csv"
dataset = read_csv(filename=path, features=True, label=True)

print("Dataset shape before removing the first sample")
print("dataset.X shape:", dataset.X.shape)
print("dataset.y shape:", dataset.y.shape)

dataset.remove_by_index(index=0)
print("Dataset shape after removing the first sample")
print("dataset.X shape:", dataset.X.shape)
print("dataset.y shape:", dataset.y.shape)

Dataset shape before removing the first sample
dataset.X shape: (150, 4)
dataset.y shape: (150,)
Dataset shape after removing the first sample
dataset.X shape: (149, 4)
dataset.y shape: (149,)


3.3) Test the SelectPercentile class in a Jupyter
notebook using the "iris.csv" dataset
(classification).

In [1]:
import numpy as np
import pandas as pd
from si.data.dataset import Dataset
from si.feature_selection.select_percentile import SelectPercentile
from si.statistics.f_classification import f_classification
from si.io.csv_file import read_csv

# Load the Iris dataset
iris_dataset = read_csv(filename="../datasets/iris/iris.csv", features=True, label=True)

# Print dataset information
print("Features:", iris_dataset.features)
print("X shape:", iris_dataset.X.shape)
print("y shape:", iris_dataset.y.shape)

# Initialize SelectPercentile with f_classification and percentile=50
selector = SelectPercentile(score_func=f_classification, percentile=50)

# Fit the selector to the dataset
selector.fit(iris_dataset)

# Print F-values and p-values
print("F-values:", selector.F)
print("p-values:", selector.p)

# Transform the dataset to select the top 50% of features
transformed_dataset = selector.transform(iris_dataset)

# Print the transformed dataset information
print("Selected features:", transformed_dataset.features)
print("X shape after transform:", transformed_dataset.X.shape)
print("Selected X:\n", transformed_dataset.X)

# Test with 100% percentile (all features)
selector_100 = SelectPercentile(score_func=f_classification, percentile=100)
selector_100.fit(iris_dataset)
transformed_dataset_100 = selector_100.transform(iris_dataset)
print("\nAll features selected (100%):", transformed_dataset_100.features)
print("X shape:", transformed_dataset_100.X.shape)

# Test with 0% percentile (no features)
selector_0 = SelectPercentile(score_func=f_classification, percentile=0)
selector_0.fit(iris_dataset)
transformed_dataset_0 = selector_0.transform(iris_dataset)
print("\nNo features selected (0%):", transformed_dataset_0.features)
print("X shape:", transformed_dataset_0.X.shape)

# Verify that the number of selected features matches the expected number
expected_num_features = int(iris_dataset.X.shape[1] * 0.5)
print(f"\nExpected number of features for 50%: {expected_num_features}")
print(f"Actual number of features selected: {len(transformed_dataset.features)}")


Features: Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], dtype='object')
X shape: (150, 4)
y shape: (150,)
F-values: [ 119.26450218   47.3644614  1179.0343277   959.32440573]
p-values: [1.66966919e-31 1.32791652e-16 3.05197580e-91 4.37695696e-85]
Selected features: ['petal_length', 'petal_width']
X shape after transform: (150, 2)
Selected X:
 [[1.4 0.2]
 [1.4 0.2]
 [1.3 0.2]
 [1.5 0.2]
 [1.4 0.2]
 [1.7 0.4]
 [1.4 0.3]
 [1.5 0.2]
 [1.4 0.2]
 [1.5 0.1]
 [1.5 0.2]
 [1.6 0.2]
 [1.4 0.1]
 [1.1 0.1]
 [1.2 0.2]
 [1.5 0.4]
 [1.3 0.4]
 [1.4 0.3]
 [1.7 0.3]
 [1.5 0.3]
 [1.7 0.2]
 [1.5 0.4]
 [1.  0.2]
 [1.7 0.5]
 [1.9 0.2]
 [1.6 0.2]
 [1.6 0.4]
 [1.5 0.2]
 [1.4 0.2]
 [1.6 0.2]
 [1.6 0.2]
 [1.5 0.4]
 [1.5 0.1]
 [1.4 0.2]
 [1.5 0.1]
 [1.2 0.2]
 [1.3 0.2]
 [1.5 0.1]
 [1.3 0.2]
 [1.5 0.2]
 [1.3 0.3]
 [1.3 0.3]
 [1.3 0.2]
 [1.6 0.6]
 [1.9 0.4]
 [1.4 0.3]
 [1.6 0.2]
 [1.4 0.2]
 [1.5 0.2]
 [1.4 0.2]
 [4.7 1.4]
 [4.5 1.5]
 [4.9 1.5]
 [4.  1.3]
 [4.6 1.5]
 [4.5 1.3]
 [4.7 1.6]
 [3.3