# Exercise 1: NumPy array Indexing/Slicing

**Ex 1.1:** Load the "iris.csv" using the appropriate method for this file type (use the new functions from the package)

In [2]:
import pandas as pd

iris_df = pd.read_csv(r'C:\Users\35191\Documents\GitHub\si\datasets\iris\iris.csv')
print(iris_df.head())

   sepal_length  sepal_width  petal_length  petal_width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


**Ex 1.2:** Select the penultimate independent variable. What is the dimension of the resulting array?

In [3]:
penultimate_variable = iris_df.iloc[:, -2]
print("Dimension of the resulting array:", penultimate_variable.shape)

Dimension of the resulting array: (150,)


**Ex 1.3:** Select the last 10 samples from the iris dataset. What is the mean of the last 10 samples for each
independent variable/feature?

In [4]:
last10 = iris_df.iloc[-10:, :-1]
mean_last10 = last10.mean()
print("Mean of the last 10 samples for each independent variable/feature:\n", mean_last10)

Mean of the last 10 samples for each independent variable/feature:
 sepal_length    6.45
sepal_width     3.03
petal_length    5.33
petal_width     2.17
dtype: float64


**Ex 1.4:** Select all samples from the dataset with values less than or equal to 6 for all independent variables/features. How many samples do you obtain?

In [5]:
filtered_samples = iris_df[(iris_df.iloc[:, :-1] <= 6).all(axis=1)]
num_samples = filtered_samples.shape[0]
print("Number of samples with values less than or equal to 6 for all independent variables/features:", num_samples)

Number of samples with values less than or equal to 6 for all independent variables/features: 89


**Ex 1.5:** Select all samples with a class/label different from 'Iris-setosa'. How many samples do you obtain?

In [6]:
samples = iris_df[iris_df['class'] != 'Iris-setosa']
num_samples = samples = iris_df[iris_df['class'] != 'Iris-setosa'].shape[0]
print("Number of samples with a class/label different from 'Iris-setosa':", num_samples)

Number of samples with a class/label different from 'Iris-setosa': 100


# **Exercise 2:** 

Examples of how to use the fillna, dropna and remove_by_index methods

In [7]:
import numpy as np
from si.data.dataset import Dataset

In [8]:
#Turning the iris dataset into a Dataset object and adding a row with NaN values
X = last10.values
new_row = np.array([np.nan, 3., 5.6, np.nan])
X = np.vstack([X, new_row])
y = iris_df.iloc[-10:, -1].values
new_y = np.array(['Iris-setosa'])
y = np.append(y, new_y)
dataset = Dataset(X, y)

print("Original Dataset:")
print("X:", dataset.X)
print("y:", dataset.y)

Original Dataset:
X: [[6.7 3.1 5.6 2.4]
 [6.9 3.1 5.1 2.3]
 [5.8 2.7 5.1 1.9]
 [6.8 3.2 5.9 2.3]
 [6.7 3.3 5.7 2.5]
 [6.7 3.  5.2 2.3]
 [6.3 2.5 5.  1.9]
 [6.5 3.  5.2 2. ]
 [6.2 3.4 5.4 2.3]
 [5.9 3.  5.1 1.8]
 [nan 3.  5.6 nan]]
y: ['Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-setosa']


In [9]:
if np.isnan(dataset.X).any() :
    print("The dataset contains missing values.")

The dataset contains missing values.


In [10]:
#Making a copy of the dataset
X_copy = np.copy(dataset.X)
y_copy = np.copy(dataset.y) 
dataset_copy = Dataset(X_copy, y_copy)

print("Dataset copy: \n", dataset_copy.X)
print(dataset_copy.y)

# Removing samples with missing values using the dropna method
cleaned_dataset = dataset_copy.dropna()
print("\n X after dropna:\n", cleaned_dataset.X)
print("y after dropna:", cleaned_dataset.y)

Dataset copy: 
 [[6.7 3.1 5.6 2.4]
 [6.9 3.1 5.1 2.3]
 [5.8 2.7 5.1 1.9]
 [6.8 3.2 5.9 2.3]
 [6.7 3.3 5.7 2.5]
 [6.7 3.  5.2 2.3]
 [6.3 2.5 5.  1.9]
 [6.5 3.  5.2 2. ]
 [6.2 3.4 5.4 2.3]
 [5.9 3.  5.1 1.8]
 [nan 3.  5.6 nan]]
['Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-setosa']

 X after dropna:
 [[6.7 3.1 5.6 2.4]
 [6.9 3.1 5.1 2.3]
 [5.8 2.7 5.1 1.9]
 [6.8 3.2 5.9 2.3]
 [6.7 3.3 5.7 2.5]
 [6.7 3.  5.2 2.3]
 [6.3 2.5 5.  1.9]
 [6.5 3.  5.2 2. ]
 [6.2 3.4 5.4 2.3]
 [5.9 3.  5.1 1.8]]
y after dropna: ['Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica']


In [11]:
# Filling missing values with the mean of the corresponding feature using the fillna method
filled_dataset = dataset.fillna(10.0)
print("X after fillna:\n", filled_dataset.X)
print("y after fillna:\n", filled_dataset.y)


X after fillna:
 [[ 6.7  3.1  5.6  2.4]
 [ 6.9  3.1  5.1  2.3]
 [ 5.8  2.7  5.1  1.9]
 [ 6.8  3.2  5.9  2.3]
 [ 6.7  3.3  5.7  2.5]
 [ 6.7  3.   5.2  2.3]
 [ 6.3  2.5  5.   1.9]
 [ 6.5  3.   5.2  2. ]
 [ 6.2  3.4  5.4  2.3]
 [ 5.9  3.   5.1  1.8]
 [10.   3.   5.6 10. ]]
y after fillna:
 ['Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-setosa']


In [12]:
# Removing the last sample using the remove_by_index method
removed_dataset = filled_dataset.remove_by_index(-1)
print("Size of the modified dataset:", dataset.X.shape)
print("X after remove_by_index:\n", removed_dataset.X)
print("y after remove_by_index:\n", removed_dataset.y)


Size of the modified dataset: (10, 4)
X after remove_by_index:
 [[6.7 3.1 5.6 2.4]
 [6.9 3.1 5.1 2.3]
 [5.8 2.7 5.1 1.9]
 [6.8 3.2 5.9 2.3]
 [6.7 3.3 5.7 2.5]
 [6.7 3.  5.2 2.3]
 [6.3 2.5 5.  1.9]
 [6.5 3.  5.2 2. ]
 [6.2 3.4 5.4 2.3]
 [5.9 3.  5.1 1.8]]
y after remove_by_index:
 ['Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica']


# **Exercise 3:** Implementing SelectPercentile

Testing the SelectPercentile class using the "iris.csv" dataset 

In [13]:
from si.io.csv_file import read_csv
from si.statistics.f_classification import f_classification
from si.feature_selection.select_percentile import SelectPercentile

In [14]:
iris_dataset = read_csv('../datasets/iris/iris.csv', features=True, label=True)

# Create an instance of the SelectPercentile class
selector = SelectPercentile(percentile=50)
# Fit the selector to the dataset
selector.fit(iris_dataset)

# Transform the dataset using the fitted model
transformed_dataset = selector.transform(iris_dataset)

# Print the results
print("Original dataset shape:", iris_dataset.X.shape)
print("Transformed dataset shape:", transformed_dataset.X.shape)

Original dataset shape: (150, 4)
Transformed dataset shape: (150, 2)


In [15]:
selector2 = SelectPercentile()
selector2.fit(iris_dataset)
transformed_dataset2 = selector.transform(iris_dataset)

print("F values:", selector2.F)
print("p values:", selector2.p)

print("Original dataset shape:", iris_dataset.X.shape)
print("Transformed dataset shape:", transformed_dataset2.X.shape)
print("Original features:\n", iris_dataset.features)
print("Selected features:\n", transformed_dataset2.features)

F values: [ 119.26450218   47.3644614  1179.0343277   959.32440573]
p values: [1.66966919e-31 1.32791652e-16 3.05197580e-91 4.37695696e-85]
Original dataset shape: (150, 4)
Transformed dataset shape: (150, 2)
Original features:
 Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], dtype='object')
Selected features:
 ['petal_width', 'petal_length']


# **Exercício 5:** PCA

Testing the PCA class in a jupyter notebook using the iris.csv dataset.

In [16]:
from si.decomposition.pca import PCA
from sklearn.decomposition import PCA as PCA_sklearn

In [17]:
iris_dataset = read_csv('../datasets/iris/iris.csv', features=True, label=True)

In [18]:
# PCA with 2 components using PCA class
np.random.seed(5)
pca = PCA(n_components=2)
pca._fit(iris_dataset.X)
X_transformed = pca._transform(iris_dataset.X)


In [19]:
# PCA with 2 components using PCA class from scikit-learn
np.random.seed(5)
pca_sklearn = PCA_sklearn(n_components=2)
pca_sklearn.fit(iris_dataset.X)
X_transformed_sklearn = pca_sklearn.transform(iris_dataset.X)

In [20]:
print("Explained variance:", pca.explained_variance)
print("Explained variance by PCA from scikit-learn:", pca_sklearn.explained_variance_)

print("\nTransformed data structure:", X_transformed.shape)
print("Transformed data structure by PCA from scikit-learn:", X_transformed_sklearn.shape)

print ("\nFirst five lines of the Transformed data:\n", X_transformed[:5])
print("First five lines of the Transformed data by PCA from scikit-learn:\n", X_transformed_sklearn[:5])

Explained variance: [0.92461621 0.05301557]
Explained variance by PCA from scikit-learn: [4.22484077 0.24224357]

Transformed data structure: (150, 2)
Transformed data structure by PCA from scikit-learn: (150, 2)

First five lines of the Transformed data:
 [[-2.68420713 -0.32660731]
 [-2.71539062  0.16955685]
 [-2.88981954  0.13734561]
 [-2.7464372   0.31112432]
 [-2.72859298 -0.33392456]]
First five lines of the Transformed data by PCA from scikit-learn:
 [[-2.68420713  0.32660731]
 [-2.71539062 -0.16955685]
 [-2.88981954 -0.13734561]
 [-2.7464372  -0.31112432]
 [-2.72859298  0.33392456]]


In [21]:
# PCA with 2 components using PCA class and normalizing the data
pca_norm = PCA(n_components=2)
pca_norm._fit(iris_dataset.X, normalization=True)
X_transformed_norm = pca_norm._transform(iris_dataset.X)


In [22]:
# PCA with 2 components using PCA class and normalizing the data using the StandardScaler
from sklearn.preprocessing import StandardScaler 
np.random.seed(5)
X_scaled = StandardScaler().fit_transform(iris_dataset.X)
pca_scaled = PCA(n_components=2)
pca_scaled._fit(X_scaled)
X_transformed_scaled = pca_scaled._transform(X_scaled)

In [24]:
print("Explained variance:", pca_norm.explained_variance)
print("Explained variance using the StandardScaler:", pca_scaled.explained_variance)

print("\nTransformed data structure (normalized):", X_transformed_norm.shape)
print("Transformed data structure using the StandardScaler:", X_transformed_scaled.shape)

print ("\nFirst five lines of the Transformed data (normalized):\n", X_transformed_norm[:5])
print("First five lines of the Transformed data using the StandardScaler:\n", X_transformed_scaled[:5])

Explained variance: [0.72770452 0.23030523]
Explained variance using the StandardScaler: [0.72770452 0.23030523]

Transformed data structure (normalized): (150, 2)
Transformed data structure using the StandardScaler: (150, 2)

First five lines of the Transformed data (normalized):
 [[-2.44159388 -0.02095745]
 [-2.41439075  0.51628447]
 [-2.62966145  0.40774632]
 [-2.53931232  0.53331485]
 [-2.52016653 -0.07628126]]
First five lines of the Transformed data using the StandardScaler:
 [[-2.26454173 -0.5057039 ]
 [-2.0864255   0.65540473]
 [-2.36795045  0.31847731]
 [-2.30419716  0.57536771]
 [-2.38877749 -0.6747674 ]]
