# Exercise 1: NumPy array Indexing/Slicing

**Ex 1.1:** Load the "iris.csv" using the appropriate method for this file type (use the new functions from the package)

In [72]:
import os

import pandas as pd

# Location of the iris CSV file. The default is the original author's local
# path; set the SI_IRIS_CSV environment variable to run the notebook on
# another machine without editing this cell.
# NOTE(review): the exercise text asks for the reader function from the
# project package rather than pandas - confirm which function is intended.
IRIS_CSV_PATH = os.environ.get(
    "SI_IRIS_CSV",
    r'C:\Users\35191\Documents\GitHub\si\datasets\iris\iris.csv',
)

iris_df = pd.read_csv(IRIS_CSV_PATH)
print(iris_df.head())

   sepal_length  sepal_width  petal_length  petal_width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


**Ex 1.2:** Select the penultimate independent variable. What is the dimension of the resulting array?

In [73]:
# The last column of iris_df is the 'class' label, so it is dropped first;
# the penultimate independent variable is then the second-to-last feature
# column (petal_length), not the second-to-last column of the whole frame.
penultimate_variable = iris_df.iloc[:, :-1].iloc[:, -2]
print("Dimension of the resulting array:", penultimate_variable.shape)

Dimension of the resulting array: (150,)


**Ex 1.3:** Select the last 10 samples from the iris dataset. What is the mean of the last 10 samples for each
independent variable/feature?

In [74]:
# Keep only the feature columns (everything but the trailing 'class' label),
# then take the final 10 rows and average each feature over them.
feature_columns = iris_df.iloc[:, :-1]
last10 = feature_columns.iloc[-10:]
mean_last10 = last10.mean()
print(
    "Mean of the last 10 samples for each independent variable/feature:\n",
    mean_last10,
)

Mean of the last 10 samples for each independent variable/feature:
 sepal_length    6.45
sepal_width     3.03
petal_length    5.33
petal_width     2.17
dtype: float64


**Ex 1.4:** Select all samples from the dataset with values less than or equal to 6 for all independent variables/features. How many samples do you obtain?

In [75]:
# Boolean mask: True for rows where every feature (all columns except the
# trailing 'class' label) is less than or equal to 6.
features_at_most_6 = iris_df.iloc[:, :-1] <= 6
row_mask = features_at_most_6.all(axis=1)

filtered_samples = iris_df[row_mask]
num_samples = len(filtered_samples)
print(
    "Number of samples with values less than or equal to 6 for all independent variables/features:",
    num_samples,
)

Number of samples with values less than or equal to 6 for all independent variables/features: 89


**Ex 1.5:** Select all samples with a class/label different from 'Iris-setosa'. How many samples do you obtain?

In [76]:
# Keep every row whose label is not 'Iris-setosa'. `samples` stays a DataFrame
# and the count is derived from it, instead of re-running the filter and
# overwriting `samples` with an integer.
samples = iris_df[iris_df['class'] != 'Iris-setosa']
num_samples = samples.shape[0]
print("Number of samples with a class/label different from 'Iris-setosa':", num_samples)

Number of samples with a class/label different from 'Iris-setosa': 100


# **Exercise 2**

Examples of how to use the `dropna`, `fillna` and `remove_by_index` methods of the `Dataset` class.

In [77]:
import numpy as np
from si.data.dataset import Dataset

In [78]:
# Build a Dataset from the last 10 iris samples plus one extra sample that
# contains NaN values, so the missing-value methods have something to act on.
new_row = np.array([np.nan, 3., 5.6, np.nan])  # features of the extra sample
new_y = np.array(['Iris-setosa'])              # label of the extra sample

X = np.vstack([last10.values, new_row])
y = np.append(iris_df.iloc[-10:, -1].values, new_y)
dataset = Dataset(X, y)

print("Original Dataset:")
print("X:", dataset.X)
print("y:", dataset.y)

Original Dataset:
X: [[6.7 3.1 5.6 2.4]
 [6.9 3.1 5.1 2.3]
 [5.8 2.7 5.1 1.9]
 [6.8 3.2 5.9 2.3]
 [6.7 3.3 5.7 2.5]
 [6.7 3.  5.2 2.3]
 [6.3 2.5 5.  1.9]
 [6.5 3.  5.2 2. ]
 [6.2 3.4 5.4 2.3]
 [5.9 3.  5.1 1.8]
 [nan 3.  5.6 nan]]
y: ['Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-setosa']


In [79]:
# Flag the dataset if at least one entry of the feature matrix is NaN.
has_missing_values = np.isnan(dataset.X).any()
if has_missing_values:
    print("The dataset contains missing values.")

The dataset contains missing values.


In [80]:
# Work on a copy of the arrays so that `dataset` is left untouched by dropna.
# NOTE(review): presumably dropna modifies the Dataset it is called on - confirm.
X_copy, y_copy = (np.copy(arr) for arr in (dataset.X, dataset.y))
dataset_copy = Dataset(X_copy, y_copy)

print("Dataset copy: \n", dataset_copy.X)
print(dataset_copy.y)

# Drop every sample that contains a missing value using the dropna method
cleaned_dataset = dataset_copy.dropna()
print("\n X after dropna:\n", cleaned_dataset.X)
print("y after dropna:", cleaned_dataset.y)

Dataset copy: 
 [[6.7 3.1 5.6 2.4]
 [6.9 3.1 5.1 2.3]
 [5.8 2.7 5.1 1.9]
 [6.8 3.2 5.9 2.3]
 [6.7 3.3 5.7 2.5]
 [6.7 3.  5.2 2.3]
 [6.3 2.5 5.  1.9]
 [6.5 3.  5.2 2. ]
 [6.2 3.4 5.4 2.3]
 [5.9 3.  5.1 1.8]
 [nan 3.  5.6 nan]]
['Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-setosa']

 X after dropna:
 [[6.7 3.1 5.6 2.4]
 [6.9 3.1 5.1 2.3]
 [5.8 2.7 5.1 1.9]
 [6.8 3.2 5.9 2.3]
 [6.7 3.3 5.7 2.5]
 [6.7 3.  5.2 2.3]
 [6.3 2.5 5.  1.9]
 [6.5 3.  5.2 2. ]
 [6.2 3.4 5.4 2.3]
 [5.9 3.  5.1 1.8]]
y after dropna: ['Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica']


In [81]:
# Replace missing values with a constant (10.0) using the fillna method.
# NOTE(review): if the feature mean is wanted instead, check which argument
# fillna expects for that - the call below fills with a fixed number.
fill_value = 10.0
filled_dataset = dataset.fillna(fill_value)
print("X after fillna:\n", filled_dataset.X)
print("y after fillna:\n", filled_dataset.y)


X after fillna:
 [[ 6.7  3.1  5.6  2.4]
 [ 6.9  3.1  5.1  2.3]
 [ 5.8  2.7  5.1  1.9]
 [ 6.8  3.2  5.9  2.3]
 [ 6.7  3.3  5.7  2.5]
 [ 6.7  3.   5.2  2.3]
 [ 6.3  2.5  5.   1.9]
 [ 6.5  3.   5.2  2. ]
 [ 6.2  3.4  5.4  2.3]
 [ 5.9  3.   5.1  1.8]
 [10.   3.   5.6 10. ]]
y after fillna:
 ['Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-setosa']


In [82]:
# Remove the last sample (index -1) using the remove_by_index method
removed_dataset = filled_dataset.remove_by_index(-1)
# Report the shape of the dataset returned by remove_by_index itself, rather
# than relying on `dataset` having been modified in place as a side effect.
print("Size of the modified dataset:", removed_dataset.X.shape)
print("X after remove_by_index:\n", removed_dataset.X)
print("y after remove_by_index:\n", removed_dataset.y)


Size of the modified dataset: (10, 4)
X after remove_by_index:
 [[6.7 3.1 5.6 2.4]
 [6.9 3.1 5.1 2.3]
 [5.8 2.7 5.1 1.9]
 [6.8 3.2 5.9 2.3]
 [6.7 3.3 5.7 2.5]
 [6.7 3.  5.2 2.3]
 [6.3 2.5 5.  1.9]
 [6.5 3.  5.2 2. ]
 [6.2 3.4 5.4 2.3]
 [5.9 3.  5.1 1.8]]
y after remove_by_index:
 ['Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica']
