# 1. Знайти в датасеті таргет та видалити цю колонку з датасету (видаляти за індексом)

In [2]:
# Load data
import numpy as np
from numpy.typing import NDArray
from typing import List, Tuple

def load_iris_data(url: str) -> NDArray[Tuple[float, float, float, float, bytes]]:
    """
    Loads the Iris dataset from the specified URL.

    This function downloads the Iris dataset from the given URL and loads it into a NumPy structured array.
    The dataset consists of 150 samples, each with 4 features and a class label.

    Parameters:
    ----------
    url : str
        The URL from which to download the Iris dataset.

    Returns:
    -------
    numpy.ndarray
        A structured array with the following fields:
        - 'sepal_length': float64, the length of the sepal in centimeters.
        - 'sepal_width': float64, the width of the sepal in centimeters.
        - 'petal_length': float64, the length of the petal in centimeters.
        - 'petal_width': float64, the width of the petal in centimeters.
        - 'class': bytes, the class label (species) of the iris sample.

    Examples:
    --------
    >>> url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
    >>> iris_data = load_iris_data(url)
    >>> print(iris_data[:5])
    """
    
    # Define the data types for each column
    dtype: List[Tuple[str, str]] = [
        ('sepal_length', 'f8'),
        ('sepal_width', 'f8'),
        ('petal_length', 'f8'),
        ('petal_width', 'f8'),
        ('class', 'S15')
    ]
    
    # Load data from URL
    data: NDArray[Tuple[float, float, float, float, bytes]] = np.genfromtxt(url, delimiter=',', dtype=dtype, encoding='utf-8')
    
    return data

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_data = load_iris_data(url)

# Print the first 5 rows of data for verification
print(iris_data[:5])

[(5.1, 3.5, 1.4, 0.2, b'Iris-setosa') (4.9, 3. , 1.4, 0.2, b'Iris-setosa')
 (4.7, 3.2, 1.3, 0.2, b'Iris-setosa') (4.6, 3.1, 1.5, 0.2, b'Iris-setosa')
 (5. , 3.6, 1.4, 0.2, b'Iris-setosa')]


In [3]:
def remove_target_column(data: NDArray[Tuple[float, float, float, float, bytes]]) -> NDArray[Tuple[float, float, float, float]]:
    """
    Removes the target column from the Iris dataset.

    Parameters:
    ----------
    data : numpy.ndarray
        A structured array containing the Iris dataset with the target column.

    Returns:
    -------
    numpy.ndarray
        A structured array with the target column removed.
    """
    # Define the new data types for the array without the 'class' column
    new_dtype = [('sepal_length', 'f8'), ('sepal_width', 'f8'), ('petal_length', 'f8'), ('petal_width', 'f8')]
    
    # Create a new array with the target column removed
    new_data = np.zeros(data.shape, dtype=new_dtype)
    
    for field in new_data.dtype.names:
        new_data[field] = data[field]
    
    return new_data

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_data = load_iris_data(url)

# Remove the target column
iris_data_no_target = remove_target_column(iris_data)

# Print the first 5 rows of data for verification
print(iris_data_no_target[:5])

[(5.1, 3.5, 1.4, 0.2) (4.9, 3. , 1.4, 0.2) (4.7, 3.2, 1.3, 0.2)
 (4.6, 3.1, 1.5, 0.2) (5. , 3.6, 1.4, 0.2)]


In [4]:
def load_iris_data_without_target(url: str) -> NDArray[np.float64]:
    """
    Loads the Iris dataset from the specified URL and removes the target column.

    This function downloads the Iris dataset from the given URL, removes the target column,
    and returns the data as a NumPy array with only the feature columns.

    Parameters:
    ----------
    url : str
        The URL from which to download the Iris dataset.

    Returns:
    -------
    numpy.ndarray
        A 2D array with the following columns:
        - 'sepal_length': float64, the length of the sepal in centimeters.
        - 'sepal_width': float64, the width of the sepal in centimeters.
        - 'petal_length': float64, the length of the petal in centimeters.
        - 'petal_width': float64, the width of the petal in centimeters.

    Examples:
    --------
    >>> url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
    >>> iris_data = load_iris_data_without_target(url)
    >>> print(iris_data[:5])
    """
    
    # Load data from URL, use 'U15' for the last column to handle the class labels as strings
    data = np.genfromtxt(url, delimiter=',', dtype='f8, f8, f8, f8, U15', encoding='utf-8')
    
    # Extract the feature columns (all but the last column)
    features = np.array([list(row)[:4] for row in data], dtype=np.float64)
    
    return features

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_data = load_iris_data_without_target(url)

# Print the first 5 rows of data for verification
print(iris_data[:5])


[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]


# 2. Перетворити колонки, що залишились в 2D масив (або впевнитись, що це уже 2D масив)

In [7]:
def load_iris_data_without_target(url: str) -> NDArray[np.float64]:
    """
    Loads the Iris dataset from the specified URL and removes the target column.

    This function downloads the Iris dataset from the given URL, removes the target column,
    and returns the data as a 2D NumPy array with only the feature columns.

    Parameters:
    ----------
    url : str
        The URL from which to download the Iris dataset.

    Returns:
    -------
    numpy.ndarray
        A 2D array with the following columns:
        - 'sepal_length': float64, the length of the sepal in centimeters.
        - 'sepal_width': float64, the width of the sepal in centimeters.
        - 'petal_length': float64, the length of the petal in centimeters.
        - 'petal_width': float64, the width of the petal in centimeters.

    Examples:
    --------
    >>> url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
    >>> iris_data = load_iris_data_without_target(url)
    >>> print(iris_data[:5])
    """
    
    # Load data from URL, use 'U15' for the last column to handle the class labels as strings
    data = np.genfromtxt(url, delimiter=',', dtype='f8, f8, f8, f8, U15', encoding='utf-8')
    
    # Extract the feature columns (all but the last column) and ensure it's a 2D array
    features = np.array([list(row)[:4] for row in data], dtype=np.float64)
    
    # Ensure the resulting array is 2D
    if features.ndim != 2:
        raise ValueError("The resulting array is not 2D")
    else:
        print("The resulting array is 2D")
    
    return features

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_data = load_iris_data_without_target(url)

# Print the first 5 rows of data for verification
print(iris_data[:5])


The resulting array is 2D
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]


# 3. Порахувати mean, median, standard deviation для 1-ї колонки

In [8]:
def calculate_statistics(data: NDArray[np.float64], column_index: int) -> Tuple[float, float, float]:
    """
    Calculates mean, median, and standard deviation for a specified column in the dataset.

    Parameters:
    ----------
    data : numpy.ndarray
        A 2D array containing the dataset.
    column_index : int
        The index of the column for which to calculate the statistics.

    Returns:
    -------
    Tuple[float, float, float]
        A tuple containing the mean, median, and standard deviation of the specified column.
    """
    column_data = data[:, column_index]
    mean = np.mean(column_data)
    median = np.median(column_data)
    std_dev = np.std(column_data)
    return mean, median, std_dev

iris_data = load_iris_data_without_target(url)

# Calculate statistics for the first column (sepal_length)
mean, median, std_dev = calculate_statistics(iris_data, 0)

print(f"Mean: {mean}")
print(f"Median: {median}")
print(f"Standard Deviation: {std_dev}")

The resulting array is 2D
Mean: 5.843333333333334
Median: 5.8
Standard Deviation: 0.8253012917851409


# 4. Вставити 20 значень np.nan на випадкові позиції в масиві (при використанні звичайного рандому можуть накластись позиції, тому знайти рішення, яке гарантує 20 унікальних позицій)

In [9]:
def insert_nan_random_positions(array: np.ndarray, num_nans: int) -> np.ndarray:
    """
    Inserts np.nan at random unique positions in the given array.

    Parameters:
    ----------
    array : numpy.ndarray
        The input array where np.nan values will be inserted.
    num_nans : int
        The number of np.nan values to insert.

    Returns:
    -------
    numpy.ndarray
        The array with np.nan values inserted at random unique positions.
    """
    # Determine unique random positions to insert np.nan
    num_elements = array.size
    random_indices = np.random.choice(num_elements, size=num_nans, replace=False)
    
    # Insert np.nan at selected positions
    array.ravel()[random_indices] = np.nan
    
    return array

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_data = load_iris_data_without_target(url)

# Create a copy of the data to avoid modifying the original loaded data
iris_data_copy = iris_data.copy()

# Insert 20 np.nan values at random unique positions
iris_data_nan = insert_nan_random_positions(iris_data_copy, num_nans=20)

# Print the modified array to verify
print(iris_data_nan)

The resulting array is 2D
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 nan 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 nan 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 nan 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 nan]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.1 1.5 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 nan 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [nan 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  nan 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4

# 5. Знайти позиції вставлених значень np.nan в 1-й колонці

In [10]:
def find_nan_positions(array: np.ndarray, column_index: int) -> np.ndarray:
    """
    Finds positions of np.nan values in the specified column of the array.

    Parameters:
    ----------
    array : numpy.ndarray
        The input array where to find np.nan values.
    column_index : int
        The index of the column in which to search for np.nan values.

    Returns:
    -------
    numpy.ndarray
        An array of indices where np.nan values are located in the specified column.
    """
    column_data = array[:, column_index]
    nan_positions = np.where(np.isnan(column_data))[0]
    return nan_positions

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_data = load_iris_data_without_target(url)

# Create a copy of the data to avoid modifying the original loaded data
iris_data_copy = iris_data.copy()

# Insert 20 np.nan values at random unique positions
iris_data_nan = insert_nan_random_positions(iris_data_copy, num_nans=20)

# Find positions of np.nan in the first column (column index 0)
nan_positions = find_nan_positions(iris_data_nan, column_index=0)

print("Positions of np.nan in the first column:")
print(nan_positions)

The resulting array is 2D
Positions of np.nan in the first column:
[  1  21  26  39  52  67 102]


# 6. Відфільтрувати массив за умовою: значення в 3-й колонці > 1.5 та значения в 1-й колонці < 5.0 (зберегти у іншу змінну)

In [11]:
def filter_data(array: np.ndarray, threshold1: float, threshold2: float) -> np.ndarray:
    """
    Filters the array based on conditions: values in the 3rd column > threshold1
    and values in the 1st column < threshold2.

    Parameters:
    ----------
    array : numpy.ndarray
        The input array to be filtered.
    threshold1 : float
        Threshold value for the 3rd column (petal_length).
    threshold2 : float
        Threshold value for the 1st column (sepal_length).

    Returns:
    -------
    numpy.ndarray
        The filtered array based on the specified conditions.
    """
    condition = (array[:, 2] > threshold1) & (array[:, 0] < threshold2)
    filtered_array = array[condition]
    return filtered_array

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_data = load_iris_data_without_target(url)

# Create a copy of the data to avoid modifying the original loaded data
iris_data_copy = iris_data.copy()

# Insert 20 np.nan values at random unique positions
iris_data_nan = insert_nan_random_positions(iris_data_copy, num_nans=20)

# Define thresholds
threshold_petal_length = 1.5
threshold_sepal_length = 5.0

# Filter the data
filtered_data = filter_data(iris_data_nan, threshold_petal_length, threshold_sepal_length)

# Print the filtered data
print("Filtered data based on conditions:")
print(filtered_data)

The resulting array is 2D
Filtered data based on conditions:
[[4.8 3.4 1.6 0.2]
 [4.8 3.4 1.9 0.2]
 [4.7 nan 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [4.9 2.4 3.3 1. ]
 [4.9 2.5 4.5 1.7]]


# 7. Замінити всі значення np.nan на 0

In [12]:
# Replace all np.nan values with 0
filtered_data_no_nan = np.nan_to_num(filtered_data, nan=0)

# Print the filtered data without NaNs replaced by 0
print("Filtered data without NaNs replaced by 0:")
print(filtered_data_no_nan)

Filtered data without NaNs replaced by 0:
[[4.8 3.4 1.6 0.2]
 [4.8 3.4 1.9 0.2]
 [4.7 0.  1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [4.9 2.4 3.3 1. ]
 [4.9 2.5 4.5 1.7]]


# 8. Порахувати всі унікальні значення в массиві та вивести їх разом із кількістю

In [14]:
# Find unique values and their counts
unique_values, counts = np.unique(filtered_data_no_nan, return_counts=True)

# Print unique values and their counts
print("Unique values and their counts:")
for value, count in zip(unique_values, counts):
    print(f"{value}: {count}")

Unique values and their counts:
0.0: 1
0.2: 4
1.0: 1
1.6: 3
1.7: 1
1.9: 1
2.4: 1
2.5: 1
3.1: 1
3.3: 1
3.4: 2
4.5: 1
4.7: 1
4.8: 3
4.9: 2


# 9. Розбити масив по вертикалі на 2 рівні частини (не використовувати абсолютні числа, мають бути два массиви по 4 колонки)

In [15]:
# Split the array vertically into two equal parts
split_arrays = np.array_split(filtered_data_no_nan, 2, axis=1)

# Print the two split arrays
print("First half of the split array:")
print(split_arrays[0])
print("\nSecond half of the split array:")
print(split_arrays[1])

First half of the split array:
[[4.8 3.4]
 [4.8 3.4]
 [4.7 0. ]
 [4.8 3.1]
 [4.9 2.4]
 [4.9 2.5]]

Second half of the split array:
[[1.6 0.2]
 [1.9 0.2]
 [1.6 0.2]
 [1.6 0.2]
 [3.3 1. ]
 [4.5 1.7]]


# 10. Відсортувати обидва массиви по 1-й колонці: 1-й за збільшенням, 2-й за зменшенням

In [16]:
# Sort the first array (ascending order by the first column)
sorted_array1 = split_arrays[0][np.argsort(split_arrays[0][:, 0])]

# Sort the second array (descending order by the first column)
sorted_array2 = split_arrays[1][np.argsort(-split_arrays[1][:, 0])]

# Print the sorted arrays
print("Sorted array 1 (ascending by the first column):")
print(sorted_array1)
print("\nSorted array 2 (descending by the first column):")
print(sorted_array2)

Sorted array 1 (ascending by the first column):
[[4.7 0. ]
 [4.8 3.4]
 [4.8 3.4]
 [4.8 3.1]
 [4.9 2.4]
 [4.9 2.5]]

Sorted array 2 (descending by the first column):
[[4.5 1.7]
 [3.3 1. ]
 [1.9 0.2]
 [1.6 0.2]
 [1.6 0.2]
 [1.6 0.2]]


# 11. Зібрати обидва массиви в одне ціле



In [17]:
# Concatenate the two sorted arrays vertically (along columns)
combined_array = np.concatenate((sorted_array1, sorted_array2), axis=1)

# Print the combined array
print("Combined array:")
print(combined_array)

Combined array:
[[4.7 0.  4.5 1.7]
 [4.8 3.4 3.3 1. ]
 [4.8 3.4 1.9 0.2]
 [4.8 3.1 1.6 0.2]
 [4.9 2.4 1.6 0.2]
 [4.9 2.5 1.6 0.2]]


# 12. Знайти найбільш часто повторюване значення в массиві



In [19]:
def most_frequent_value(array: np.ndarray):
    """
    Finds the most frequently occurring value in a NumPy array.

    Parameters:
    ----------
    array : numpy.ndarray
        The input NumPy array.

    Returns:
    -------
    object
        The most frequently occurring value in the array.

    Raises:
    ------
    ValueError
        If the input array is empty or if there is a tie for the most frequent value.
    """
    # Get unique values and their counts
    unique_values, counts = np.unique(array, return_counts=True)
    
    # Find the index of the maximum count
    max_count_index = np.argmax(counts)
    
    # Check if there is a tie for the most frequent value
    if np.sum(counts == counts[max_count_index]) > 1:
        raise ValueError("There is a tie for the most frequent value.")
    
    # Return the most frequent value
    most_frequent_value = unique_values[max_count_index]
    return most_frequent_value

# Find the most frequently occurring value
try:
    result = most_frequent_value(combined_array)
    print("Most frequent value:", result)
except ValueError as e:
    print(e)

Most frequent value: 0.2


# 13. Написати функцію, яка б множила всі значення в колонці, які менше середнього значения в цій колонці, на 2, і ділила інші значення на 4.



In [29]:
def process_column(array: np.ndarray, column_index: int):
    """
    Process a specific column in a NumPy array:
    - Multiply values less than the column mean by 2.
    - Divide other values by 4.

    Parameters:
    ----------
    array : numpy.ndarray
        The input NumPy array.
    column_index : int
        Index of the column to process.

    Returns:
    -------
    numpy.ndarray
        The processed array with modifications applied to the specified column.
    """
    # Get the specified column
    column = array[:, column_index]
    
    # Calculate the mean of the column
    column_mean = np.mean(column)
    
    # Apply the transformations
    column_processed = np.where(column < column_mean, column * 2, column / 4)
    
    # Replace the processed column back into the original array
    processed_array = array.copy()
    processed_array[:, column_index] = column_processed
    
    return processed_array


# 14. Застосувати отриману функцію до 3-ї колонки

In [30]:
processed_column = process_column(combined_array, 2)
print("\nProcessed columns:")
print(processed_column)


Processed columns:
[[4.7   0.    1.125 1.7  ]
 [4.8   3.4   0.825 1.   ]
 [4.8   3.4   3.8   0.2  ]
 [4.8   3.1   3.2   0.2  ]
 [4.9   2.4   3.2   0.2  ]
 [4.9   2.5   3.2   0.2  ]]
