## Dataset Analysis

In [1]:
import pandas as pd
import numpy as np
import os
# Directory prefixes. Each one ends with "/" so a file name can be appended
# to it with plain string concatenation.
_FOLDER = "data/"  # usage not visible in this part of the notebook
_FOLDER_2 = "results/"  # usage not visible in this part of the notebook
_FOLDER_3 = "datasets/"  # prefix of the filtering_scenario_*.csv files read below

## Filtration Scenario 1 Analysis

In [4]:

# Reference row counts, hard-coded as provided.
original_filtered_dataset_size = 2776  # rows left after the initial filtration
original_dataset_reduction = 225384  # rows before any filtration (a size, despite the name)

print(f"Original dataset size: {original_dataset_reduction}")
print(f"Dataset size after initial filtration: {original_filtered_dataset_size}")

# Scenario 1 files are numbered 1 to 5; report the row count of each one
# against both reference sizes.
for i in range(1, 6):
    file_name = f"filtering_scenario_1.{i}.csv"
    dataset_path = _FOLDER_3 + file_name
    filtered_dataset = pd.read_csv(dataset_path)
    filtered_dataset_size = len(filtered_dataset)

    # Percentage of the unfiltered dataset that is absent from this file.
    rows_removed = original_dataset_reduction - filtered_dataset_size
    reduction_from_original = (rows_removed / original_dataset_reduction) * 100

    # Percentage growth relative to the initially filtered dataset.
    rows_gained = filtered_dataset_size - original_filtered_dataset_size
    increase_from_original_filtered = (rows_gained / original_filtered_dataset_size) * 100

    # One call, one line per argument; the trailing "\n" leaves a blank line
    # between consecutive dataset reports.
    print(
        f"Dataset {i}:",
        f"  Size after filtration: {filtered_dataset_size}",
        f"  Reduction from original dataset: {reduction_from_original:.2f}%",
        f"  Increase from initial filtration: {increase_from_original_filtered:.2f}%\n",
        sep="\n",
    )

Original dataset size: 225384
Dataset size after initial filtration: 2776
Dataset 1:
  Size after filtration: 3667
  Reduction from original dataset: 98.37%
  Increase from initial filtration: 32.10%

Dataset 2:
  Size after filtration: 4543
  Reduction from original dataset: 97.98%
  Increase from initial filtration: 63.65%

Dataset 3:
  Size after filtration: 5436
  Reduction from original dataset: 97.59%
  Increase from initial filtration: 95.82%

Dataset 4:
  Size after filtration: 6308
  Reduction from original dataset: 97.20%
  Increase from initial filtration: 127.23%

Dataset 5:
  Size after filtration: 7145
  Reduction from original dataset: 96.83%
  Increase from initial filtration: 157.38%



## Filtration Scenario 2 Analysis

In [6]:
# Reference row counts, hard-coded as provided.
original_filtered_dataset_size = 2776  # rows left after the initial filtration
original_dataset_reduction = 225384  # rows before any filtration (a size, despite the name)

print(f"Original dataset size: {original_dataset_reduction}")
print(f"Dataset size after initial filtration: {original_filtered_dataset_size}")

# Scenario 2 files are numbered 1 to 3; report the row count of each one
# against both reference sizes.
for i in range(1, 4):
    file_name = f"filtering_scenario_2.{i}.csv"
    dataset_path = _FOLDER_3 + file_name
    filtered_dataset = pd.read_csv(dataset_path)
    filtered_dataset_size = len(filtered_dataset)

    # Percentage of the unfiltered dataset that is absent from this file.
    rows_removed = original_dataset_reduction - filtered_dataset_size
    reduction_from_original = (rows_removed / original_dataset_reduction) * 100

    # Percentage growth relative to the initially filtered dataset.
    rows_gained = filtered_dataset_size - original_filtered_dataset_size
    increase_from_original_filtered = (rows_gained / original_filtered_dataset_size) * 100

    # One call, one line per argument; the trailing "\n" leaves a blank line
    # between consecutive dataset reports.
    print(
        f"Dataset {i}:",
        f"  Size after filtration: {filtered_dataset_size}",
        f"  Reduction from original dataset: {reduction_from_original:.2f}%",
        f"  Increase from initial filtration: {increase_from_original_filtered:.2f}%\n",
        sep="\n",
    )

Original dataset size: 225384
Dataset size after initial filtration: 2776
Dataset 1:
  Size after filtration: 2956
  Reduction from original dataset: 98.69%
  Increase from initial filtration: 6.48%

Dataset 2:
  Size after filtration: 3125
  Reduction from original dataset: 98.61%
  Increase from initial filtration: 12.57%

Dataset 3:
  Size after filtration: 3276
  Reduction from original dataset: 98.55%
  Increase from initial filtration: 18.01%



## Filtration Scenario 3 Analysis

In [7]:
# Reference row counts, hard-coded as provided.
original_filtered_dataset_size = 2776  # rows left after the initial filtration
original_dataset_reduction = 225384  # rows before any filtration (a size, despite the name)

print(f"Original dataset size: {original_dataset_reduction}")
print(f"Dataset size after initial filtration: {original_filtered_dataset_size}")

# Scenario 3 files are numbered 1 to 15; report the row count of each one
# against both reference sizes.
for i in range(1, 16):
    file_name = f"filtering_scenario_3.{i}.csv"
    dataset_path = _FOLDER_3 + file_name
    filtered_dataset = pd.read_csv(dataset_path)
    filtered_dataset_size = len(filtered_dataset)

    # Percentage of the unfiltered dataset that is absent from this file.
    rows_removed = original_dataset_reduction - filtered_dataset_size
    reduction_from_original = (rows_removed / original_dataset_reduction) * 100

    # Percentage growth relative to the initially filtered dataset.
    rows_gained = filtered_dataset_size - original_filtered_dataset_size
    increase_from_original_filtered = (rows_gained / original_filtered_dataset_size) * 100

    # One call, one line per argument; the trailing "\n" leaves a blank line
    # between consecutive dataset reports.
    print(
        f"Dataset {i}:",
        f"  Size after filtration: {filtered_dataset_size}",
        f"  Reduction from original dataset: {reduction_from_original:.2f}%",
        f"  Increase from initial filtration: {increase_from_original_filtered:.2f}%\n",
        sep="\n",
    )

Original dataset size: 225384
Dataset size after initial filtration: 2776
Dataset 1:
  Size after filtration: 3902
  Reduction from original dataset: 98.27%
  Increase from initial filtration: 40.56%

Dataset 2:
  Size after filtration: 4144
  Reduction from original dataset: 98.16%
  Increase from initial filtration: 49.28%

Dataset 3:
  Size after filtration: 4356
  Reduction from original dataset: 98.07%
  Increase from initial filtration: 56.92%

Dataset 4:
  Size after filtration: 4848
  Reduction from original dataset: 97.85%
  Increase from initial filtration: 74.64%

Dataset 5:
  Size after filtration: 5153
  Reduction from original dataset: 97.71%
  Increase from initial filtration: 85.63%

Dataset 6:
  Size after filtration: 5436
  Reduction from original dataset: 97.59%
  Increase from initial filtration: 95.82%

Dataset 7:
  Size after filtration: 5788
  Reduction from original dataset: 97.43%
  Increase from initial filtration: 108.50%

Dataset 8:
  Size after filtration: 

Analysis of the three filtering scenarios showed that Scenario 1 caused a greater increase in dataset size than Scenario 2. In Scenario 1, when the limits of the first point and upper point are kept constant, every 0.01 increase in the tolerance level leads to a roughly 30% increase in the dataset (measured against the initially filtered dataset of 2776 rows). In Scenario 2, when the tolerance level is kept constant, every 0.05 increase in the lower plateau and matching decrease in the upper plateau leads to a roughly 6% increase in the dataset on the same basis.

For Filtering Scenario 3, further analysis is required as it is a combination of the last two scenarios. 

## Further Analysis of Filtering Scenario 3

In [10]:
# Parameter grid of the third scenario. The two limit lists are walked in
# lockstep, so each tolerance value is paired with three (first, last) limits.
tolerance_values = [0.06, 0.07, 0.08, 0.09, 0.10]
first_points_lower_limits = [0.75, 0.7, 0.65]
last_points_upper_limits = [0.45, 0.5, 0.55]

# Reference row counts, hard-coded as provided.
original_filtered_dataset_size = 2776
original_dataset_size = 225384

# Maps each tolerance value to the list of result records of its datasets.
tolerance_impact_details = {}

# Size every dataset is compared against. It is carried from one dataset to the
# next, including across tolerance levels, so the first dataset of a tolerance
# level is measured against the last dataset of the previous level.
previous_dataset_size = original_filtered_dataset_size

# Suffix of the next scenario-3 file to read; files are consumed in grid order
# (tolerance in the outer loop, limit pair in the inner loop).
scenario_counter = 1

for tolerance in tolerance_values:
    records_for_tolerance = []
    tolerance_impact_details[tolerance] = records_for_tolerance

    for first_limit, last_limit in zip(first_points_lower_limits, last_points_upper_limits):
        file_name = f'filtering_scenario_3.{scenario_counter}.csv'
        dataset_path = _FOLDER_3 + file_name
        filtered_dataset = pd.read_csv(dataset_path)
        filtered_dataset_size = len(filtered_dataset)

        # The very first dataset of the series is measured against the initially
        # filtered dataset; every later one against the dataset read just before it.
        comparison_base = (
            original_filtered_dataset_size if scenario_counter == 1 else previous_dataset_size
        )
        increase_from_previous = ((filtered_dataset_size - comparison_base) / comparison_base) * 100

        record = {
            "first_limit": first_limit,
            "last_limit": last_limit,
            "dataset_size": filtered_dataset_size,
            "increase_from_previous": increase_from_previous
        }
        records_for_tolerance.append(record)

        # Advance the running state for the next file.
        previous_dataset_size = filtered_dataset_size
        scenario_counter += 1

# Report the collected records, grouped by tolerance level.
for tolerance, details in tolerance_impact_details.items():
    print(f"Tolerance: {tolerance}")
    for detail in details:
        print(f"  First limit: {detail['first_limit']}, Last limit: {detail['last_limit']}, Dataset size: {detail['dataset_size']}, Increase from previous: {detail['increase_from_previous']:.2f}%")
    print("\n")

Tolerance: 0.06
  First limit: 0.75, Last limit: 0.45, Dataset size: 3902, Increase from previous: 40.56%
  First limit: 0.7, Last limit: 0.5, Dataset size: 4144, Increase from previous: 6.20%
  First limit: 0.65, Last limit: 0.55, Dataset size: 4356, Increase from previous: 5.12%


Tolerance: 0.07
  First limit: 0.75, Last limit: 0.45, Dataset size: 4848, Increase from previous: 11.29%
  First limit: 0.7, Last limit: 0.5, Dataset size: 5153, Increase from previous: 6.29%
  First limit: 0.65, Last limit: 0.55, Dataset size: 5436, Increase from previous: 5.49%


Tolerance: 0.08
  First limit: 0.75, Last limit: 0.45, Dataset size: 5788, Increase from previous: 6.48%
  First limit: 0.7, Last limit: 0.5, Dataset size: 6171, Increase from previous: 6.62%
  First limit: 0.65, Last limit: 0.55, Dataset size: 6534, Increase from previous: 5.88%


Tolerance: 0.09
  First limit: 0.75, Last limit: 0.45, Dataset size: 6733, Increase from previous: 3.05%
  First limit: 0.7, Last limit: 0.5, Dataset

: 

Some interesting discoveries were made. At lower tolerance levels, changing the upper and lower plateaus produced a smaller change in dataset size than at higher tolerance levels.