# Outlier Detection

In [12]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from spectroscopy import LeafSampleReader, DataReducer, DataCleaner, TargetScaler, BaselineCorrector, Metrics, PerformancePlotter, DataSummariser
from spectroscopy.src.utility_functions import get_working_directory, train_test_split

In [2]:
working_directory_path = get_working_directory()
leaf_samples_folder_path = f"{working_directory_path}/data/leaf_samples"
leaf_sample_reader = LeafSampleReader(leaf_samples_folder_path)
ds = DataSummariser(get_working_directory())

In [15]:
dried_df = leaf_sample_reader.read_all_csvs(leaf_state="dried")
fresh_df = leaf_sample_reader.read_all_csvs(leaf_state="fresh")

In [16]:
dried_df = DataCleaner.enforce_data_types(dried_df)
dried_df = DataCleaner.drop_null_data(dried_df, row_threshold=0.5, target_col_threshold=0.5, feature_col_threshold=0.5)
dried_df = DataCleaner.impute_data(dried_df, target_method="knn", feature_method="neighbour_avg")
fresh_df = DataCleaner.enforce_data_types(fresh_df)
fresh_df = DataCleaner.drop_null_data(fresh_df, row_threshold=0.5, target_col_threshold=0.5, feature_col_threshold=0.5)
fresh_df = DataCleaner.impute_data(fresh_df, target_method="knn", feature_method="neighbour_avg")
targets_dried_df = leaf_sample_reader.extract_targets(dried_df)
targets_fresh_df = leaf_sample_reader.extract_targets(fresh_df)

In [24]:
training_df, testing_df = train_test_split(dried_df, method="stratified")
print(training_df[training_df["season"]==4].index)
some, dried_indices_targets = DataCleaner.remove_outliers(training_df, method="both")

Index([327, 380, 356, 353, 378, 361, 283, 332, 374, 350, 288, 339, 372, 295,
       322, 310, 345, 328, 290, 340, 297, 371, 344, 354, 368, 314, 301, 349,
       342, 285, 331, 376, 336, 308, 306, 287, 381, 284, 323, 358, 292, 294,
       299, 286, 330, 281, 377, 335, 293, 337, 291, 357, 324, 319, 343, 303,
       302, 309, 307, 363, 351, 312, 379, 370, 282, 317, 329, 296, 382, 383,
       325, 320, 289, 367, 365, 359, 347, 333, 338, 300, 326, 375, 311],
      dtype='int64')
<class 'pandas.core.frame.DataFrame'>
Index: 31 entries, 29 to 5
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   PC1     31 non-null     float64
 1   PC2     31 non-null     float64
dtypes: float64(2)
memory usage: 744.0 bytes
None
<class 'pandas.core.frame.DataFrame'>
Index: 115 entries, 62 to 115
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   PC1     115 non-null    float64
 1   PC2     11

ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [8]:
dried_indices_targets

[1,
 3,
 4,
 5,
 6,
 11,
 32,
 38,
 40,
 134,
 147,
 148,
 149,
 151,
 156,
 157,
 158,
 159,
 165,
 166,
 167,
 169,
 170,
 171,
 172,
 174,
 176,
 180,
 182,
 224,
 269,
 329]

In [9]:
len(dried_indices_targets)

32

In [23]:
_, dried_indices_features = DataCleaner.remove_outliers(dried_df, method="features",feature_outlier_threshold=99)



In [24]:
dried_indices_features

[35, 165, 167, 233, 238]

In [25]:
len(dried_indices_features)

5

In [28]:
_, dried_indices_both = DataCleaner.remove_outliers(dried_df, method="both", feature_outlier_threshold=99)



In [29]:
dried_indices_both

[1,
 3,
 4,
 5,
 6,
 134,
 329,
 11,
 269,
 147,
 148,
 149,
 151,
 156,
 157,
 158,
 159,
 32,
 224,
 35,
 38,
 166,
 233,
 40,
 169,
 170,
 171,
 238,
 172,
 174,
 176,
 180,
 182]

In [30]:
len(dried_indices_both)

33

In [15]:
len(dried_df)

385

In [16]:
print(40*100/385)

10.38961038961039


In [33]:
_, fresh_indices_targets = DataCleaner.remove_outliers(fresh_df, method="targets")

In [34]:
len(fresh_indices_targets)

26

In [35]:
_, fresh_indices_features = DataCleaner.remove_outliers(fresh_df, method="features")



In [36]:
len(fresh_indices_features)

16

In [37]:
_, fresh_indices_both = DataCleaner.remove_outliers(fresh_df, method="both")



In [38]:
len(fresh_indices_both)

34