In [61]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from scipy.stats import norm


In [56]:
# Load dataset
file_path = 'dataset/agriculture_dataset.csv'
data = pd.read_csv(file_path)

# Display first few rows of the dataset to understand its structure
# print(data.head())



## Missing Value Handling

In [59]:
# Display first few rows of the dataset to understand its structure
print(data.head())

# Identify numeric and non-numeric columns
numeric_columns = data.select_dtypes(include=['number']).columns.tolist()
non_numeric_columns = data.select_dtypes(exclude=['number']).columns.tolist()

# Column containing N2O measurements (replace 'N2O' with the actual column name)
n2o_column = 'N2O'  # Update this to the correct column name if necessary

# Handle missing values
# Impute numeric columns with mean or median
numeric_imputer = SimpleImputer(strategy='mean')  # Change to 'median' if preferred
data[numeric_columns] = numeric_imputer.fit_transform(data[numeric_columns])

# Impute non-numeric columns with the most frequent value
if non_numeric_columns:
    non_numeric_imputer = SimpleImputer(strategy='most_frequent')
    data[non_numeric_columns] = non_numeric_imputer.fit_transform(data[non_numeric_columns])

# Define outlier detection methods


      Date    Year Experiment   DataUse Replication     Month Vegetation  \
0   2/9/12  2012.0   BCSE_KBS  Building          R1  February       Corn   
1  2/10/12  2012.0   BCSE_KBS  Building          R1  February       Corn   
2  2/18/12  2012.0   BCSE_KBS  Building          R1  February       Corn   
3  2/19/12  2012.0   BCSE_KBS  Building          R1  February       Corn   
4  3/16/12  2012.0   BCSE_KBS  Building          R1     March       Corn   

  VegType       N2O  N_rate  ...   PP7  AirT  DAF_TD  DAF_SD  WFPS25cm  \
0  Annual  3.896742   170.0  ...  0.00  -2.0   276.0   241.0  0.666508   
1  Annual  2.190218   170.0  ...  0.00  -2.4   277.0   242.0  0.640608   
2  Annual  3.542594   170.0  ...  8.64   0.3   285.0   250.0  0.728085   
3  Annual  3.342870   170.0  ...  8.13  -3.8   286.0   251.0  0.686872   
4  Annual  2.947778   170.0  ...  8.39  17.6   312.0   277.0  0.716221   

         NH4        NO3  Clay   Sand       SOM  
0  11.046340  22.940812  62.5  637.5  1.174072  


In [65]:
# IQR & ZScore
def iqr_zscore_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df_filtered = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df_filtered

# Isolation Forest
def isolation_forest_outliers(df, columns):
    iso_forest = IsolationForest(contamination=0.1, random_state=42)
    outliers = iso_forest.fit_predict(df[columns])
    df_filtered = df[outliers == 1]
    return df_filtered

# DBSCAN
def dbscan_outliers(df, columns):
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df[columns])
    dbscan = DBSCAN(eps=0.5, min_samples=5)
    outliers = dbscan.fit_predict(df_scaled)
    df_filtered = df[outliers != -1]
    return df_filtered

# OneClass SVM
def oneclass_svm_outliers(df, columns):
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df[columns])
    oc_svm = OneClassSVM(nu=0.1)
    outliers = oc_svm.fit_predict(df_scaled)
    df_filtered = df[outliers == 1]
    return df_filtered

# Naive Bayes
def naive_bayes_outliers(df, columns, target):
    nb = GaussianNB()
    nb.fit(df[columns], df[target])
    outlier_prob = nb.predict_proba(df[columns])[:, 1]
    df_filtered = df[outlier_prob >= 0.1]  # Threshold for outliers
    return df_filtered

# Local Outlier Factor (LOF)
def lof_outliers(df, columns):
    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
    outliers = lof.fit_predict(df[columns])
    df_filtered = df[outliers == 1]
    return df_filtered

# List of columns used for outlier detection (excluding the target)
outlier_detection_columns = numeric_columns.copy()
outlier_detection_columns.remove(n2o_column)

# Apply each outlier detection method and save the result
methods = {
    'iqr_zscore': iqr_zscore_outliers,
    'isolation_forest': isolation_forest_outliers,
    'dbscan': dbscan_outliers,
    'oneclass_svm': oneclass_svm_outliers,
    'naive_bayes': naive_bayes_outliers,
    'lof': lof_outliers
}

for method_name, method_func in methods.items():
    if method_name == 'naive_bayes':
        df_filtered = method_func(data, outlier_detection_columns, n2o_column)
    else:
        df_filtered = method_func(data, outlier_detection_columns)
    
    # Save the filtered dataframe to a CSV file
    output_path = f'hasil/{method_name}_filtered.csv'
    df_filtered.to_csv(output_path, index=False)
    print(f"Saved {method_name} filtered data to {output_path}")

Saved iqr_zscore filtered data to hasil/iqr_zscore_filtered.csv
Saved isolation_forest filtered data to hasil/isolation_forest_filtered.csv
Saved dbscan filtered data to hasil/dbscan_filtered.csv
Saved oneclass_svm filtered data to hasil/oneclass_svm_filtered.csv


ValueError: Unknown label type: (array([ -7.4152966 ,  -5.22679564,  -5.09184177, ..., 324.265     ,
       583.149     , 593.072     ]),)