In [8]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import DBSCAN
from scipy import stats

# Load dataset
file_path = "dataset/agriculture_dataset.csv"
df = pd.read_csv(file_path)


In [11]:

# Preprocessing function to handle missing values and scale data
def preprocess_data(df, columns):
    # Impute missing values
    imputer = SimpleImputer(strategy='mean')  # Use 'median' if needed
    df[columns] = imputer.fit_transform(df[columns])
    
    # Scale the data
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df[columns])
    
    return scaled_data

# Function to detect outliers using IQR
def detect_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    outlier_condition = (df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR))
    df['IQR_outlier'] = np.where(outlier_condition, 1, 0)

# Function to detect outliers using Z-Score
def detect_outliers_zscore(df, column):
    z_scores = stats.zscore(df[column])
    df['ZScore_outlier'] = np.where(np.abs(z_scores) > 3, 1, 0)

# Function to detect outliers using Isolation Forest
def detect_outliers_isolation_forest(df, data):
    iso_forest = IsolationForest(contamination=0.05)
    df['IF_outlier'] = iso_forest.fit_predict(data)

# Function to detect outliers using DBSCAN
def detect_outliers_dbscan(df, data):
    dbscan = DBSCAN(eps=0.5, min_samples=5)
    labels = dbscan.fit_predict(data)
    df['DBSCAN_outlier'] = np.where(labels == -1, 1, 0)

# Function to detect outliers using One-Class SVM
def detect_outliers_oneclass_svm(df, data):
    svm = OneClassSVM(nu=0.05, kernel='rbf', gamma='scale')
    df['OneClassSVM_outlier'] = svm.fit_predict(data)

# Function to detect outliers using Local Outlier Factor
def detect_outliers_lof(df, data):
    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
    df['LOF_outlier'] = lof.fit_predict(data)


In [None]:

# Columns to use for outlier detection (selecting numerical columns)
numerical_columns = df.select_dtypes(include=[np.number]).columns

# Preprocess the data
scaled_data = preprocess_data(df, numerical_columns)

# Apply outlier detection methods
detect_outliers_iqr(df, 'N2O')
detect_outliers_zscore(df, 'N2O')
detect_outliers_isolation_forest(df, scaled_data)
detect_outliers_dbscan(df, scaled_data)
detect_outliers_oneclass_svm(df, scaled_data)
detect_outliers_lof(df, scaled_data)

In [12]:

# Columns to use for outlier detection (selecting numerical columns)
numerical_columns = df.select_dtypes(include=[np.number]).columns

# Preprocess the data
scaled_data = preprocess_data(df, numerical_columns)

# Apply outlier detection methods
detect_outliers_iqr(df, 'N2O')
detect_outliers_zscore(df, 'N2O')
detect_outliers_isolation_forest(df, scaled_data)
detect_outliers_dbscan(df, scaled_data)
detect_outliers_oneclass_svm(df, scaled_data)
detect_outliers_lof(df, scaled_data)

ValueError: could not convert string to float: '2/9/12'

In [6]:
df

Unnamed: 0,Date,Year,Experiment,DataUse,Replication,Month,Vegetation,VegType,N2O,N_rate,...,NO3,Clay,Sand,SOM,IQR_outlier,ZScore_outlier,IF_outlier,DBSCAN_outlier,OneClassSVM_outlier,LOF_outlier
0,2/9/12,2012.0,BCSE_KBS,Building,R1,February,Corn,Annual,3.896742,170.0,...,22.940812,62.500000,637.500000,1.174072,0,0,1,1,1,1
1,2/10/12,2012.0,BCSE_KBS,Building,R1,February,Corn,Annual,2.190218,170.0,...,22.959578,62.500000,637.500000,1.174072,0,0,1,0,1,1
2,2/18/12,2012.0,BCSE_KBS,Building,R1,February,Corn,Annual,3.542594,170.0,...,23.221928,62.500000,637.500000,1.174072,0,0,1,0,1,1
3,2/19/12,2012.0,BCSE_KBS,Building,R1,February,Corn,Annual,3.342870,170.0,...,23.271978,62.500000,637.500000,1.174072,0,0,1,0,1,1
4,3/16/12,2012.0,BCSE_KBS,Building,R1,March,Corn,Annual,2.947778,170.0,...,24.206855,62.500000,637.500000,1.174072,0,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2241,10/16/2013,2013.0,MCSE-T2,Testing,R4,October,TRIAE,Annual,1.640000,91.0,...,5.803681,128.333333,491.666667,1.806000,0,0,1,1,1,1
2242,11/15/2013,2013.0,MCSE-T2,Testing,R1,November,TRIAE,Annual,0.990000,91.0,...,5.429873,162.500000,490.375000,1.496400,0,0,1,1,1,1
2243,11/15/2013,2013.0,MCSE-T2,Testing,R2,November,TRIAE,Annual,-0.200000,91.0,...,7.239000,183.250000,432.875000,1.186800,0,0,1,1,1,1
2244,11/15/2013,2013.0,MCSE-T2,Testing,R3,November,TRIAE,Annual,0.300000,91.0,...,8.434294,169.833333,418.833333,1.823200,0,0,1,1,1,1


In [7]:
# Summarize the count of outliers and inliers for each method

def summarize_outliers(df, method_columns):
    summary = []
    for column in method_columns:
        outliers = df[column].value_counts().get(1, 0)
        inliers = df[column].value_counts().get(-1, 0) if df[column].min() == -1 else df[column].value_counts().get(0, 0)
        total = len(df[column])
        summary.append({
            'Method': column.replace('_outlier', ''),
            'Outliers': outliers,
            'Inliers': inliers,
            'Total': total
        })
    return pd.DataFrame(summary)

# List of outlier detection columns
outlier_columns = [col for col in df.columns if col.endswith('_outlier')]

# Generate the summary
summary_df = summarize_outliers(df, outlier_columns)

# import ace_tools as tools; tools.display_dataframe_to_user(name="Outlier Detection Summary", dataframe=summary_df)

summary_df


Unnamed: 0,Method,Outliers,Inliers,Total
0,IQR,251,1995,2246
1,ZScore,32,2214,2246
2,IF,2133,113,2246
3,DBSCAN,1697,549,2246
4,OneClassSVM,2134,112,2246
5,LOF,2133,113,2246
