In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.svm import OneClassSVM
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, classification_report, ConfusionMatrixDisplay, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the dataset
file_path = 'dataset/agriculture_dataset.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset for review
# data.head(),data.info(),data.describe()

In [3]:

missing_values_count = data.isna().sum()
missing_values_count

Date            0
Year            0
Experiment      0
DataUse         0
Replication     0
Month           0
Vegetation      0
VegType         0
N2O             0
N_rate          0
PP2             0
PP7             0
AirT            0
DAF_TD          0
DAF_SD          0
WFPS25cm       52
NH4            76
NO3            30
Clay            0
Sand            0
SOM             0
dtype: int64

Using the median instead of the mean for NH4 and NO3 might be more suitable because these columns exhibit skewness. If you'd like to update your script for better accuracy, you can modify it as follows:

In [4]:
# Impute missing values with the mean for WFPS25cm, and median for NH4 and NO3
data['WFPS25cm'].fillna(data['WFPS25cm'].mean(), inplace=True)  # Mean imputation
data['NH4'].fillna(data['NH4'].median(), inplace=True)  # Median imputation
data['NO3'].fillna(data['NO3'].median(), inplace=True)  # Median imputation

# Verify that there are no missing values left
missing_values_after_imputation = data.isna().sum()
missing_values_after_imputation

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['WFPS25cm'].fillna(data['WFPS25cm'].mean(), inplace=True)  # Mean imputation
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['NH4'].fillna(data['NH4'].median(), inplace=True)  # Median imputation
The behavior will change in pandas 3.0. This inplace method will never wor

Date           0
Year           0
Experiment     0
DataUse        0
Replication    0
Month          0
Vegetation     0
VegType        0
N2O            0
N_rate         0
PP2            0
PP7            0
AirT           0
DAF_TD         0
DAF_SD         0
WFPS25cm       0
NH4            0
NO3            0
Clay           0
Sand           0
SOM            0
dtype: int64

In [6]:
# Save the cleaned dataset to a new CSV file
output_file_path = 'hasil/BeforeRemoveOutliers/010_08_001.input_NaN_MEANMEDIAN_agriculture_dataset.csv'
data.to_csv(output_file_path, index=False)

output_file_path

'hasil/BeforeRemoveOutliers/010_08_001.input_NaN_MEANMEDIAN_agriculture_dataset.csv'

### StandardScaler or MinMax

In [8]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler


# Define the columns that need MinMaxScaler and StandardScaler
minmax_columns = ['N2O', 'NH4', 'NO3', 'N_rate', 'PP2', 'PP7']
standard_columns = ['WFPS25cm', 'AirT']

# Initialize the scalers
minmax_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

# Apply MinMaxScaler to the selected columns
data[minmax_columns] = minmax_scaler.fit_transform(data[minmax_columns])

# Apply StandardScaler to the selected columns
data[standard_columns] = standard_scaler.fit_transform(data[standard_columns])

# Display the first few rows of the scaled dataset to verify the changes
data.head()

# Save the scaled dataset to a new CSV file
data.to_csv('hasil/BeforeRemoveOutliers/010_08_002.scaler_agriculture_dataset.csv', index=False)


In [11]:
# Display the head of the scaled dataset
scaled_data_head = data.head()
scaled_data_head

Unnamed: 0,Date,Year,Experiment,DataUse,Replication,Month,Vegetation,VegType,N2O,N_rate,...,PP7,AirT,DAF_TD,DAF_SD,WFPS25cm,NH4,NO3,Clay,Sand,SOM
0,2/9/12,2012,BCSE_KBS,Building,R1,February,Corn,Annual,0.018838,0.798122,...,0.0,-1.518617,276,241,0.797637,0.042542,0.094616,62.5,637.5,1.174072
1,2/10/12,2012,BCSE_KBS,Building,R1,February,Corn,Annual,0.015996,0.798122,...,0.0,-1.560776,277,242,0.645128,0.042373,0.094695,62.5,637.5,1.174072
2,2/18/12,2012,BCSE_KBS,Building,R1,February,Corn,Annual,0.018248,0.798122,...,0.033219,-1.276207,285,250,1.160223,0.041596,0.095795,62.5,637.5,1.174072
3,2/19/12,2012,BCSE_KBS,Building,R1,February,Corn,Annual,0.017916,0.798122,...,0.031258,-1.70833,286,251,0.917547,0.041676,0.096005,62.5,637.5,1.174072
4,3/16/12,2012,BCSE_KBS,Building,R1,March,Corn,Annual,0.017258,0.798122,...,0.032258,0.547142,312,277,1.090364,0.038836,0.099924,62.5,637.5,1.174072
