In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [25]:
# Name of the column used as the prediction target further down.
target = 'LU1_Desc'
# Read the LUCAS-SOIL-2018 spreadsheet; the path is relative to the notebook's working directory.
df = pd.read_excel('../LUCAS-SOIL-2018.xls')

In [26]:
# Optional filter that would keep only the points located in Sweden (NUTS_0 == 'SE').
# It is disabled here, so every row of the dataset is carried forward.
#df = df[df['NUTS_0']=='SE']
# Display (rows, columns) of the frame used by the following cells.
df.shape

(18984, 27)

In [27]:
# Some measurement columns contain text markers (e.g. '< LOD') instead of numbers.
# Each group below lists the columns to clean and the markers to blank out as NaN
# before the column is cast to float.
sentinels_by_group = (
    (['P', 'N', 'K'], ['< LOD', '<0.0']),
    (['OC', 'CaCO3', 'OC (20-30 cm)', 'CaCO3 (20-30 cm)'], ['< LOD', '<  LOD', '<0.0']),
)
for names, sentinels in sentinels_by_group:
    for col in names:
        df.loc[df[col].isin(sentinels), col] = np.nan
        df[col] = df[col].astype(float)

# Region, land-cover and land-use columns are converted to pandas categoricals.
categorical_columns = [
    'NUTS_0', 'NUTS_1', 'NUTS_2', 'NUTS_3',
    'LC', 'LU', 'LC0_Desc', 'LC1_Desc', 'LU1_Desc',
]
for col in categorical_columns:
    df[col] = df[col].astype('category')

# Trim surrounding whitespace in whatever columns are still of object dtype.
cols = df.columns
for c in cols:
    if df[c].dtype != 'object':
        continue
    df[c] = df[c].str.strip()

In [28]:
# Parse SURVEY_DATE with an explicit day/month/two-digit-year format.
df['SURVEY_DATE'] = pd.to_datetime(
    df['SURVEY_DATE'],
    format='%d/%m/%y',
)

In [29]:
# Replace spaces in column names with underscores,
# e.g. 'OC (20-30 cm)' becomes 'OC_(20-30_cm)'.
cols = [name.replace(' ', '_') for name in df.columns]
df.columns = cols

In [30]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so `df` keeps its original category labels.
df_encoded = df.copy()

# Shared encoder instance; the following cell fits it on the target column.
label_encoder = LabelEncoder()

# Keep one fitted encoder per categorical column.
# Refitting a single encoder for every column discards all mappings except the
# last one, including LC0_Desc, which is later used as a model feature and whose
# label -> code mapping is needed to prepare inputs for the trained model.
label_encoders = {}
for column in df_encoded.select_dtypes(include='category').columns:
    column_encoder = LabelEncoder()
    df_encoded[column] = column_encoder.fit_transform(df_encoded[column])
    label_encoders[column] = column_encoder

In [31]:
# Fit the encoder on the un-encoded target column and print its
# label -> integer code mapping.
label_encoder.fit(df[target])
le_name_mapping = {
    label: code
    for label, code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}
print(le_name_mapping)

{'Abandoned industrial areas': 0, 'Abandoned residential areas': 1, 'Abandoned transport areas': 2, 'Agriculture (excluding fallow land and kitchen gardens)': 3, 'Amenities, museum, leisure (e.g. parks, botanical gardens)': 4, 'Commerce': 5, 'Community services': 6, 'Construction': 7, 'Electricity, gas and thermal power distribution': 8, 'Energy production': 9, 'Fallow land': 10, 'Financial, professional and information services': 11, 'Forestry': 12, 'Kitchen gardens': 13, 'Logistics and storage': 14, 'Mining and quarrying': 15, 'Other abandoned areas': 16, 'Other primary production': 17, 'Protection infrastructures': 18, 'Railway transport': 19, 'Residential': 20, 'Road transport': 21, 'Semi-natural and natural areas not in use': 22, 'Sport': 23, 'Water supply and treatment': 24, 'Water transport': 25}


In [32]:
# Columns removed from the encoded frame before modelling.
columns_to_drop = ['OC_(20-30_cm)', 'CaCO3_(20-30_cm)', 'Ox_Al', 'Ox_Fe']

# axis=1 selects columns, so this is the same as drop(columns=columns_to_drop).
df_encoded = df_encoded.drop(columns_to_drop, axis=1)

In [33]:
import numpy as np

# Missing values in the 'P' column are filled with 0; the column stays float.
df_encoded['P'] = df_encoded['P'].fillna(0).astype(float)

In [34]:
# Missing values in the 'CaCO3' column are filled with 0; the column stays float.
df_encoded['CaCO3'] = df_encoded['CaCO3'].fillna(0).astype(float)

In [35]:
# Print the shape, drop every row that still contains a missing value in any
# column, then print the shape again to show how many rows were removed.
print(df_encoded.shape)
df_encoded = df_encoded.dropna(axis=0, how='any')
print(df_encoded.shape)

(18984, 23)
(18900, 23)


In [36]:
# Cast the encoded LC0_Desc codes to float, then list the dtype of every column.
df_encoded = df_encoded.astype({'LC0_Desc': float})
print(df_encoded.dtypes)

Depth                  object
POINTID                 int64
pH_CaCl2              float64
pH_H2O                float64
EC                    float64
OC                    float64
CaCO3                 float64
P                     float64
N                     float64
K                     float64
NUTS_0                  int32
NUTS_1                  int32
NUTS_2                  int32
NUTS_3                  int32
TH_LAT                float64
TH_LONG               float64
SURVEY_DATE    datetime64[ns]
Elev                    int64
LC                      int32
LU                      int32
LC0_Desc              float64
LC1_Desc                int32
LU1_Desc                int32
dtype: object


In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler



# Seed shared by the train/test split and the forest so that a rerun
# reproduces the same fitted model and the same scores.
SEED = 42

# Soil measurements plus the encoded LC0_Desc column are used to predict `target`.
features = ['pH_CaCl2', 'pH_H2O', 'EC', 'OC', 'CaCO3', 'P', 'N', 'K', 'LC0_Desc']
X = df_encoded[features]
y = df_encoded[target]

# Hold out 10% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=SEED
)

# Standardise the features, then fit a random forest.
# The forest is seeded explicitly: without random_state each run builds
# different trees, so the reported accuracy and the exported model would
# change between runs even though the split itself is fixed.
pipe = Pipeline([('scaler', StandardScaler()),
                 ('clr', RandomForestClassifier(random_state=SEED))])
pipe.fit(X_train, y_train)

# Predict on the held-out rows.
predictions = pipe.predict(X_test)

# Report overall accuracy and per-class precision/recall/F1.
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')
print('\nClassification Report:\n', classification_report(y_test, predictions))


In [None]:
#saving the built model
#model.save_model(f'{target}.model')

# import joblib
# joblib.dump(pipe,f'{target}.joblib')


In [None]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType, Int64TensorType
from skl2onnx import to_onnx

# Disable zipmap as it is not supported in BigQuery ML.
options = {id(pipe): {'zipmap': False}}

# Every model input is declared as its own float tensor of shape [None, 1],
# listed in the same order as the training feature columns.
input_names = (
    'pH_CaCl2', 'pH_H2O', 'EC', 'OC', 'CaCO3', 'P', 'N', 'K', 'LC0_Desc',
)
initial_types = [(name, FloatTensorType([None, 1])) for name in input_names]

# Convert the fitted pipeline to an ONNX model.
model_onnx = convert_sklearn(
    pipe,
    'pipeline_rf',
    initial_types=initial_types,
    options=options,
)

# Output file names are derived from the target column name.
onnxFileName = f'{target}_ONNX.onnx'
onnxCompressedFileName = f'{target}_ONNX.zip'

# Write the serialized model to disk.
with open(onnxFileName, "wb") as f:
    f.write(model_onnx.SerializeToString())

In [None]:
from zipfile import *
# Write the serialized model to disk so this cell does not depend on the
# file already existing.
with open(onnxFileName, "wb") as f:
    f.write(model_onnx.SerializeToString())

# Compress the model file. The context manager closes the archive as soon as
# the block exits; an unclosed ZipFile is only finalised when it is garbage
# collected, which can leave an incomplete .zip on disk.
with ZipFile(onnxCompressedFileName, "w", ZIP_DEFLATED) as archive:
    archive.write(onnxFileName)