In [None]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Ridge

In [None]:
# Names of the target columns. They are dropped from the feature matrix in
# make_train_set, and one model is fitted per entry in the training loops.
ALL_TARGETS = ["Ca", "P", "pH", "SOC", "Sand"]

def read_data(input_dir="/home/daniel/kaggle/afsis-soil-properties/input"):
    """Load the training, test and sample-submission CSV files.

    Parameters
    ----------
    input_dir : str or os.PathLike, optional
        Directory holding ``training.csv``, ``sorted_test.csv`` and
        ``sample_submission.csv``. The default is the location this notebook
        was originally written against, so a bare ``read_data()`` call reads
        exactly the same files as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, submission)``. ``train`` and ``test`` are passed
        through ``_preprocess`` before being returned; ``submission`` is
        returned as read from disk.
    """
    # Function-scope import keeps this block self-contained.
    import os

    train = pd.read_csv(os.path.join(input_dir, "training.csv"))
    test = pd.read_csv(os.path.join(input_dir, "sorted_test.csv"))
    submission = pd.read_csv(os.path.join(input_dir, "sample_submission.csv"))
    train, test = _preprocess(train, test)
    return train, test, submission

def _preprocess(train, test):
    train.drop("PIDN", axis=1, inplace=True)
    test.drop("PIDN", axis=1, inplace=True)
    return train, test

def make_test_set(test):
    """Build the test feature matrix from the first 3578 columns of ``test``.

    The slice is taken from a copy, so ``test`` itself is not modified. The
    original author's note describes these leading columns as the spectral
    features.

    Parameters
    ----------
    test : pandas.DataFrame
        Test frame (as returned by ``read_data``).

    Returns
    -------
    pandas.DataFrame
        The leading columns of a copy of ``test`` (all columns if there are
        fewer than 3578).
    """
    n_leading_columns = 3578
    duplicate = test.copy()
    return duplicate.iloc[:, :n_leading_columns]

def make_train_set(train, target):
    """Split ``train`` into a feature matrix and the label column for ``target``.

    Every column named in ``ALL_TARGETS`` is removed from a copy of ``train``
    and the first 3578 remaining columns are kept (described by the original
    author as the spectral features). ``train`` itself is not modified.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame containing all columns listed in ``ALL_TARGETS``.
    target : str
        Name of the column to use as the label.

    Returns
    -------
    tuple
        ``(X_train, y_train)``: the feature frame and ``train[target]``.
    """
    features = train.copy().drop(ALL_TARGETS, axis=1)
    labels = train[target]
    return features.iloc[:, :3578], labels

def run_pipeline(scaler, X_train, y_train, X_test):
    """Fit ``scaler`` followed by a default ``Ridge`` and predict on ``X_test``.

    Parameters
    ----------
    scaler : transformer
        First pipeline step (named ``'Scaler'``); fitted on ``X_train``.
    X_train, y_train
        Training features and labels handed to ``Pipeline.fit``.
    X_test
        Features to predict for.

    Returns
    -------
    The output of ``Pipeline.predict(X_test)``.
    """
    steps = [('Scaler', scaler), ('Ridge', Ridge())]
    fitted = Pipeline(steps).fit(X_train, y_train)
    return fitted.predict(X_test)

if __name__ == "__main__":
    # Fit Ridge regressors for every target in three variants -- raw features,
    # standard-scaled features and min-max-scaled features -- and write one
    # submission file per variant.
    print("Starting loading data")
    train, test, submission = read_data()
    print("Finished loading data")
    X_test = make_test_set(test)

    sub_minmax = submission.copy()
    sub_standard = submission.copy()

    for target in ALL_TARGETS:
        print(f"Target {target}")

        X_train, y_train = make_train_set(train, target)
        # P is fitted on the log(1 + y) scale and mapped back with exp(p) - 1
        # before being written out. log1p/expm1 are the numerically accurate
        # forms of those two expressions.
        is_log_target = target == "P"
        if is_log_target:
            y_train = np.log1p(y_train)

        # Variant 1: Ridge on the unscaled features.
        ridge = Ridge()
        ridge.fit(X_train, y_train)
        predictions = ridge.predict(X_test)

        # Variants 2 and 3: the same estimator behind a scaler. Fresh scaler
        # objects are created for every target.
        standard_scaler = StandardScaler()
        minmax_scaler = MinMaxScaler()
        standard_predictions = run_pipeline(standard_scaler, X_train, y_train, X_test)
        minmax_predictions = run_pipeline(minmax_scaler, X_train, y_train, X_test)

        if is_log_target:
            submission[target] = np.expm1(predictions)
            sub_standard[target] = np.expm1(standard_predictions)
            sub_minmax[target] = np.expm1(minmax_predictions)
        else:
            submission[target] = predictions
            sub_standard[target] = standard_predictions
            sub_minmax[target] = minmax_predictions

    submission.to_csv("ridge_submission_logp.csv", index=False)
    sub_standard.to_csv("standard_ridge_submission_logp.csv", index=False)
    sub_minmax.to_csv("minmax_ridge_submission_logp.csv", index=False)

# SVR

In [None]:
if __name__ == "__main__":
    # Fit one SVR (C=10000) per target on the unscaled features and write the
    # predictions to svr_submission.csv.
    print("Starting loading data")
    train, test, submission = read_data()
    print("Finished loading data")
    X_test = make_test_set(test)

    for target in ALL_TARGETS:
        print(f"Target {target}")

        X_train, y_train = make_train_set(train, target)
        # NOTE(review): unlike the Ridge and MLP sections, "P" is not
        # log-transformed here -- confirm this is intentional.
        svr = SVR(C=10000)
        svr.fit(X_train, y_train)
        predictions = svr.predict(X_test)
        submission[target] = predictions
    submission.to_csv("svr_submission.csv", index=False)

# MLP

In [None]:
# Fix TensorFlow's global random seed before any model is built.
# NOTE(review): only TensorFlow is seeded here; whether runs are fully
# repeatable depends on the TF/Keras version -- confirm if that matters.
tensorflow.random.set_seed(3)

def make_model(input_shape):
    """Build and compile a fully connected regression network.

    Architecture: Dense(512, relu) -> Dense(256, relu) -> Dense(1, linear).
    The model is compiled with the Adam optimizer, mean-absolute-error loss
    and mean-squared-error as an additional metric. No training happens here.

    Parameters
    ----------
    input_shape : tuple
        Passed as ``input_shape`` to the first ``Dense`` layer.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    stack = [
        Dense(512, input_shape=input_shape, activation='relu'),
        Dense(256, activation='relu'),
        Dense(1, activation='linear'),
    ]
    model = Sequential(stack)
    model.compile(
        optimizer='adam',
        loss='mean_absolute_error',
        metrics=['mean_squared_error'],
    )
    return model


# Train one MLP per target on standard-scaled features and write the
# predictions to a single submission file.
print("Starting loading data")
train, test, submission = read_data()
print("Finished loading data")
X_test = make_test_set(test)

# Keras History object per target, kept for later inspection.
histories = {}

for target in ALL_TARGETS:
    print(f"Target {target}")
    X_train, y_train = make_train_set(train, target)
    input_shape = (X_train.shape[1], )
    # P is fitted on the log(1 + y) scale and mapped back with exp(p) - 1.
    is_log_target = target == "P"
    if is_log_target:
        y_train = np.log1p(y_train)

    # StandardScaler.transform returns the scaled data rather than modifying
    # its argument, so the results must be kept. X_test is shared by all
    # targets and therefore must not be overwritten with its scaled version.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = make_model(input_shape)
    # NOTE(review): early stopping watches the training loss ('loss') even
    # though a validation split is used -- confirm 'val_loss' was not intended.
    callback = tensorflow.keras.callbacks.EarlyStopping(monitor='loss', patience=50)
    history = model.fit(X_train_scaled, y_train.to_numpy(), epochs=500, batch_size=64,
                        verbose=1, validation_split=0.2, callbacks=[callback])
    histories[target] = history

    # The single-unit output layer yields an (n, 1) array; flatten it before
    # storing it as a column.
    predictions = model.predict(X_test_scaled).ravel()
    if is_log_target:
        submission[target] = np.expm1(predictions)
    else:
        submission[target] = predictions
submission.to_csv("std_mlp_500epochs_L1-512_L2_256_bs64.csv", index=False)

# Ensemble

In [None]:
# Target columns that are blended in the next cell.
ALL_TARGETS = ["Ca", "P", "pH", "SOC", "Sand"]

# Per-model submission files written by the SVR, Ridge and MLP sections,
# loaded in that order.
svr, ridge, mlp = (
    pd.read_csv(csv_name)
    for csv_name in (
        "svr_submission.csv",
        "standard_ridge_submission_logp.csv",
        "std_mlp_500epochs_L1-512_L2_256_bs64.csv",
    )
)
# Not part of the blend (left disabled by the original author):
# lgbm = pd.read_csv("tuned_lgbm_spectral_features_n_calls_500.csv")

In [None]:
# Weighted average of the three models' predictions: 0.4 SVR + 0.4 Ridge + 0.2 MLP.
# Rows are combined by index label, so the three files must list samples in the same order.
blended = svr[ALL_TARGETS] * 0.4
blended = blended + ridge[ALL_TARGETS] * 0.4
blended = blended + mlp[ALL_TARGETS] * 0.2

# Start from the SVR submission so the non-target columns carry over unchanged.
sub = svr.copy()
sub[ALL_TARGETS] = blended[ALL_TARGETS]
sub.to_csv("sub_ens-04svr+04ridge+02mlp.csv", index=False)