In [None]:
# Julia setup credit: @marketneutral, https://www.kaggle.com/marketneutral/julia-live-on-kaggle
# Julia version of baseline notebook: https://www.kaggle.com/junhyeok99/catboost-baseline
from datetime import datetime
import numpy as np
from IPython.display import Image, display

# Install Julia

In [None]:
%%bash
# Download a pinned Julia release, unpack it into /usr/local, and add the
# Julia packages listed below one at a time.

JULIA_VERSION="1.7.1"
JULIA_PACKAGES="
    DataFrames DataFramesMeta CSV MLLabelUtils MLJ EvoTrees
    MLJXGBoostInterface NearestNeighborModels
    MLJLinearModels LightGBM
"

# The download directory is named after the first two version components.
JULIA_VER="$(cut -d '.' -f 1-2 <<< "$JULIA_VERSION")"
ARCHIVE="/tmp/julia.tar.gz"
URL="https://julialang-s3.julialang.org/bin/linux/x64/$JULIA_VER/julia-$JULIA_VERSION-linux-x86_64.tar.gz"

wget -nv -O "$ARCHIVE" "$URL"
# Strip the top-level directory so the archive's contents land directly in /usr/local.
tar -xf "$ARCHIVE" -C /usr/local --strip-components=1
rm "$ARCHIVE"

# Unquoted on purpose: word splitting turns the list into one package per iteration.
for PKG in $JULIA_PACKAGES; do
    echo "Installing Julia package $PKG..."
    julia -e "using Pkg; pkg\"add $PKG\""
done

In [None]:
# Export the thread count through the environment so that the Julia runtime
# started in a later cell can pick it up.

import os

JULIA_THREAD_COUNT = 4
os.environ["JULIA_NUM_THREADS"] = str(JULIA_THREAD_COUNT)

# Run Julia in Notebook Cells

In [None]:
%%time
!pip install --quiet julia
import julia
from julia.api import Julia
julia.install()
jl = Julia(compiled_modules=False)  # cannot use precompiled packages with pyjulia on linux :-(
# https://pyjulia.readthedocs.io/en/latest/troubleshooting.html
%load_ext julia.magic

In [None]:
%%julia

# Show how many threads the Julia runtime is running with.
using Base.Threads
println(Threads.nthreads())

# Run Julia Code

In [None]:
%%julia
# Seed Julia's default random number generator with a fixed value.
using Random
rng_seed = 42
Random.seed!(rng_seed);

In [None]:
%%julia 
using CSV
using DataFrames
using DataFramesMeta
using Dates
using MLLabelUtils
using MLJ

In [None]:
%%julia
# Read the train and test CSV files into DataFrames and preview the first
# five training rows.
input_dir = "../input/tabular-playground-series-jan-2022"
df_train = DataFrame(CSV.File("$input_dir/train.csv"))
df_test = DataFrame(CSV.File("$input_dir/test.csv"))
first(df_train, 5)

In [None]:
%%julia
# Preprocess data: concatenate train and test so both receive identical
# feature engineering, derive calendar features from `date`, label-encode the
# categorical columns, then split the rows back into train and test.
df_all = vcat(df_train, df_test, cols=:union)

# Calendar features, added in this order as new columns.
# NOTE: `day` and `dayofmonth` hold the same values; both are kept so the
# feature set stays as it was.
date_features = ["year" => year, "month" => month, "day" => day,
                 "dayofweek" => dayofweek, "dayofmonth" => dayofmonth,
                 "dayofyear" => dayofyear]
for (feature, extract) in date_features
    df_all[!, feature] = extract.(df_all[!, "date"])
end

# Drop the identifier, the raw date and the target from the feature table.
select!(df_all, Not(["row_id", "date", "num_sold"]));

# Replace every categorical column, in place, with Float32 class indices.
# The assignment is done by column name on purpose:
# `@transform!(df_all, Col = labels)` would target a column literally named
# `Col` and leave the original string columns unencoded.
for col in [:country, :product, :store]
    classes = length(unique(df_all[:, col]))
    df_all[!, col] = convertlabel(LabelEnc.Indices(Float32, classes), df_all[:, col])
end

# The first n_train rows of df_all are the training rows, the rest are test rows.
n_train = size(df_train, 1)
x_train = df_all[1:n_train, :]
x_test = df_all[n_train+1:end, :]
y_train = df_train[:, "num_sold"]

# Sanity checks on the split.
@assert size(x_train, 1) == length(y_train)
@assert size(x_test, 1) == size(df_test, 1)

first(x_train, 5)

In [None]:
%%julia
# Load the model types, assemble a stacked ensemble, fit it on the training
# data and predict on the test data.
# Loading is extra slow in Kaggle, because none of the packages are precompiled
EvoTreeRegressor = @load EvoTreeRegressor
XGBoostRegressor = @load XGBoostRegressor
LGBMRegressor = @load LGBMRegressor
KNNRegressor = @load KNNRegressor pkg=NearestNeighborModels
RidgeRegressor = @load RidgeRegressor pkg=MLJLinearModels
ElasticNetRegressor = @load ElasticNetRegressor pkg=MLJLinearModels

# Base learners of the stack, each with its default hyper-parameters.
base_learners = (constant=ConstantRegressor(),
                 elastic=ElasticNetRegressor(),
                 knn=KNNRegressor(),
                 evo=EvoTreeRegressor(),
                 xgb=XGBoostRegressor(),
                 lgb=LGBMRegressor())

# A ridge regression combines the base learners' predictions.
# NOTE(review): `stack` is also the name of a function exported by DataFrames;
# consider renaming this variable if that function is ever needed here.
stack = Stack(; metalearner=RidgeRegressor(lambda=0.1),
                resampling=CV(),
                base_learners...)

# fit model (fit! returns the machine it trained)
mach = fit!(machine(stack, x_train, y_train); verbosity=0)

# make predictions on test data
preds = predict(mach, x_test)

In [None]:
%%julia
# Write the submission file: take the sample submission, swap its `num_sold`
# column for the model's test predictions, and save the result as CSV.
df_sub = DataFrame(CSV.File("$input_dir/sample_submission.csv"))
select!(df_sub, Not("num_sold"))  # discard the sample's placeholder values
df_sub[!, "num_sold"] = preds
CSV.write("submission.csv", df_sub)