## Juliaでpythonのパッケージ(sklearn)を動かしてみる

参考: https://qiita.com/Ag_smith/items/0352890b8cf3420765d0  
https://qiita.com/Miyazaki-Yu/items/51d1e29b1df5ce29bba4  
https://gist.github.com/terasakisatoshi/23b7b69af2893691a992fb7263e185c9  
https://twitter.com/genkuroki/status/1307386185942069248?s=20&t=IQHHqCVKwj0EseVkjN_MdA  
https://qiita.com/Ken-Kuroki/items/115d0679fdce1d15b9d1  

In [1]:
# 初回のインストール
using Pkg
Pkg.add("PyCall")
Pkg.add("Conda")
Pkg.add("Pandas")
Pkg.add("DataFrames")
Pkg.add("CSV")
Pkg.add("Shuffle")
Pkg.add("Metrics")

using PyCall
using Conda
Conda.update()
Conda.add("numpy")
Conda.add("scikit-learn")
Conda.add("pandas")

ENV["PYTHON"] = ENV["HOME"] * "/.julia/conda/3/bin/python3.10"
Pkg.build("PyCall")

[32m[1m    Updating[22m[39m registry at `~/.julia/registries/General.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.8/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.8/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.8/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.8/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.8/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.8/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.8/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.8/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.ju

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: /Users/smamitsu/.julia/conda/3

  added / updated specs:
    - conda


The following packages will be REMOVED:

  ninja-1.11.0-hf86a087_0


Preparing transaction: ...working... done
Verifying transaction: ...working... done
Executing transaction: ...working... done


[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mRunning `conda install -y numpy` in root environment


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mRunning `conda install -y scikit-learn` in root environment


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mRunning `conda install -y pandas` in root environment


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



[32m[1m    Building[22m[39m Conda ─→ `~/.julia/scratchspaces/44cfe95a-1eb2-52ea-b672-e2afdf69b78f/6e47d11ea2776bc5627421d59cdcc1296c058071/build.log`
[32m[1m    Building[22m[39m PyCall → `~/.julia/scratchspaces/44cfe95a-1eb2-52ea-b672-e2afdf69b78f/62f417f6ad727987c755549e9cd88c46578da562/build.log`


In [1]:
using PyCall
using CSV
using DataFrames
using Pandas: Pandas, iloc
using Shuffle
using Metrics

### Juliaで読み込んで最小限のところだけpythonにやらせるやり方

In [3]:
data_path = "../adult.csv"
jldf = CSV.read(data_path, DataFrame);

In [4]:
# train_test_splitの関数
function partitionTrainTest(data, at = 0.7)
    n = nrow(data)
    idx = shuffle(1:n)
    train_idx = view(idx, 1:floor(Int, at*n))
    test_idx = view(idx, (floor(Int, at*n)+1):n)
    data[train_idx,:], data[test_idx,:]
end
train,test = partitionTrainTest(jldf)
typeof(train)


DataFrame

In [5]:
# Julia側で分割して、pandasに変換する
X_train = train[:,1:end-1]; y_train = train[:,end]
X_test = test[:,1:end-1]; y_test = test[:,end]

# Pandas.jlの型はそのままPyObject(pandas.DataFrame)に渡すことができる。
X_train_py = Pandas.DataFrame(X_train); y_train_py = Pandas.DataFrame(y_train)
X_test_py = Pandas.DataFrame(X_test); y_test_py = Pandas.DataFrame(y_test)
typeof(X_train_py)  # JuliaのPandas.DataFrameのオブジェクトになるが、これはpythonのpandasに渡すことができる。

Pandas.DataFrame

In [6]:
# pythonに渡してfitさせる
py"""
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

def fit(X,y):
    gbc = GradientBoostingClassifier()
    gbc.fit(X,y)
    return gbc
"""

gbc = py"fit"(X_train_py, y_train_py)

  y = column_or_1d(y, warn=True)


In [7]:
# 予測
py"""
def predict(gbc,X):
    y = gbc.predict(X)
    return y
"""

y_pred = py"predict"(gbc, X_test_py)
typeof(y_pred)   # 受け取りはJuliaのオブジェクトになる。

Vector{Int64}[90m (alias for [39m[90mArray{Int64, 1}[39m[90m)[39m

In [8]:
# スコアの算出
Metrics.binary_accuracy(y_test,y_pred)

0.8710161741622876

### pythonのpandasで読み込んで、処理をする

In [9]:
# pyimportしたpandasでPyObjectとして読み込む
pandas = pyimport("pandas")
pydf = pandas.read_csv(data_path)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39.0,7,77516.0,9,13.0,4,1,1,4,1,2174.0,0.0,40.0,39,1
1,50.0,6,83311.0,9,13.0,2,4,0,4,1,0.0,0.0,13.0,39,1
2,38.0,4,215646.0,11,9.0,0,6,1,4,1,0.0,0.0,40.0,39,1
3,53.0,4,234721.0,1,7.0,2,6,0,2,1,0.0,0.0,40.0,39,1
4,28.0,4,338409.0,9,13.0,2,10,5,2,0,0.0,0.0,40.0,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39.0,4,215419.0,9,13.0,0,10,1,4,0,0.0,0.0,36.0,39,1
48838,64.0,0,321403.0,11,9.0,6,0,2,2,1,0.0,0.0,40.0,39,1
48839,38.0,4,374983.0,9,13.0,2,10,0,4,1,0.0,0.0,50.0,39,1
48840,44.0,4,83891.0,9,13.0,0,1,3,1,1,5455.0,0.0,40.0,39,1


In [10]:
typeof(pydf)

PyObject

In [11]:
# pythonでtrain_test_splitさせる
py"""
from sklearn.model_selection import train_test_split
"""
train,test = py"train_test_split"(pydf,test_size=0.7)
py"type"(train)

PyObject <class 'pandas.core.frame.DataFrame'>

In [12]:
# trainはPyObject
# <pandas.DataFrame>.iloc[:,:-1]みたいなコードは通らない。.iloc[1,1]とかならいける
# https://htmlview.glitch.me/?https://gist.githubusercontent.com/terasakisatoshi/2fc36b7eb74279172f4ffe1d929637fa/raw/083d88a1ebbe0574b923285bae5bb29cbcc06062/pandas_nanmo_wakaran.jl
# 1:3みたいな範囲なら上の記事のやり方で処理できる
py"""
import pandas as pd
def X_y(data):
    X = data.iloc[:,:-1]
    y = data.iloc[:,-1]
    return X,y
"""
X_train, y_train = py"X_y"(train)
X_test, y_test = py"X_y"(test)
py"type"(X_train)

PyObject <class 'pandas.core.frame.DataFrame'>

In [14]:
# shapeは普通に使える。ilocがバグるだけ。
_,col_num = pydf.shape
# pandas.jlでこんな感じにできる
X_train_pd = iloc(Pandas.DataFrame(train))[:,1:col_num-1]
y_train_pd = iloc(Pandas.DataFrame(train))[:,col_num]
X_test_pd = iloc(Pandas.DataFrame(test))[:,1:col_num-1]
y_test_pd = iloc(Pandas.DataFrame(test))[:,col_num]
typeof(X_train_pd)  # Pandas.jlのオブジェクト

Pandas.DataFrame

In [16]:
py"""
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

def fit_predict_score(X_train,y_train,X_test,y_test):
    gbc = GradientBoostingClassifier()
    gbc.fit(X_train,y_train)
    y_pred = gbc.predict(X_test)
    score = accuracy_score(y_pred, y_test)
    return score
"""
score = py"fit_predict_score"(X_train,y_train,X_test,y_test)

0.8642878034513015

In [17]:
score_pd = py"fit_predict_score"(X_train_pd, y_train_pd, X_test_pd, y_test_pd)

0.8643170517695232

In [23]:
# ちなみに、python側に値を渡したいときは、$xで渡せる
pyimport("sys").stdout.flush()  # pythonのprintをjupyter上に表示させる設定。スクリプトなら必要なし。
py"""
reverse_score = 1 - $score_pd
print(reverse_score)
"""

0.13568294823047677
