### 実験
- データセット：Google Cloud Storage
- 計算環境：ローカル

### 環境確認

In [None]:
import sys
import pandas as pd
import sklearn

import joblib
from google.cloud import storage
import gcsfs

# Report the interpreter and library versions used for this experiment.
# Each entry is (label, object, attribute name); the attribute is looked up
# only when its row is printed, so rows are emitted one by one in order.
version_sources = (
    ('python version: ', sys, 'version'),
    ('pandas version: ', pd, '__version__'),
    ('sklearn version: ', sklearn, '__version__'),
    ('joblib version: ', joblib, '__version__'),
    ('Google storage version: ', storage, '__version__'),
    ('gcsfs version: ', gcsfs, '__version__'),
)
for version_label, version_owner, version_attr in version_sources:
    print(version_label, getattr(version_owner, version_attr))

### 認証情報設定

In [None]:
import os

# Set the credentials: store the path of the service-account key file in the
# GOOGLE_APPLICATION_CREDENTIALS environment variable.
credentials_path = 'project_folder/credential-gbqtoaml-1c3df6d8f54e.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path

# Check the credentials: read the variable back and print it.
credentials_in_env = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')
print('Credentials from environ: {}'.format(credentials_in_env))

#### GCSデータロード

In [None]:
# Bucket name used by the data-loading cell, both to list the bucket
# contents and as the prefix of the CSV file path.
mybucketname = "mybucket-amlgcp202007"

In [None]:
# Build the DataFrame from a CSV file stored in the bucket.
# file_path: in the Cloud Console go to [Storage] > Browser > bucket name >
# folder > file, and copy the path shown in the URI.

import gcsfs

project_name = "gcpのプロジェクト名を指定します"
file_path = mybucketname + '/data/breast_cancer.csv'

# The value of GOOGLE_APPLICATION_CREDENTIALS (None when the variable is
# unset) is handed to gcsfs as its token.
credentials_token = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')
fs = gcsfs.GCSFileSystem(
    project=project_name,
    token=credentials_token,
)
# The listing result is not assigned or printed; only the call itself runs.
fs.ls(mybucketname)

# Read the CSV through a binary file handle opened on the bucket path.
with fs.open(file_path, 'rb') as csv_file:
    data = pd.read_csv(csv_file)
data.head(2)

#### ローカルでモデル作成

In [None]:
# ライブラリのインポート
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn import model_selection 
from sklearn.ensemble import RandomForestClassifier

seed = 0

# Split into training and test data.
# Features are every column except the last; the target is the last column.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
kfold = model_selection.KFold(n_splits=5)
scores = {}

# Random forest
rfc_clf = RandomForestClassifier(max_depth=5, random_state=seed)
rfc_clf.fit(X_train, y_train)

# Build the results.
# 'train_score' is the mean 5-fold cross-validation score on the TRAINING
# split. Cross-validating on X_test/y_test would reuse the hold-out set for
# both metrics and never evaluate the training data.
results = model_selection.cross_val_score(rfc_clf, X_train, y_train, cv=kfold)
scores[('Random Forest', 'train_score')] = results.mean()
# 'test_score' is the accuracy of the fitted model on the held-out test split.
scores[('Random Forest', 'test_score')] = rfc_clf.score(X_test, y_test)

# Model evaluation
print(scores)