# Train a Random Forest model with scikit-learn in a notebook

## Setup environment

In [None]:
!pip install -q -r ../requirements.txt

In [None]:
import pandas as pd
from sklearn import datasets
from sklearn import ensemble
from sklearn.model_selection import train_test_split

## Load the California Housing dataset

In [None]:
# Fetch the California Housing dataset as pandas objects and keep the
# combined frame (the "MedHouseVal" target column is used further below).
housing = datasets.fetch_california_housing(as_frame=True)
df = housing.frame

In [None]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

## Prepare California Housing dataset

In [None]:
# Split the frame into features (every column except the target) and the
# target itself, then hold out a test set. The fixed seed keeps the split
# reproducible between runs.
target_column = "MedHouseVal"
features = df.drop(columns=target_column)
target = df[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    features, target, random_state=42
)

## Train

In [None]:
# Hyperparameters for the forest; the fixed seed makes training reproducible.
rf_params = {"n_estimators": 100, "random_state": 42}

model = ensemble.RandomForestRegressor(**rf_params)
# Kept as the last expression so the notebook still displays the fitted estimator.
model.fit(X_train, y_train)

## Evaluate

In [None]:
# Score the fitted model on the held-out test split.
test_score = model.score(X_test, y_test)
print('Model score:', test_score)

## Save Model

In [None]:
import os

# Create the output directory for the serialized model. exist_ok=True makes
# the cell safe to re-run (plain `mkdir` errors when the directory already
# exists), and os.makedirs does not depend on a POSIX shell being available.
os.makedirs("model", exist_ok=True)

In [None]:
import joblib

# Serialize the fitted model to disk. dump() stays the last expression so
# the notebook still displays its return value.
model_path = './model/rf_model.joblib'
joblib.dump(model, model_path)

In [None]:
# Reload the serialized model and score it on the same test split, so the
# result can be compared with the score printed before saving.
loaded_model = joblib.load('./model/rf_model.joblib')
loaded_score = loaded_model.score(X_test, y_test)
print('Loaded model score:', loaded_score)

## Upload dataset to S3 bucket

In [None]:
import os

# Create the local directory that will hold the CSV export. exist_ok=True
# makes the cell safe to re-run (plain `mkdir` errors when the directory
# already exists), and os.makedirs does not depend on a POSIX shell.
os.makedirs("data", exist_ok=True)

In [None]:
import sagemaker

# Export the full dataset (features and target) as a comma-separated file
# without the DataFrame index column.
df.to_csv("data/housing.csv", sep=',', index=False)

# Resolve the session's default bucket and the key prefix to upload under.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
prefix = "housing-example/data/raw"

# Upload the local ./data directory to the bucket under the prefix above.
s3_location = sagemaker_session.upload_data(
    path="./data",
    bucket=bucket,
    key_prefix=prefix,
)

# Show the location returned by the upload call.
print(s3_location)