# Hyperopt: локальный и распределённый подбор гиперпараметров

In [1]:
#!pip install findspark

In [2]:
import findspark
# Called before `import pyspark` in the next cell; findspark.init() is expected to make the
# local Spark installation importable from this kernel.
findspark.init()

In [3]:
import pyspark

In [4]:
import numpy as np 
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier 
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler 
from hyperopt import tpe, hp, fmin, STATUS_OK,Trials
from hyperopt.pyll.base import scope

import warnings
warnings.filterwarnings("ignore")

In [5]:
# !pip install hyperopt

In [6]:
# Load the training CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('mob_price_data/train.csv')

In [7]:
df

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,...,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,8,...,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,5,...,336,670,869,18,10,19,1,1,1,0


In [8]:
# Separate the feature matrix (every column except the target) from the target vector.
X = df.drop(columns="price_range").to_numpy()
y = df["price_range"].to_numpy()

In [9]:
y

array([1, 2, 2, ..., 3, 0, 3])

In [10]:
# Standardize the feature columns.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the cross-validation
# splits made later inside the objective function; confirm this is acceptable for the demo.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

## Локальное обучение с помощью HyperOpt

In [11]:
# Search space for the random-forest hyperparameters.
space = dict(
    # one of 100, 200, ..., 600 trees
    n_estimators=hp.choice("n_estimators", list(range(100, 601, 100))),
    # sampled as a float (e.g. 14.0); the objective casts it to int before use
    max_depth=hp.quniform("max_depth", 1, 15, 1),
    criterion=hp.choice("criterion", ["gini", "entropy"]),
)

In [12]:
# define objective function

def hyperparameter_tuning(params):
    """Hyperopt objective: cross-validated accuracy of a random forest.

    Reads the module-level ``X_scaled`` and ``y`` arrays defined in earlier cells.

    Args:
        params: mapping of ``RandomForestClassifier`` keyword arguments sampled
            from the search space. ``max_depth`` may arrive as a float and is
            cast to ``int``. The mapping itself is not modified.

    Returns:
        dict with ``"loss"`` (negative mean accuracy, because ``fmin`` minimises)
        and ``"status"`` set to ``STATUS_OK``.
    """
    # Work on a copy so the caller's dict is left untouched.
    rf_params = dict(params)
    if "max_depth" in rf_params:
        rf_params["max_depth"] = int(rf_params["max_depth"])
    clf = RandomForestClassifier(**rf_params, n_jobs=-1)

    acc = cross_val_score(clf, X_scaled, y, scoring="accuracy").mean()
    return {"loss": -acc, "status": STATUS_OK}

### Сначала запустить ячейку - потом рассказывать

In [13]:
# Trials keeps a record of every evaluation so the search history can be inspected afterwards.
trials = Trials()

# Minimise the objective over the search space with TPE, 100 evaluations in total.
best = fmin(
    hyperparameter_tuning,
    space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
)

100%|██████████| 100/100 [05:38<00:00,  3.38s/trial, best loss: -0.8895]           


In [13]:
# NOTE: for hp.choice parameters the returned dict holds the *index* of the chosen option,
# not the option itself (e.g. an 'n_estimators' of 2 means the third entry of the choices list).
# hyperopt.space_eval(space, best) maps the result back to actual values.
# 'max_depth' is a float because it is sampled with hp.quniform.
best

{'criterion': 1, 'max_depth': 14.0, 'n_estimators': 2}

In [15]:
# Full search history: one record per evaluated trial (state, result, sampled values, timestamps).
# hp.choice parameters appear under 'vals' as option indices rather than option values.
trials.trials

[{'state': 2,
  'tid': 0,
  'spec': None,
  'result': {'loss': -0.6915, 'status': 'ok'},
  'misc': {'tid': 0,
   'cmd': ('domain_attachment', 'FMinIter_Domain'),
   'workdir': None,
   'idxs': {'criterion': [0], 'max_depth': [0], 'n_estimators': [0]},
   'vals': {'criterion': [1], 'max_depth': [2.0], 'n_estimators': [4]}},
  'exp_key': None,
  'owner': None,
  'version': 0,
  'book_time': datetime.datetime(2022, 7, 18, 16, 1, 28, 492000),
  'refresh_time': datetime.datetime(2022, 7, 18, 16, 1, 32, 101000)},
 {'state': 2,
  'tid': 1,
  'spec': None,
  'result': {'loss': -0.869, 'status': 'ok'},
  'misc': {'tid': 1,
   'cmd': ('domain_attachment', 'FMinIter_Domain'),
   'workdir': None,
   'idxs': {'criterion': [1], 'max_depth': [1], 'n_estimators': [1]},
   'vals': {'criterion': [0], 'max_depth': [8.0], 'n_estimators': [5]}},
  'exp_key': None,
  'owner': None,
  'version': 0,
  'book_time': datetime.datetime(2022, 7, 18, 16, 1, 32, 104000),
  'refresh_time': datetime.datetime(2022, 7, 

## Попробуем использовать Spark

In [14]:
from pyspark.sql import SparkSession
from hyperopt import SparkTrials
import os

In [15]:
# ! python -m venv pyspark_venv
# ! source pyspark_venv/bin/activate
# ! pip install sklearn hyperopt venv-pack
# ! venv-pack -o pyspark_venv.tar.gz

In [16]:
# Python executable PySpark should use: a path inside the packed virtualenv archive, which the
# session config ships under the alias `environment` (the `#environment` suffix of the archive name).
# Set here, before the SparkSession is created in the next cell.
os.environ['PYSPARK_PYTHON'] = "./environment/bin/python"

In [17]:
# Create (or reuse) the Spark session and ship the packed virtualenv with it.
# On YARN the archive is distributed via 'spark.yarn.dist.archives'; the '#environment'
# suffix is the directory name the archive is unpacked under.
spark = (
    SparkSession.builder
    .appName("my_test_app")
    .config("spark.yarn.dist.archives", "pyspark_venv.tar.gz#environment")
    .getOrCreate()
)

In [18]:
spark

In [23]:
# X_scaled_bc = scont.broadcast(list(X_scaled))
# y_bc = scont.broadcast(list(y))

In [24]:
def hyperparameter_tuning(params):
    """Hyperopt objective used for the SparkTrials run: cross-validated RF accuracy.

    Reads the module-level ``X_scaled`` and ``y`` arrays rather than Spark
    broadcast variables.
    NOTE(review): with SparkTrials these globals presumably travel to the workers
    together with the function; for large datasets broadcasting them explicitly
    may be preferable. Confirm against the hyperopt SparkTrials documentation.

    Args:
        params: mapping of ``RandomForestClassifier`` keyword arguments sampled
            from the search space. ``max_depth`` may arrive as a float and is
            cast to ``int``. The mapping itself is not modified.

    Returns:
        dict with ``"loss"`` (negative mean accuracy, because ``fmin`` minimises)
        and ``"status"`` set to ``STATUS_OK``.
    """
    # Work on a copy so the caller's dict is left untouched.
    rf_params = dict(params)
    if "max_depth" in rf_params:
        rf_params["max_depth"] = int(rf_params["max_depth"])
    # NOTE(review): n_jobs=-1 inside trials that themselves run in parallel may
    # oversubscribe worker cores; kept as in the original.
    clf = RandomForestClassifier(**rf_params, n_jobs=-1)
    acc = cross_val_score(clf, X_scaled, y, scoring="accuracy").mean()
    return {"loss": -acc, "status": STATUS_OK}

In [None]:
# TODO: unfinished draft of a Spark UDF, kept commented out so the notebook parses and can run
# top to bottom. As written it is a SyntaxError (`df =` has no right-hand side), and `udf`,
# `ArrayType` and `x` are not defined in the visible cells of this notebook.
#
# @udf(ArrayType)
# def foo(df: pd.Series):
#     df =
#
#     return [x, x, x]

In [25]:
# SparkTrials hands trial evaluations to Spark; `parallelism=4` caps how many run concurrently.
# NOTE: this rebinds `trials`, so the history of the earlier local run is no longer reachable
# under that name.
trials = SparkTrials(parallelism=4)

In [26]:
# Same TPE search as the local run, but evaluations are dispatched through the SparkTrials object.
best_spark = fmin(
    hyperparameter_tuning,
    space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
)

100%|██████████| 100/100 [04:01<00:00,  2.41s/trial, best loss: -0.8899999999999999]

Total Trials: 100: 100 succeeded, 0 failed, 0 cancelled.





In [25]:
# NOTE: values for hp.choice parameters ('criterion', 'n_estimators') are option *indices*,
# not the options themselves; hyperopt.space_eval(space, best_spark) converts them back.
best_spark

{'criterion': 1, 'max_depth': 13.0, 'n_estimators': 3}

In [None]:
# spark.stop()