In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score, RandomizedSearchCV, GridSearchCV, GroupKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, confusion_matrix
import time
from sklearn.metrics import precision_score, make_scorer,recall_score,f1_score,roc_auc_score
import warnings
import csv
from os import listdir, chdir
from os.path import isfile, join

In [2]:
def hyperparameter_tune(base_model, parameters, kfold, X, y, groups):
    start_time = time.time()
    k = GroupKFold(n_splits=kfold)

    prec_1 = make_scorer(precision_score, pos_label=1)
    rec_1 = make_scorer(recall_score, pos_label=1)
    f1_1 = make_scorer(f1_score, pos_label=1)
    roc = make_scorer(roc_auc_score)
    prec_0 = make_scorer(precision_score, pos_label=0)
    rec_0 = make_scorer(recall_score, pos_label=0)
    f1_0 = make_scorer(f1_score, pos_label=0)

    scoring_st = {'prec_1': prec_1, 'rec_1': rec_1, 'f1_1': f1_1, 'roc': roc, 'prec_0': prec_0, 'rec_0': rec_0,
              'f1_0': f1_0}

    optimal_model = RandomizedSearchCV(base_model,
                                      param_distributions=parameters,
                                      n_iter=200,
                                      cv=k,
                                      scoring = scoring_st,
                                      n_jobs=10,
                                      refit='rec_1',
                                      verbose=3,
                                      return_train_score=True)
                                      #random_state=SEED)

    optimal_model.fit(X, y, groups)

    stop_time = time.time()
    #scores = cross_validate(optimal_model, X, y, cv=k, scoring= scoring_st, return_train_score=True, return_estimator=True)
    print("Elapsed Time:", time.strftime("%H:%M:%S", time.gmtime(stop_time - start_time)))
    print("====================")
    #print("Cross Val Mean: {:.3f}, Cross Val Stdev: {:.3f}".format(scores.mean(), scores.std()))
    print("Best Score: {:.3f}".format(optimal_model.best_score_))
    print("Best Parameters: {}".format(optimal_model.best_params_))
    return optimal_model.best_params_, optimal_model.best_score_, optimal_model.cv_results

In [3]:
mypath = '/work2/pa21/sgirtsou/production/datasets/hard_cosine_similarity'

In [4]:
allfiles = [f for f in listdir(mypath) if f.endswith('norm.csv') and f[0].isdigit()]

In [5]:
allfiles

['2016_norm.csv',
 '2015_norm.csv',
 '2014_norm.csv',
 '2017_norm.csv',
 '2012_norm.csv',
 '2011_norm.csv',
 '2010_norm.csv',
 '2018_norm.csv',
 '2013_norm.csv']

In [6]:
chdir(mypath)

In [7]:
li = []
for filename in allfiles:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)
    
frame = pd.concat(li, axis=0, ignore_index=True)

In [8]:
frame.firedate = pd.to_datetime(frame.firedate).dt.strftime('%Y%m%d')

In [9]:
df_part = frame[['id', 'firedate', 'max_temp', 'min_temp', 'mean_temp','res_max', 'dom_vel', 'rain_7days', 'dem', 'slope', 'curvature',
       'aspect', 'ndvi_new', 'evi', 'lst_day', 'lst_night', 'max_dew_temp','mean_dew_temp', 'min_dew_temp', 'fire', 'dir_max_1', 'dir_max_2',
       'dir_max_3', 'dir_max_4', 'dir_max_5', 'dir_max_6', 'dir_max_7','dir_max_8', 'dom_dir_1', 'dom_dir_2', 'dom_dir_3', 'dom_dir_4',
       'dom_dir_5', 'dom_dir_6', 'dom_dir_7', 'dom_dir_8', 'corine_111','corine_112', 'corine_121', 'corine_122', 'corine_123', 'corine_124',
       'corine_131', 'corine_132', 'corine_133', 'corine_141', 'corine_142','corine_211', 'corine_212', 'corine_213', 'corine_221', 'corine_222',
       'corine_223', 'corine_231', 'corine_241', 'corine_242', 'corine_243','corine_244', 'corine_311', 'corine_312', 'corine_313', 'corine_321',
       'corine_322', 'corine_323', 'corine_324', 'corine_331', 'corine_332','corine_333', 'corine_334', 'corine_411', 'corine_412', 'corine_421',
       'corine_422', 'corine_511', 'corine_512', 'corine_521', 'wkd_0','wkd_1', 'wkd_2', 'wkd_3', 'wkd_4', 'wkd_5', 'wkd_6', 'month_5',
       'month_6', 'month_7', 'month_8', 'month_9', 'month_4', 'frequency','f81', 'x', 'y']].copy()

X_unnorm, y_int = df_part[['max_temp', 'min_temp', 'mean_temp','res_max', 'dom_vel', 'rain_7days', 'dem', 'slope', 'curvature',
       'aspect', 'ndvi_new', 'evi', 'lst_day', 'lst_night', 'max_dew_temp','mean_dew_temp', 'min_dew_temp', 'dir_max_1', 'dir_max_2',
       'dir_max_3', 'dir_max_4', 'dir_max_5', 'dir_max_6', 'dir_max_7','dir_max_8', 'dom_dir_1', 'dom_dir_2', 'dom_dir_3', 'dom_dir_4',
       'dom_dir_5', 'dom_dir_6', 'dom_dir_7', 'dom_dir_8', 'corine_111','corine_112', 'corine_121', 'corine_122', 'corine_123', 'corine_124',
       'corine_131', 'corine_132', 'corine_133', 'corine_141', 'corine_142','corine_211', 'corine_212', 'corine_213', 'corine_221', 'corine_222',
       'corine_223', 'corine_231', 'corine_241', 'corine_242', 'corine_243','corine_244', 'corine_311', 'corine_312', 'corine_313', 'corine_321',
       'corine_322', 'corine_323', 'corine_324', 'corine_331', 'corine_332','corine_333', 'corine_334', 'corine_411', 'corine_412', 'corine_421',
       'corine_422', 'corine_511', 'corine_512', 'corine_521', 'wkd_0','wkd_1', 'wkd_2', 'wkd_3', 'wkd_4', 'wkd_5', 'wkd_6', 'month_5',
       'month_6', 'month_7', 'month_8', 'month_9', 'month_4', 'frequency','f81', 'x', 'y']], df_part['fire']

groups = frame['firedate']

#X = normalize_dataset(X_unnorm, 'std')
y = y_int

X_ = X_unnorm.values
#X_ = X.values
y_ = y.values
groupskfold = groups.values

rf = RandomForestClassifier(n_jobs=-1)
depth = [10, 20, 100, 200, 400,500, 700, 1000, 1200,2000, None]
n_estimators = [50, 100, 120, 150,170,200, 250, 350, 500, 750, 1000,1400, 1500]
min_samples_split = [2, 10, 50, 70,100,120,150,180, 200, 250,400,600,1000, 1300, 2000]
min_samples_leaf = [1, 10,30,40,50,100,120,150] #with numbers
max_features = list(range(1,X_.shape[1]))
bootstrap = [True, False]
criterion = ["gini", "entropy"]
class_weights = [{0:1,1:9},{0:1,1:300},{0:1,1:400},{0:1,1:500},{0:1,1:1000}]

In [10]:
lots_of_parameters = {
    "max_depth": depth, #depth of each tree
    "n_estimators": n_estimators, #trees of the forest
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "criterion": criterion,
    "max_features": max_features,
    "bootstrap": bootstrap,
    "class_weight": class_weights
}


best_scores = []
best_parameters = []
full_scores = []
folds = [10]#range(2, 8)

columns_sel = ['mean_test_acc','std_test_acc', 'mean_train_acc', 'std_train_acc','mean_test_AUC','std_test_AUC', 'mean_train_AUC', 'std_train_AUC',
              'mean_test_prec','std_test_prec','mean_train_prec','std_train_prec', 'mean_test_rec','std_test_rec',
              'mean_train_rec','std_train_rec', 'mean_test_f_score','std_test_f_score', 'mean_train_f_score','std_train_f_score',
               'params','folds']

results = pd.DataFrame(columns=columns_sel)

for i in folds:
    print("\ncv = ", i)
    start = time.time()
    best_params, best_score, full_scores = hyperparameter_tune(rf, lots_of_parameters, i, X_, y_, groupskfold)

    df_results = pd.DataFrame.from_dict(full_scores)
    df_results['folds'] = int(i)
    #df_results.to_csv('/home/sgirtsou/Documents/GridSearchCV/RF/RFcv_25kbalanced_noshufflestrictcriterion.csv')

    df_short.to_csv('rf_random_search.csv')


cv =  10
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
[CV] n_estimators=200, min_samples_split=180, min_samples_leaf=50, max_features=25, max_depth=100, criterion=gini, class_weight={0: 1, 1: 9}, bootstrap=False 
[CV] n_estimators=200, min_samples_split=180, min_samples_leaf=50, max_features=25, max_depth=100, criterion=gini, class_weight={0: 1, 1: 9}, bootstrap=False 
[CV] n_estimators=200, min_samples_split=180, min_samples_leaf=50, max_features=25, max_depth=100, criterion=gini, class_weight={0: 1, 1: 9}, bootstrap=False 
[CV] n_estimators=200, min_samples_split=180, min_samples_leaf=50, max_features=25, max_depth=100, criterion=gini, class_weight={0: 1, 1: 9}, bootstrap=False 
[CV] n_estimators=200, min_samples_split=180, min_samples_leaf=50, max_features=25, max_depth=100, criterion=gini, class_weight={0: 1, 1: 9}, bootstrap=False 
[CV] n_estimators=200, min_samples_split=180, min_samples_leaf=50, max_features=25, max_depth=100, criterion=gini, class_weight={0

JoblibRuntimeError: JoblibRuntimeError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/runpy.py in _run_module_as_main(mod_name='ipykernel_launcher', alter_argv=1)
    188         sys.exit(msg)
    189     main_globals = sys.modules["__main__"].__dict__
    190     if alter_argv:
    191         sys.argv[0] = mod_spec.origin
    192     return _run_code(code, main_globals, None,
--> 193                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py')
    194 
    195 def run_module(mod_name, init_globals=None,
    196                run_name=None, alter_sys=False):
    197     """Execute a module's code without importing it

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/runpy.py in _run_code(code=<code object <module> at 0x7fb2d2d6a1e0, file "/...3.6/site-packages/ipykernel_launcher.py", line 5>, run_globals={'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/apps/applications/python/anaconda3/5.0.1/lib/py...ges/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/apps/applic.../python3.6/site-packages/ipykernel/kernelapp.py'>, ...}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), pkg_name='', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x7fb2d2d6a1e0, file "/...3.6/site-packages/ipykernel_launcher.py", line 5>
        run_globals = {'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/apps/applications/python/anaconda3/5.0.1/lib/py...ges/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/apps/applic.../python3.6/site-packages/ipykernel/kernelapp.py'>, ...}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/ipykernel_launcher.py in <module>()
     11     # This is added back by InteractiveShellApp.init_path()
     12     if sys.path[0] == '':
     13         del sys.path[0]
     14 
     15     from ipykernel import kernelapp as app
---> 16     app.launch_new_instance()

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    472             return self.subapp.start()
    473         if self.poller is not None:
    474             self.poller.start()
    475         self.kernel.start()
    476         try:
--> 477             ioloop.IOLoop.instance().start()
    478         except KeyboardInterrupt:
    479             pass
    480 
    481 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    172             )
    173         return loop
    174     
    175     def start(self):
    176         try:
--> 177             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    178         except ZMQError as e:
    179             if e.errno == ETERM:
    180                 # quietly return on ETERM
    181                 pass

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    883                 self._events.update(event_pairs)
    884                 while self._events:
    885                     fd, events = self._events.popitem()
    886                     try:
    887                         fd_obj, handler_func = self._handlers[fd]
--> 888                         handler_func(fd_obj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    889                     except (OSError, IOError) as e:
    890                         if errno_from_exception(e) == errno.EPIPE:
    891                             # Happens when the client closes the connection
    892                             pass

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    272         # Fast path when there are no active contexts.
    273         def null_wrapper(*args, **kwargs):
    274             try:
    275                 current_state = _state.contexts
    276                 _state.contexts = cap_contexts[0]
--> 277                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    278             finally:
    279                 _state.contexts = current_state
    280         null_wrapper._wrapped = True
    281         return null_wrapper

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    272         # Fast path when there are no active contexts.
    273         def null_wrapper(*args, **kwargs):
    274             try:
    275                 current_state = _state.contexts
    276                 _state.contexts = cap_contexts[0]
--> 277                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    278             finally:
    279                 _state.contexts = current_state
    280         null_wrapper._wrapped = True
    281         return null_wrapper

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    278         if self.control_stream:
    279             self.control_stream.on_recv(self.dispatch_control, copy=False)
    280 
    281         def make_dispatcher(stream):
    282             def dispatcher(msg):
--> 283                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    284             return dispatcher
    285 
    286         for s in self.shell_streams:
    287             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': 'lots_of_parameters = {\n    "max_depth": depth, #...ams")\n    df_short.to_csv(\'rf_random_search.csv\')', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2021, 4, 12, 12, 38, 21, 586289, tzinfo=tzutc()), 'msg_id': '6A4618395CCC4412971D89EF39B98536', 'msg_type': 'execute_request', 'session': 'C5862273C9D244BCA09FCFD7ABDD778B', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '6A4618395CCC4412971D89EF39B98536', 'msg_type': 'execute_request', 'parent_header': {}})
    230             self.log.warn("Unknown message type: %r", msg_type)
    231         else:
    232             self.log.debug("%s: %s", msg_type, msg)
    233             self.pre_handler_hook()
    234             try:
--> 235                 handler(stream, idents, msg)
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'C5862273C9D244BCA09FCFD7ABDD778B']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': 'lots_of_parameters = {\n    "max_depth": depth, #...ams")\n    df_short.to_csv(\'rf_random_search.csv\')', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2021, 4, 12, 12, 38, 21, 586289, tzinfo=tzutc()), 'msg_id': '6A4618395CCC4412971D89EF39B98536', 'msg_type': 'execute_request', 'session': 'C5862273C9D244BCA09FCFD7ABDD778B', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '6A4618395CCC4412971D89EF39B98536', 'msg_type': 'execute_request', 'parent_header': {}}
    236             except Exception:
    237                 self.log.error("Exception in message handler:", exc_info=True)
    238             finally:
    239                 self.post_handler_hook()

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'C5862273C9D244BCA09FCFD7ABDD778B'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': 'lots_of_parameters = {\n    "max_depth": depth, #...ams")\n    df_short.to_csv(\'rf_random_search.csv\')', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2021, 4, 12, 12, 38, 21, 586289, tzinfo=tzutc()), 'msg_id': '6A4618395CCC4412971D89EF39B98536', 'msg_type': 'execute_request', 'session': 'C5862273C9D244BCA09FCFD7ABDD778B', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '6A4618395CCC4412971D89EF39B98536', 'msg_type': 'execute_request', 'parent_header': {}})
    394         if not silent:
    395             self.execution_count += 1
    396             self._publish_execute_input(code, parent, self.execution_count)
    397 
    398         reply_content = self.do_execute(code, silent, store_history,
--> 399                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    400 
    401         # Flush output before sending the reply.
    402         sys.stdout.flush()
    403         sys.stderr.flush()

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code='lots_of_parameters = {\n    "max_depth": depth, #...ams")\n    df_short.to_csv(\'rf_random_search.csv\')', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    191 
    192         self._forward_input(allow_stdin)
    193 
    194         reply_content = {}
    195         try:
--> 196             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = 'lots_of_parameters = {\n    "max_depth": depth, #...ams")\n    df_short.to_csv(\'rf_random_search.csv\')'
        store_history = True
        silent = False
    197         finally:
    198             self._restore_input()
    199 
    200         if res.error_before_exec is not None:

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/ipykernel/zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=('lots_of_parameters = {\n    "max_depth": depth, #...ams")\n    df_short.to_csv(\'rf_random_search.csv\')',), **kwargs={'silent': False, 'store_history': True})
    528             )
    529         self.payload_manager.write_payload(payload)
    530 
    531     def run_cell(self, *args, **kwargs):
    532         self._last_traceback = None
--> 533         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ('lots_of_parameters = {\n    "max_depth": depth, #...ams")\n    df_short.to_csv(\'rf_random_search.csv\')',)
        kwargs = {'silent': False, 'store_history': True}
    534 
    535     def _showtraceback(self, etype, evalue, stb):
    536         # try to preserve ordering of tracebacks and print statements
    537         sys.stdout.flush()

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell='lots_of_parameters = {\n    "max_depth": depth, #...ams")\n    df_short.to_csv(\'rf_random_search.csv\')', store_history=True, silent=False, shell_futures=True)
   2693                 self.displayhook.exec_result = result
   2694 
   2695                 # Execute the user code
   2696                 interactivity = "none" if silent else self.ast_node_interactivity
   2697                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2698                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2699                 
   2700                 self.last_execution_succeeded = not has_raised
   2701 
   2702                 # Reset this so later displayed values do not modify the

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.For object>], cell_name='<ipython-input-10-ab11823add8f>', interactivity='none', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<ExecutionResult object at 7fb282a16320, executi..._before_exec=None error_in_exec=None result=None>)
   2797 
   2798         try:
   2799             for i, node in enumerate(to_run_exec):
   2800                 mod = ast.Module([node])
   2801                 code = compiler(mod, cell_name, "exec")
-> 2802                 if self.run_code(code, result):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x7fb282a18300, file "<ipython-input-10-ab11823add8f>", line 25>
        result = <ExecutionResult object at 7fb282a16320, executi..._before_exec=None error_in_exec=None result=None>
   2803                     return True
   2804 
   2805             for i, node in enumerate(to_run_interactive):
   2806                 mod = ast.Interactive([node])

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x7fb282a18300, file "<ipython-input-10-ab11823add8f>", line 25>, result=<ExecutionResult object at 7fb282a16320, executi..._before_exec=None error_in_exec=None result=None>)
   2857         outflag = True  # happens in more places, so it's easier as default
   2858         try:
   2859             try:
   2860                 self.hooks.pre_run_code_hook()
   2861                 #rprint('Running code', repr(code_obj)) # dbg
-> 2862                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x7fb282a18300, file "<ipython-input-10-ab11823add8f>", line 25>
        self.user_global_ns = {'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'GroupKFold': <class 'sklearn.model_selection._split.GroupKFold'>, 'In': ['', 'import numpy as np\nimport pandas as pd\nfrom skle...t listdir, chdir\nfrom os.path import isfile, join', 'def hyperparameter_tune(base_model, parameters, ...timal_model.best_score_, optimal_model.cv_results', "mypath = '/work2/pa21/sgirtsou/production/datasets/hard_cosine_similarity'", "allfiles = [f for f in listdir(mypath) if f.endswith('norm.csv') and f[0].isdigit()]", 'allfiles', 'chdir(mypath)', 'li = []\nfor filename in allfiles:\n    df = pd.re...\nframe = pd.concat(li, axis=0, ignore_index=True)', "frame.firedate = pd.to_datetime(frame.firedate).dt.strftime('%Y%m%d')", "df_part = frame[['id', 'firedate', 'max_temp', '...{0:1,1:300},{0:1,1:400},{0:1,1:500},{0:1,1:1000}]", 'lots_of_parameters = {\n    "max_depth": depth, #...ams")\n    df_short.to_csv(\'rf_random_search.csv\')'], 'KFold': <class 'sklearn.model_selection._split.KFold'>, 'Out': {5: ['2016_norm.csv', '2015_norm.csv', '2014_norm.csv', '2017_norm.csv', '2012_norm.csv', '2011_norm.csv', '2010_norm.csv', '2018_norm.csv', '2013_norm.csv']}, 'RandomForestClassifier': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, 'RandomizedSearchCV': <class 'sklearn.model_selection._search.RandomizedSearchCV'>, 'StandardScaler': <class 'sklearn.preprocessing.data.StandardScaler'>, 'StratifiedKFold': <class 'sklearn.model_selection._split.StratifiedKFold'>, 'X_': array([[ 0.652     ,  0.797     ,  0.728     , ....  0.09141531,
         0.74536127,  0.90534205]]), ...}
        self.user_ns = {'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'GroupKFold': <class 'sklearn.model_selection._split.GroupKFold'>, 'In': ['', 'import numpy as np\nimport pandas as pd\nfrom skle...t listdir, chdir\nfrom os.path import isfile, join', 'def hyperparameter_tune(base_model, parameters, ...timal_model.best_score_, optimal_model.cv_results', "mypath = '/work2/pa21/sgirtsou/production/datasets/hard_cosine_similarity'", "allfiles = [f for f in listdir(mypath) if f.endswith('norm.csv') and f[0].isdigit()]", 'allfiles', 'chdir(mypath)', 'li = []\nfor filename in allfiles:\n    df = pd.re...\nframe = pd.concat(li, axis=0, ignore_index=True)', "frame.firedate = pd.to_datetime(frame.firedate).dt.strftime('%Y%m%d')", "df_part = frame[['id', 'firedate', 'max_temp', '...{0:1,1:300},{0:1,1:400},{0:1,1:500},{0:1,1:1000}]", 'lots_of_parameters = {\n    "max_depth": depth, #...ams")\n    df_short.to_csv(\'rf_random_search.csv\')'], 'KFold': <class 'sklearn.model_selection._split.KFold'>, 'Out': {5: ['2016_norm.csv', '2015_norm.csv', '2014_norm.csv', '2017_norm.csv', '2012_norm.csv', '2011_norm.csv', '2010_norm.csv', '2018_norm.csv', '2013_norm.csv']}, 'RandomForestClassifier': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, 'RandomizedSearchCV': <class 'sklearn.model_selection._search.RandomizedSearchCV'>, 'StandardScaler': <class 'sklearn.preprocessing.data.StandardScaler'>, 'StratifiedKFold': <class 'sklearn.model_selection._split.StratifiedKFold'>, 'X_': array([[ 0.652     ,  0.797     ,  0.728     , ....  0.09141531,
         0.74536127,  0.90534205]]), ...}
   2863             finally:
   2864                 # Reset our crash handler in place
   2865                 sys.excepthook = old_excepthook
   2866         except SystemExit as e:

...........................................................................
/work2/pa21/sgirtsou/production/datasets/hard_cosine_similarity/<ipython-input-10-ab11823add8f> in <module>()
     23 results = pd.DataFrame(columns=columns_sel)
     24 
     25 for i in folds:
     26     print("\ncv = ", i)
     27     start = time.time()
---> 28     best_params, best_score, full_scores = hyperparameter_tune(rf, lots_of_parameters, i, X_, y_, groupskfold)
     29 
     30     df_results = pd.DataFrame.from_dict(full_scores)
     31     df_results['folds'] = int(i)
     32     #df_results.to_csv('/home/sgirtsou/Documents/GridSearchCV/RF/RFcv_25kbalanced_noshufflestrictcriterion.csv')

...........................................................................
/work2/pa21/sgirtsou/production/datasets/hard_cosine_similarity/<ipython-input-2-fa568b5055a5> in hyperparameter_tune(base_model=RandomForestClassifier(bootstrap=True, class_wei...te=None, verbose=0,
            warm_start=False), parameters={'bootstrap': [True, False], 'class_weight': [{0: 1, 1: 9}, {0: 1, 1: 300}, {0: 1, 1: 400}, {0: 1, 1: 500}, {0: 1, 1: 1000}], 'criterion': ['gini', 'entropy'], 'max_depth': [10, 20, 100, 200, 400, 500, 700, 1000, 1200, 2000, None], 'max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...], 'min_samples_leaf': [1, 10, 30, 40, 50, 100, 120, 150], 'min_samples_split': [2, 10, 50, 70, 100, 120, 150, 180, 200, 250, 400, 600, 1000, 1300, 2000], 'n_estimators': [50, 100, 120, 150, 170, 200, 250, 350, 500, 750, 1000, 1400, 1500]}, kfold=10, X=array([[ 0.652     ,  0.797     ,  0.728     , ....  0.09141531,
         0.74536127,  0.90534205]]), y=array([ 1.,  1.,  1., ...,  0.,  0.,  0.]), groups=array(['20160822', '20160822', '20160721', ..., ...8', '20130805',
       '20130727'], dtype=object))
     22                                       refit='rec_1',
     23                                       verbose=3,
     24                                       return_train_score=True)
     25                                       #random_state=SEED)
     26 
---> 27     optimal_model.fit(X, y, groups)
     28 
     29     stop_time = time.time()
     30     #scores = cross_validate(optimal_model, X, y, cv=k, scoring= scoring_st, return_train_score=True, return_estimator=True)
     31     print("Elapsed Time:", time.strftime("%H:%M:%S", time.gmtime(stop_time - start_time)))

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self=RandomizedSearchCV(cv=GroupKFold(n_splits=10), e...rer(f1_score, pos_label=0)},
          verbose=3), X=array([[ 0.652     ,  0.797     ,  0.728     , ....  0.09141531,
         0.74536127,  0.90534205]]), y=array([ 1.,  1.,  1., ...,  0.,  0.,  0.]), groups=array(['20160822', '20160822', '20160721', ..., ...8', '20130805',
       '20130727'], dtype=object), **fit_params={})
    634                                   return_train_score=self.return_train_score,
    635                                   return_n_test_samples=True,
    636                                   return_times=True, return_parameters=False,
    637                                   error_score=self.error_score)
    638           for parameters, (train, test) in product(candidate_params,
--> 639                                                    cv.split(X, y, groups)))
        cv.split = <bound method _BaseKFold.split of GroupKFold(n_splits=10)>
        X = array([[ 0.652     ,  0.797     ,  0.728     , ....  0.09141531,
         0.74536127,  0.90534205]])
        y = array([ 1.,  1.,  1., ...,  0.,  0.,  0.])
        groups = array(['20160822', '20160822', '20160721', ..., ...8', '20130805',
       '20130727'], dtype=object)
    640 
    641         # if one choose to see train score, "out" will contain train score info
    642         if self.return_train_score:
    643             (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=10), iterable=<generator object BaseSearchCV.fit.<locals>.<genexpr>>)
    784             if pre_dispatch == "all" or n_jobs == 1:
    785                 # The iterable was consumed all at once by the above for loop.
    786                 # No need to wait for async callbacks to trigger to
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=10)>
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time
    792             self._print('Done %3i out of %3i | elapsed: %s finished',
    793                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
RuntimeError                                       Mon Apr 12 15:38:22 2021
PID: 2258Python 3.6.3: /apps/applications/python/anaconda3/5.0.1/bin/python
...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _fit_and_score>, (RandomForestClassifier(bootstrap=False, class_we...  random_state=None, verbose=0, warm_start=False), memmap([[ 0.652     ,  0.797     ,  0.728     , ... 0.09141531,
          0.74536127,  0.90534205]]), array([ 1.,  1.,  1., ...,  0.,  0.,  0.]), {'f1_0': make_scorer(f1_score, pos_label=0), 'f1_1': make_scorer(f1_score, pos_label=1), 'prec_0': make_scorer(precision_score, pos_label=0), 'prec_1': make_scorer(precision_score, pos_label=1), 'rec_0': make_scorer(recall_score, pos_label=0), 'rec_1': make_scorer(recall_score, pos_label=1), 'roc': make_scorer(roc_auc_score)}, array([    0,     1,     2, ..., 29654, 29655, 29656]), array([   13,    14,    15, ..., 29594, 29604, 29608]), 3, {'bootstrap': False, 'class_weight': {0: 1, 1: 9}, 'criterion': 'gini', 'max_depth': 100, 'max_features': 25, 'min_samples_leaf': 50, 'min_samples_split': 180, 'n_estimators': 200}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': True})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (RandomForestClassifier(bootstrap=False, class_we...  random_state=None, verbose=0, warm_start=False), memmap([[ 0.652     ,  0.797     ,  0.728     , ... 0.09141531,
          0.74536127,  0.90534205]]), array([ 1.,  1.,  1., ...,  0.,  0.,  0.]), {'f1_0': make_scorer(f1_score, pos_label=0), 'f1_1': make_scorer(f1_score, pos_label=1), 'prec_0': make_scorer(precision_score, pos_label=0), 'prec_1': make_scorer(precision_score, pos_label=1), 'rec_0': make_scorer(recall_score, pos_label=0), 'rec_1': make_scorer(recall_score, pos_label=1), 'roc': make_scorer(roc_auc_score)}, array([    0,     1,     2, ..., 29654, 29655, 29656]), array([   13,    14,    15, ..., 29594, 29604, 29608]), 3, {'bootstrap': False, 'class_weight': {0: 1, 1: 9}, 'criterion': 'gini', 'max_depth': 100, 'max_features': 25, 'min_samples_leaf': 50, 'min_samples_split': 180, 'n_estimators': 200})
        kwargs = {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': True}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator=RandomForestClassifier(bootstrap=False, class_we...  random_state=None, verbose=0, warm_start=False), X=memmap([[ 0.652     ,  0.797     ,  0.728     , ... 0.09141531,
          0.74536127,  0.90534205]]), y=array([ 1.,  1.,  1., ...,  0.,  0.,  0.]), scorer={'f1_0': make_scorer(f1_score, pos_label=0), 'f1_1': make_scorer(f1_score, pos_label=1), 'prec_0': make_scorer(precision_score, pos_label=0), 'prec_1': make_scorer(precision_score, pos_label=1), 'rec_0': make_scorer(recall_score, pos_label=0), 'rec_1': make_scorer(recall_score, pos_label=1), 'roc': make_scorer(roc_auc_score)}, train=array([    0,     1,     2, ..., 29654, 29655, 29656]), test=array([   13,    14,    15, ..., 29594, 29604, 29608]), verbose=3, parameters={'bootstrap': False, 'class_weight': {0: 1, 1: 9}, 'criterion': 'gini', 'max_depth': 100, 'max_features': 25, 'min_samples_leaf': 50, 'min_samples_split': 180, 'n_estimators': 200}, fit_params={}, return_train_score=True, return_parameters=False, return_n_test_samples=True, return_times=True, error_score='raise')
    453 
    454     try:
    455         if y_train is None:
    456             estimator.fit(X_train, **fit_params)
    457         else:
--> 458             estimator.fit(X_train, y_train, **fit_params)
        estimator.fit = <bound method BaseForest.fit of RandomForestClas... random_state=None, verbose=0, warm_start=False)>
        X_train = memmap([[ 0.652     ,  0.797     ,  0.728     , ... 0.09141531,
          0.74536127,  0.90534205]])
        y_train = array([ 1.,  1.,  1., ...,  0.,  0.,  0.])
        fit_params = {}
    459 
    460     except Exception as e:
    461         # Note fit time as time until error
    462         fit_time = time.time() - start_time

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/sklearn/ensemble/forest.py in fit(self=RandomForestClassifier(bootstrap=False, class_we...  random_state=None, verbose=0, warm_start=False), X=array([[ 0.65200001,  0.79699999,  0.72799999, ....        0.74536127,  0.90534204]], dtype=float32), y=array([[ 1.],
       [ 1.],
       [ 1.],
       ..., 
       [ 0.],
       [ 0.],
       [ 0.]]), sample_weight=array([ 9.,  9.,  9., ...,  1.,  1.,  1.]))
    323             trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
    324                              backend="threading")(
    325                 delayed(_parallel_build_trees)(
    326                     t, self, X, y, sample_weight, i, len(trees),
    327                     verbose=self.verbose, class_weight=self.class_weight)
--> 328                 for i, t in enumerate(trees))
        i = 199
    329 
    330             # Collect newly grown trees
    331             self.estimators_.extend(trees)
    332 

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object BaseForest.fit.<locals>.<genexpr>>)
    744             raise ValueError('This Parallel instance is already running')
    745         # A flag used to abort the dispatching of jobs in case an
    746         # exception is found
    747         self._aborting = False
    748         if not self._managed_backend:
--> 749             n_jobs = self._initialize_backend()
        n_jobs = undefined
        self._initialize_backend = <bound method Parallel._initialize_backend of Parallel(n_jobs=-1)>
    750         else:
    751             n_jobs = self._effective_n_jobs()
    752 
    753         iterator = iter(iterable)

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _initialize_backend(self=Parallel(n_jobs=-1))
    542 
    543     def _initialize_backend(self):
    544         """Build a process or thread pool and return the number of workers"""
    545         try:
    546             n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
--> 547                                              **self._backend_args)
        self._backend_args = {'context': <multiprocessing.context.ForkContext object>, 'max_nbytes': 1048576, 'mmap_mode': 'r', 'temp_folder': None, 'verbose': 0}
    548             if self.timeout is not None and not self._backend.supports_timeout:
    549                 warnings.warn(
    550                     'The backend class {!r} does not support timeout. '
    551                     "You have set 'timeout={}' in Parallel but "

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in configure(self=<sklearn.externals.joblib._parallel_backends.ThreadingBackend object>, n_jobs=32, parallel=Parallel(n_jobs=-1), **backend_args={'context': <multiprocessing.context.ForkContext object>, 'max_nbytes': 1048576, 'mmap_mode': 'r', 'temp_folder': None, 'verbose': 0})
    245         n_jobs = self.effective_n_jobs(n_jobs)
    246         if n_jobs == 1:
    247             # Avoid unnecessary overhead and use sequential backend instead.
    248             raise FallbackToBackend(SequentialBackend())
    249         self.parallel = parallel
--> 250         self._pool = ThreadPool(n_jobs)
        self._pool = undefined
        n_jobs = 32
    251         return n_jobs
    252 
    253 
    254 class MultiprocessingBackend(PoolManagerMixin, AutoBatchingMixin,

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py in __init__(self=<multiprocessing.pool.ThreadPool object>, processes=32, initializer=None, initargs=())
    784     def Process(*args, **kwds):
    785         from .dummy import Process
    786         return Process(*args, **kwds)
    787 
    788     def __init__(self, processes=None, initializer=None, initargs=()):
--> 789         Pool.__init__(self, processes, initializer, initargs)
        self = <multiprocessing.pool.ThreadPool object>
        processes = 32
        initializer = None
        initargs = ()
    790 
    791     def _setup_queues(self):
    792         self._inqueue = queue.Queue()
    793         self._outqueue = queue.Queue()

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py in __init__(self=<multiprocessing.pool.ThreadPool object>, processes=32, initializer=None, initargs=(), maxtasksperchild=None, context=None)
    169         if initializer is not None and not callable(initializer):
    170             raise TypeError('initializer must be a callable')
    171 
    172         self._processes = processes
    173         self._pool = []
--> 174         self._repopulate_pool()
        self._repopulate_pool = <bound method Pool._repopulate_pool of <multiprocessing.pool.ThreadPool object>>
    175 
    176         self._worker_handler = threading.Thread(
    177             target=Pool._handle_workers,
    178             args=(self, )

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py in _repopulate_pool(self=<multiprocessing.pool.ThreadPool object>)
    234                                    self._wrap_exception)
    235                             )
    236             self._pool.append(w)
    237             w.name = w.name.replace('Process', 'PoolWorker')
    238             w.daemon = True
--> 239             w.start()
        w.start = <bound method DummyProcess.start of <DummyProcess(Thread-16, initial daemon)>>
    240             util.debug('added worker')
    241 
    242     def _maintain_pool(self):
    243         """Clean up any exited workers and start replacements for them.

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/multiprocessing/dummy/__init__.py in start(self=<DummyProcess(Thread-16, initial daemon)>)
     43     def start(self):
     44         assert self._parent is current_process()
     45         self._start_called = True
     46         if hasattr(self._parent, '_children'):
     47             self._parent._children[self] = None
---> 48         threading.Thread.start(self)
        self = <DummyProcess(Thread-16, initial daemon)>
     49 
     50     @property
     51     def exitcode(self):
     52         if self._start_called and not self.is_alive():

...........................................................................
/apps/applications/python/anaconda3/5.0.1/lib/python3.6/threading.py in start(self=<DummyProcess(Thread-16, initial daemon)>)
    841         if self._started.is_set():
    842             raise RuntimeError("threads can only be started once")
    843         with _active_limbo_lock:
    844             _limbo[self] = self
    845         try:
--> 846             _start_new_thread(self._bootstrap, ())
        self._bootstrap = <bound method Thread._bootstrap of <DummyProcess(Thread-16, initial daemon)>>
    847         except Exception:
    848             with _active_limbo_lock:
    849                 del _limbo[self]
    850             raise

RuntimeError: can't start new thread
___________________________________________________________________________

In [14]:
frame.to_csv('full_dataset_norm.csv')

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('/work2/pa21/sgirtsou/production/datasets/hard_cosine_similarity/full_dataset_norm.csv')

In [4]:
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'id', 'firedate', 'max_temp', 'min_temp',
       'mean_temp', 'res_max', 'dom_vel', 'rain_7days', 'dem', 'slope',
       'curvature', 'aspect', 'ndvi_new', 'evi', 'lst_day', 'lst_night',
       'max_dew_temp', 'mean_dew_temp', 'min_dew_temp', 'fire', 'dir_max_1',
       'dir_max_2', 'dir_max_3', 'dir_max_4', 'dir_max_5', 'dir_max_6',
       'dir_max_7', 'dir_max_8', 'dom_dir_1', 'dom_dir_2', 'dom_dir_3',
       'dom_dir_4', 'dom_dir_5', 'dom_dir_6', 'dom_dir_7', 'dom_dir_8',
       'corine_111', 'corine_112', 'corine_121', 'corine_122', 'corine_123',
       'corine_124', 'corine_131', 'corine_132', 'corine_133', 'corine_141',
       'corine_142', 'corine_211', 'corine_212', 'corine_213', 'corine_221',
       'corine_222', 'corine_223', 'corine_231', 'corine_241', 'corine_242',
       'corine_243', 'corine_244', 'corine_311', 'corine_312', 'corine_313',
       'corine_321', 'corine_322', 'corine_323', 'corine_324', 'corine_331',
       'corine_332', 