### Импорт библиотек

In [2]:
import pandas as pd
import numpy as np
import xgboost
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
import collections
import seaborn as sns 
from SALib.sample import saltelli
from SALib.analyze import sobol

from sklearn import model_selection, ensemble, utils
from sklearn.model_selection import GridSearchCV

### Генерация выборки для предсказания

In [18]:
# Columns of data_X in the exact order in which their [min, max] bounds are
# handed to SALib.
# NOTE(review): 'names' below is taken from data_X.columns, so names and bounds
# only line up if the DataFrame's column order equals this list - confirm.
_bounded_columns = [
    'boundary_condition_NZ',
    'boundary_condition_X',
    'boundary_condition_Y',
    'boundary_condition_NX',
    'boundary_condition_NY',
    'boundary_condition_Z',
    'load_time',
    'rock_young_constant',
    'rock_alpha_constant',
    'concrete_init_temp',
    'concrete_cheat',
    'concrete_dt',
    'concrete_norm_coeff',
    'concrete_young_constant',
    'concrete_alpha_constant',
    'concrete_strength_time',
    'steel_init_temp',
    'bentonite_init_temp',
    'bentonite_cheat',
    'bentonite_dt',
    'bentonite_young_constant',
    'bentonite_alpha_constant',
    'rw_init_temp',
    'rw_cheat',
    'rw_dt',
    'rw_young_constant',
    'rw_alpha_constant',
    'rw_norm_coeff',
]

# SALib problem definition: every input varies between the smallest and the
# largest value observed for it in data_X.
problem = {
    'num_vars': 28,
    'names': list(data_X.columns),
    'bounds': [
        [data_X[column].min(), data_X[column].max()]
        for column in _bounded_columns
    ],
}


# Saltelli sample (base size 128) on which the fitted models are evaluated.
param_values = saltelli.sample(problem, 128)


### RandomForest

In [23]:
# Random forest with 150 trees; the cells below refit this single instance
# for every target column.
rf = ensemble.RandomForestRegressor(n_estimators=150)

In [24]:
%%time

# Fit `rf` to every temperature target (13 observation points x 8 time
# horizons). For each observation point, write one CSV with the fitted model's
# feature_importances_ and one with the 'ST' entry of SALib's Sobol analysis
# of the model's predictions on `param_values`.
# Uses `rf`, `problem` and `param_values` from earlier cells.
HORIZONS_DAYS = [5, 10, 20, 30, 40, 50, 150, 1000]

# The last 1576 rows of both tables are excluded (reason not visible here).
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
data_X = pd.read_csv('sample.csv').drop(columns='calc_id')
# .copy() so that the column assignment below does not act on a slice.
data_X = data_X.iloc[:-1576].copy()
data_X['load_time'] = data_X['load_time'].round(decimals=0)

# Read the targets once; each fit selects the single column it needs, so the
# other points' columns do not have to be dropped.
data_temp = pd.read_csv('temp.csv').drop(columns='calc_id')
data_temp = data_temp.iloc[:-1576]

for observe_point in range(1, 14):
    sobol_indices = pd.DataFrame(index=list(data_X.columns))
    feature_importances = pd.DataFrame(index=list(data_X.columns))

    for days in HORIZONS_DAYS:
        tag = f'{observe_point}_{days}days'

        rf.fit(data_X, data_temp[f'temp_point{tag}'])
        predicted = rf.predict(param_values)
        feature_importances[f'FI_{tag}'] = rf.feature_importances_

        Si = sobol.analyze(problem, predicted)
        sobol_indices[f'ST_{tag}'] = Si['ST']

    feature_importances.to_csv(
        f'feature_importances_temp_point{observe_point}.csv',
        float_format='%.6f')
    sobol_indices.to_csv(
        f'sobol_indices_temp_point{observe_point}.csv',
        float_format='%.6f')

Wall time: 11min 39s


In [25]:
%%time

# Fit `rf` to every stress target (13 observation points x 8 time horizons).
# For each observation point, write one CSV with the 'ST' entry of SALib's
# Sobol analysis of the model's predictions on `param_values` and one with the
# fitted model's feature_importances_.
# Uses `rf`, `problem` and `param_values` from earlier cells.
HORIZONS_DAYS = [5, 10, 20, 30, 40, 50, 150, 1000]

# The last 1576 rows of both tables are excluded (reason not visible here).
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
data_X = pd.read_csv('sample.csv').drop(columns='calc_id')
# .copy() so that the column assignment below does not act on a slice.
data_X = data_X.iloc[:-1576].copy()
data_X['load_time'] = data_X['load_time'].round(decimals=0)

# Read the targets once; each fit selects the single column it needs, so the
# other points' columns do not have to be dropped.
data_stress = pd.read_csv('stress.csv').drop(columns='calc_id')
data_stress = data_stress.iloc[:-1576]

for observe_point in range(1, 14):
    sobol_indices = pd.DataFrame(index=list(data_X.columns))
    feature_importances = pd.DataFrame(index=list(data_X.columns))

    for days in HORIZONS_DAYS:
        tag = f'{observe_point}_{days}days'

        rf.fit(data_X, data_stress[f'stress_point{tag}'])
        predicted = rf.predict(param_values)
        Si = sobol.analyze(problem, predicted)
        feature_importances[f'FI_{tag}'] = rf.feature_importances_
        sobol_indices[f'ST_{tag}'] = Si['ST']

    sobol_indices.to_csv(
        f'sobol_indices_stress_point{observe_point}.csv',
        float_format='%.7f')
    feature_importances.to_csv(
        f'feature_importances_stress_point{observe_point}.csv',
        float_format='%.6f')

Wall time: 13min 16s


### XGBoost

In [16]:
# XGBoost regressor created with no explicit hyperparameters; the cells below
# refit this single instance for every target column.
XGB = xgboost.XGBRegressor()

In [76]:
%%time

# Fit `XGB` to every temperature target (13 observation points x 8 time
# horizons). For each observation point, write one CSV with the fitted model's
# feature_importances_ and one with the 'ST' entry of SALib's Sobol analysis
# of the model's predictions on `param_values`.
# Uses `XGB`, `problem` and `param_values` from earlier cells.
# NOTE: the output file names equal those of the random-forest temperature
# cell, so running both in one working directory overwrites the earlier files.
HORIZONS_DAYS = [5, 10, 20, 30, 40, 50, 150, 1000]

# The last 1576 rows of both tables are excluded (reason not visible here).
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
data_X = pd.read_csv('sample.csv').drop(columns='calc_id')
# .copy() so that the column assignment below does not act on a slice.
data_X = data_X.iloc[:-1576].copy()
data_X['load_time'] = data_X['load_time'].round(decimals=0)

# Read the targets once; each fit selects the single column it needs, so the
# other points' columns do not have to be dropped.
data_temp = pd.read_csv('temp.csv').drop(columns='calc_id')
data_temp = data_temp.iloc[:-1576]

for observe_point in range(1, 14):
    sobol_indices = pd.DataFrame(index=list(data_X.columns))
    feature_importances = pd.DataFrame(index=list(data_X.columns))

    for days in HORIZONS_DAYS:
        tag = f'{observe_point}_{days}days'

        XGB.fit(data_X, data_temp[f'temp_point{tag}'])
        predicted = XGB.predict(param_values)
        feature_importances[f'FI_{tag}'] = XGB.feature_importances_

        Si = sobol.analyze(problem, predicted)
        sobol_indices[f'ST_{tag}'] = Si['ST']

    feature_importances.to_csv(
        f'feature_importances_temp_point{observe_point}.csv',
        float_format='%.6f')
    sobol_indices.to_csv(
        f'sobol_indices_temp_point{observe_point}.csv',
        float_format='%.6f')

Wall time: 1min 26s


In [20]:
%%time

# Fit `XGB` to every stress target (13 observation points x 8 time horizons).
# For each observation point, write one CSV with the 'ST' entry of SALib's
# Sobol analysis of the model's predictions on `param_values` and one with the
# fitted model's feature_importances_.
# Uses `XGB`, `problem` and `param_values` from earlier cells.
# NOTE: the output file names equal those of the random-forest stress cell,
# so running both in one working directory overwrites the earlier files.
HORIZONS_DAYS = [5, 10, 20, 30, 40, 50, 150, 1000]

# The last 1576 rows of both tables are excluded (reason not visible here).
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
data_X = pd.read_csv('sample.csv').drop(columns='calc_id')
# .copy() so that the column assignment below does not act on a slice.
data_X = data_X.iloc[:-1576].copy()
data_X['load_time'] = data_X['load_time'].round(decimals=0)

# Read the targets once; each fit selects the single column it needs, so the
# other points' columns do not have to be dropped.
data_stress = pd.read_csv('stress.csv').drop(columns='calc_id')
data_stress = data_stress.iloc[:-1576]

for observe_point in range(1, 14):
    sobol_indices = pd.DataFrame(index=list(data_X.columns))
    feature_importances = pd.DataFrame(index=list(data_X.columns))

    for days in HORIZONS_DAYS:
        tag = f'{observe_point}_{days}days'

        XGB.fit(data_X, data_stress[f'stress_point{tag}'])
        predicted = XGB.predict(param_values)
        Si = sobol.analyze(problem, predicted)
        feature_importances[f'FI_{tag}'] = XGB.feature_importances_
        sobol_indices[f'ST_{tag}'] = Si['ST']

    sobol_indices.to_csv(
        f'sobol_indices_stress_point{observe_point}.csv',
        float_format='%.7f')
    feature_importances.to_csv(
        f'feature_importances_stress_point{observe_point}.csv',
        float_format='%.6f')

  return np.mean(B * (AB - A), axis=0) / np.var(np.r_[A, B], axis=0)
  return np.mean(B * (AB - A), axis=0) / np.var(np.r_[A, B], axis=0)
  return 0.5 * np.mean((A - AB) ** 2, axis=0) / np.var(np.r_[A, B], axis=0)
  return 0.5 * np.mean((A - AB) ** 2, axis=0) / np.var(np.r_[A, B], axis=0)
  Vjk = np.mean(BAj * ABk - A * B, axis=0) / np.var(np.r_[A, B], axis=0)
  Vjk = np.mean(BAj * ABk - A * B, axis=0) / np.var(np.r_[A, B], axis=0)
  Y = (Y - Y.mean()) / Y.std()


Wall time: 1min 21s


### GridSearch

In [None]:
# Exhaustive 3-fold grid search over random-forest hyperparameters, using the
# temperature at observation point 5 after 40 days as the target.
# Uses `rf` and `data_X` from earlier cells.
param_grid = {
    'max_features': ['sqrt', 'log2', None],
    'min_samples_leaf': [1, 2, 3],
    'max_depth': [50, 75, 100, 125],
    'n_estimators': [75, 100, 125, 150],
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2)

# These three statements were indented at top level, which is an
# IndentationError. `columns=` replaces the positional axis argument, which
# pandas 2.0 removed. The last 1576 rows are excluded, as for data_X.
data_temp = pd.read_csv('temp.csv')
data_temp = data_temp.drop(columns='calc_id')
data_temp = data_temp.iloc[:-1576]

grid_search.fit(data_X, data_temp['temp_point5_40days'])

# Last expression of the cell: the notebook displays the best parameter set.
grid_search.best_params_

### Пример работы для диплома (useless)

In [None]:
# Worked example: one random forest for the temperature at point 5 after
# 40 days, followed by the 'ST' Sobol indices of its predictions.
# Rebinds the notebook-level `rf`.
rf = ensemble.RandomForestRegressor(
    n_estimators=150,
    max_depth=125,
    max_features=None,
    min_samples_leaf=1,
)

S_indices = pd.DataFrame(index=list(data_X.columns))

rf.fit(data_X, data_temp['temp_point5_40days'])
Si = sobol.analyze(problem, rf.predict(param_values))
S_indices['ST_5_40_days'] = Si['ST']

# Displayed by the notebook: inputs ordered from largest to smallest index.
S_indices.sort_values(by='ST_5_40_days', ascending=False)

In [None]:
# Show the point-5 / 40-day temperature stored under row label 88.
data_temp.loc[88, 'temp_point5_40days']

In [None]:
# Mean cross-validation score (default folds and scorer) of `rf` on the
# point-5 / 5-day temperature.
model_selection.cross_val_score(
    rf,
    data_X,
    data_temp['temp_point5_5days'],
).mean()

Wall time: 27.2 s


0.9956628086647825

In [26]:
# Mean cross-validation score (default folds and scorer) of `XGB` on the
# point-7 / 5-day temperature.
model_selection.cross_val_score(
    XGB,
    data_X,
    data_temp['temp_point7_5days'],
).mean()

0.9952007809796971

### Counter

In [26]:
# Count how often each input appears among the five largest values of a result
# column, over all 13 observation points and 8 time horizons of the
# random-forest results, and save the counts sorted in descending order.
RF_RESULTS_DIR = 'C:\\Users\\defuz\\JupyterLab\\data diplom\\load_time_rounded\\RandomForest_results\\'
COUNTER_HORIZONS_DAYS = [5, 10, 20, 30, 40, 50, 150, 1000]


def _count_top_features(results_dir, file_stem, column_prefix,
                        horizons=COUNTER_HORIZONS_DAYS, top_n=5):
    """Tally the row labels that rank in the top `top_n` of each result column.

    Reads `<results_dir><file_stem><point>.csv` for points 1..13 and, for every
    horizon, takes the `top_n` largest entries of the column
    `<column_prefix>_<point>_<days>days`.

    Returns a one-column DataFrame (column 0) indexed by row label, holding
    the number of times that label made the top `top_n`.
    """
    counts = collections.Counter()
    for observe_point in range(1, 14):
        table = pd.read_csv(results_dir + file_stem + str(observe_point) + '.csv',
                            index_col=0)
        for days in horizons:
            column = f'{column_prefix}_{observe_point}_{days}days'
            # Counter.update tallies the labels directly, so no array has to
            # be re-allocated on every iteration.
            counts.update(table.nlargest(top_n, column).index)
    return pd.DataFrame.from_dict(dict(counts), orient='index')


# NOTE(review): the temperature Sobol files are read as 'sobol_indicess_...'
# (double "s"), whereas the random-forest cell writes 'sobol_indices_temp_point'
# - confirm the files on disk really carry this name.
SI_temp_counter = _count_top_features(
    RF_RESULTS_DIR, 'sobol_indicess_temp_point', 'ST')
SI_temp_counter.sort_values(by=0, ascending=False).to_csv('RF_SI_temp.csv')

SI_stress_counter = _count_top_features(
    RF_RESULTS_DIR, 'sobol_indices_stress_point', 'ST')
SI_stress_counter.sort_values(by=0, ascending=False).to_csv('RF_SI_stress.csv')

FI_temp_counter = _count_top_features(
    RF_RESULTS_DIR, 'feature_importances_temp_point', 'FI')
FI_temp_counter.sort_values(by=0, ascending=False).to_csv('RF_FI_temp.csv')

FI_stress_counter = _count_top_features(
    RF_RESULTS_DIR, 'feature_importances_stress_point', 'FI')
FI_stress_counter.sort_values(by=0, ascending=False).to_csv('RF_FI_stress.csv')

In [28]:
# Count how often each input appears among the five largest values of a result
# column, over all 13 observation points and 8 time horizons of the XGBoost
# results, and save the counts sorted in descending order.
XGB_RESULTS_DIR = 'C:\\Users\\defuz\\JupyterLab\\data diplom\\load_time_rounded\\XGBoost_results\\'
COUNTER_HORIZONS_DAYS = [5, 10, 20, 30, 40, 50, 150, 1000]


def _count_top_features(results_dir, file_stem, column_prefix,
                        horizons=COUNTER_HORIZONS_DAYS, top_n=5):
    """Tally the row labels that rank in the top `top_n` of each result column.

    Reads `<results_dir><file_stem><point>.csv` for points 1..13 and, for every
    horizon, takes the `top_n` largest entries of the column
    `<column_prefix>_<point>_<days>days`.

    Returns a one-column DataFrame (column 0) indexed by row label, holding
    the number of times that label made the top `top_n`.
    """
    counts = collections.Counter()
    for observe_point in range(1, 14):
        table = pd.read_csv(results_dir + file_stem + str(observe_point) + '.csv',
                            index_col=0)
        for days in horizons:
            column = f'{column_prefix}_{observe_point}_{days}days'
            # Counter.update tallies the labels directly, so no array has to
            # be re-allocated on every iteration.
            counts.update(table.nlargest(top_n, column).index)
    return pd.DataFrame.from_dict(dict(counts), orient='index')


# NOTE(review): the temperature Sobol files are read as 'sobol_indicess_...'
# (double "s"), whereas the XGBoost cell writes 'sobol_indices_temp_point'
# - confirm the files on disk really carry this name.
SI_temp_counter = _count_top_features(
    XGB_RESULTS_DIR, 'sobol_indicess_temp_point', 'ST')
SI_temp_counter.sort_values(by=0, ascending=False).to_csv('XGB_SI_temp.csv')

SI_stress_counter = _count_top_features(
    XGB_RESULTS_DIR, 'sobol_indices_stress_point', 'ST')
SI_stress_counter.sort_values(by=0, ascending=False).to_csv('XGB_SI_stress.csv')

FI_temp_counter = _count_top_features(
    XGB_RESULTS_DIR, 'feature_importances_temp_point', 'FI')
FI_temp_counter.sort_values(by=0, ascending=False).to_csv('XGB_FI_temp.csv')

FI_stress_counter = _count_top_features(
    XGB_RESULTS_DIR, 'feature_importances_stress_point', 'FI')
FI_stress_counter.sort_values(by=0, ascending=False).to_csv('XGB_FI_stress.csv')

### XGBoost samples variety

In [216]:
# Fresh XGBoost regressor created with no explicit hyperparameters; the
# sample-size study in the cells below refits this single instance.
XGB = xgboost.XGBRegressor()

In [29]:
%%time
# Sample-size study for the temperature targets: for Saltelli base sizes
# 2**1 .. 2**5, refit `XGB` on a correspondingly truncated training table and
# write, per observation point, the feature importances, the 'ST' Sobol
# indices and the mean cross-validation scores.
# Uses `XGB` and `problem` from earlier cells.
HORIZONS_DAYS = [5, 10, 20, 30, 40, 50, 150, 1000]

for n in [1, 2, 3, 4, 5]:

    # Rebinds the notebook-level param_values for this sample size.
    param_values = saltelli.sample(problem, pow(2, n))

    # Trailing rows removed from both tables; presumably 9000 is the total
    # number of rows in the CSV files, leaving as many training rows as there
    # are Saltelli samples - TODO confirm.
    rows_to_drop = 9000 - param_values.shape[0]

    # `columns=` replaces the positional axis argument, which pandas 2.0
    # removed; .copy() keeps the column assignment below off a slice.
    data_X = pd.read_csv('sample.csv').drop(columns='calc_id')
    data_X = data_X.iloc[:-rows_to_drop].copy()
    data_X['load_time'] = data_X['load_time'].round(decimals=0)

    # Inputs and targets depend only on n, so they are loaded and joined once
    # per sample size instead of once per observation point.
    data_temp = pd.read_csv('temp.csv').drop(columns='calc_id')
    data_temp = data_temp.iloc[:-rows_to_drop]
    data = data_X.join(data_temp)

    for observe_point in range(1, 14):
        sobol_indices = pd.DataFrame(index=list(data_X.columns))
        feature_importances = pd.DataFrame(index=list(data_X.columns))
        CVS_df = pd.DataFrame(index=['cross_val_score'])

        for days in HORIZONS_DAYS:
            tag = f'{observe_point}_{days}days'
            target = data[f'temp_point{tag}']

            CVS = model_selection.cross_val_score(XGB, data_X, target).mean()
            CVS_df[f'CVS_{tag}'] = CVS

            XGB.fit(data_X, target)
            predicted = XGB.predict(param_values)
            feature_importances[f'FI_{tag}'] = XGB.feature_importances_

            Si = sobol.analyze(problem, predicted)
            sobol_indices[f'ST_{tag}'] = Si['ST']

        feature_importances.to_csv(
            f'{n}_feature_importances_temp_point{observe_point}.csv',
            float_format='%.6f')
        # The 'sobol_indicess' spelling is kept: the counter cell reads the
        # files under exactly this name.
        sobol_indices.to_csv(
            f'{n}_sobol_indicess_temp_point{observe_point}.csv',
            float_format='%.6f')
        CVS_df.to_csv(
            f'{n}_cross_validation_temp_point{observe_point}.csv',
            float_format='%.6f')

Wall time: 11min 5s


In [None]:
%%time 
# Sample-size study for the stress targets: for Saltelli base sizes
# 2**1 .. 2**5, refit `XGB` on a correspondingly truncated training table and
# write, per observation point, the 'ST' Sobol indices, the feature
# importances and the mean cross-validation scores.
# Uses `XGB` and `problem` from earlier cells.
HORIZONS_DAYS = [5, 10, 20, 30, 40, 50, 150, 1000]

for n in [1, 2, 3, 4, 5]:

    # Rebinds the notebook-level param_values for this sample size.
    param_values = saltelli.sample(problem, pow(2, n))

    # Trailing rows removed from both tables; presumably 9000 is the total
    # number of rows in the CSV files, leaving as many training rows as there
    # are Saltelli samples - TODO confirm.
    rows_to_drop = 9000 - param_values.shape[0]

    # `columns=` replaces the positional axis argument, which pandas 2.0
    # removed; .copy() keeps the column assignment below off a slice.
    data_X = pd.read_csv('sample.csv').drop(columns='calc_id')
    data_X = data_X.iloc[:-rows_to_drop].copy()
    data_X['load_time'] = data_X['load_time'].round(decimals=0)

    # Targets depend only on n, so they are loaded once per sample size; each
    # fit selects the single column it needs.
    data_stress = pd.read_csv('stress.csv').drop(columns='calc_id')
    data_stress = data_stress.iloc[:-rows_to_drop]

    for observe_point in range(1, 14):
        sobol_indices = pd.DataFrame(index=list(data_X.columns))
        feature_importances = pd.DataFrame(index=list(data_X.columns))
        CVS_df = pd.DataFrame(index=['cross_val_score'])

        for days in HORIZONS_DAYS:
            tag = f'{observe_point}_{days}days'
            target = data_stress[f'stress_point{tag}']

            CVS = model_selection.cross_val_score(XGB, data_X, target).mean()
            CVS_df[f'CVS_{tag}'] = CVS

            XGB.fit(data_X, target)
            predicted = XGB.predict(param_values)
            Si = sobol.analyze(problem, predicted)
            feature_importances[f'FI_{tag}'] = XGB.feature_importances_
            sobol_indices[f'ST_{tag}'] = Si['ST']

        sobol_indices.to_csv(
            f'{n}_sobol_indices_stress_point{observe_point}.csv',
            float_format='%.7f')
        feature_importances.to_csv(
            f'{n}_feature_importances_stress_point{observe_point}.csv',
            float_format='%.6f')
        CVS_df.to_csv(
            f'{n}_cross_validation_stress_point{observe_point}.csv',
            float_format='%.6f')

### Counter variety

In [201]:
# For every sample-size run n = 1..5, count how often each input appears among
# the five largest values of a result column (13 observation points, horizons
# from 20 days upward) and save the counts sorted in descending order.
VARIETY_RESULTS_DIR = 'C:\\Users\\defuz\\JupyterLab\\data diplom\\XGB_variety\\'
# The 5- and 10-day horizons are not included in this tally.
VARIETY_HORIZONS_DAYS = [20, 30, 40, 50, 150, 1000]


def _count_top_features_for_run(run, file_stem, column_prefix, top_n=5):
    """Tally the row labels that rank in the top `top_n` of each result column.

    Reads `<VARIETY_RESULTS_DIR><run>_<file_stem><point>.csv` for points 1..13
    and, for every horizon in VARIETY_HORIZONS_DAYS, takes the `top_n` largest
    entries of the column `<column_prefix>_<point>_<days>days`.

    Returns a one-column DataFrame (column 0) indexed by row label, holding
    the number of times that label made the top `top_n`.
    """
    counts = collections.Counter()
    for observe_point in range(1, 14):
        table = pd.read_csv(
            VARIETY_RESULTS_DIR + str(run) + '_' + file_stem
            + str(observe_point) + '.csv',
            index_col=0)
        for days in VARIETY_HORIZONS_DAYS:
            column = f'{column_prefix}_{observe_point}_{days}days'
            # Counter.update tallies the labels directly, so no array has to
            # be re-allocated on every iteration.
            counts.update(table.nlargest(top_n, column).index)
    return pd.DataFrame.from_dict(dict(counts), orient='index')


for n in [1, 2, 3, 4, 5]:
    # 'sobol_indicess' (double "s") matches the name under which the
    # temperature sample-size cell writes these files.
    SI_temp_counter = _count_top_features_for_run(
        n, 'sobol_indicess_temp_point', 'ST')
    SI_temp_counter.sort_values(by=0, ascending=False).to_csv(
        str(n) + '_' + 'XGB_SI_temp.csv')

    SI_stress_counter = _count_top_features_for_run(
        n, 'sobol_indices_stress_point', 'ST')
    SI_stress_counter.sort_values(by=0, ascending=False).to_csv(
        str(n) + '_' + 'XGB_SI_stress.csv')

    FI_temp_counter = _count_top_features_for_run(
        n, 'feature_importances_temp_point', 'FI')
    FI_temp_counter.sort_values(by=0, ascending=False).to_csv(
        str(n) + '_' + 'XGB_FI_temp.csv')

    FI_stress_counter = _count_top_features_for_run(
        n, 'feature_importances_stress_point', 'FI')
    FI_stress_counter.sort_values(by=0, ascending=False).to_csv(
        str(n) + '_' + 'XGB_FI_stress.csv')

In [74]:
# Load the XGBoost Sobol table of observation point 5 and collect the labels
# of its five largest 40-day entries.
sobol_indices = pd.read_csv(
    'C:\\Users\\defuz\\JupyterLab\\data diplom\\load_time_rounded\\XGBoost_results\\sobol_indicess_temp_point5.csv',
    index_col=0,
)

top_five = sobol_indices.nlargest(5, 'ST_5_40days')
nlargest = np.append(np.ndarray(0), top_five['ST_5_40days'].index.to_numpy())

In [75]:
# Show the `load_time` row of the loaded Sobol table across all its columns.
sobol_indices.loc['load_time'].to_numpy()

array([1.027307e+00, 7.570720e-01, 3.176210e-01, 1.114820e-01,
       4.074400e-02, 3.743800e-02, 8.294000e-03, 1.000000e-05])