In [1]:
import pandas as pd
from src.rf_clf import build_df
from src.tools import *

import numpy as np

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.utils.fixes import loguniform

In [2]:
df_test_prob = pd.read_csv("test_roi_probs.csv")
df_train_prob = pd.read_csv("train_roi_probs.csv")

In [3]:
df_test_prob.head()

Unnamed: 0,filename,x1,y1,x2,y2,prob0,prob1,prob2,prob3,t_i,t_w,t_h,t_r,t_g,t_b,label
0,C12_B129_S12.tif,99700,18756,101004,20060,0.025281,0.021503,0.952472,0.000744,1,32,25,221.999145,216.268482,226.030069,3
1,C12_B129_S12.tif,84052,22672,85356,23976,0.000393,0.007502,0.991786,0.00032,1,32,25,221.999145,216.268482,226.030069,3
2,C12_B129_S12.tif,97744,22672,99048,23976,0.05264,0.153568,0.019106,0.774686,1,32,25,221.999145,216.268482,226.030069,3
3,C12_B129_S12.tif,84052,24628,85356,25932,1e-05,0.000156,0.186166,0.813668,1,32,25,221.999145,216.268482,226.030069,3
4,C12_B129_S12.tif,93828,24628,95132,25932,0.004806,0.016912,0.837155,0.141126,1,32,25,221.999145,216.268482,226.030069,3


In [4]:
df_train_prob.info()
print("Train filenames: ", df_train_prob.filename.nunique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48874 entries, 0 to 48873
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   filename  48874 non-null  object 
 1   x1        48874 non-null  int64  
 2   y1        48874 non-null  int64  
 3   x2        48874 non-null  int64  
 4   y2        48874 non-null  int64  
 5   prob0     48874 non-null  float64
 6   prob1     48874 non-null  float64
 7   prob2     48874 non-null  float64
 8   prob3     48874 non-null  float64
 9   t_i       48874 non-null  int64  
 10  t_w       48874 non-null  int64  
 11  t_h       48874 non-null  int64  
 12  label     48874 non-null  int64  
 13  t_r       48833 non-null  float64
 14  t_g       48833 non-null  float64
 15  t_b       48833 non-null  float64
dtypes: float64(7), int64(8), object(1)
memory usage: 6.0+ MB
Train filenames:  812


In [5]:
df_train_preproc = build_df(df_train_prob)
df_test_preproc = build_df(df_test_prob)
df_train_preproc.head()

Unnamed: 0_level_0,pred_class0,pred_class1,pred_class2,pred_class3,prob0_mean,prob0_max,prob0_min,prob0_var,prob0_std,prob1_mean,...,t_w_min,t_h_mean,t_h_max,t_h_min,t_r,t_g,t_b,t_n,pred_n,label
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C01_B008_S01.tif,21,7,3,2,0.627128,0.9992,0.0008384111,0.114022,0.337671,0.209677,...,12,11.333333,12,10,173.264463,149.071502,178.926353,4,33,0
C01_B027_S01.tif,48,9,3,1,0.775709,0.999701,0.001193536,0.108295,0.329082,0.14797,...,7,13.688525,15,7,190.432047,163.929417,191.064514,6,61,1
C01_B029_S01.tif,20,15,3,1,0.51662,0.999978,1.560627e-05,0.199281,0.446409,0.375149,...,8,12.128205,15,9,188.196594,168.97267,189.00932,12,39,0
C01_B043_S01.tif,34,26,11,4,0.438018,0.999435,2.376554e-06,0.159875,0.399843,0.345781,...,17,20.306667,26,10,205.274395,191.809306,202.402798,6,75,0
C01_B078_S01.tif,8,12,196,2,0.036864,0.967573,2.913089e-11,0.025239,0.158869,0.055425,...,24,19.022936,20,18,180.20195,166.705705,184.98364,6,218,2


In [6]:
from sklearn.metrics import fbeta_score, make_scorer

def competition_metric(y_true, y_pred):
    er = ERROR_TABLE[y_true, y_pred].sum()
    return 1 - er / y_true.shape[0]

competition_metric_scorer = make_scorer(competition_metric, greater_is_better=True)


In [7]:
parameters = {'bootstrap': [True, False],
              'max_depth': [35, 40, 45, 50],
              'max_features': ['auto', 'sqrt'],
              'min_samples_leaf': [2, 3],
              'min_samples_split': [8, 10, 12],
              'n_estimators': [10, 50, 100, 150, 200, 250, 300]}
rf = RandomForestClassifier()

# parameters = {
#     "loss":["deviance"],
#     "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, ,0.15, 0.2],
#     "min_samples_split": np.linspace(0.1, 0.5, 12),
#     "min_samples_leaf": np.linspace(0.1, 0.5, 12),
#     "max_depth":[3,5,8],
#     "max_features":["log2","sqrt"],
#     "criterion": ["friedman_mse",  "mae"],
#     "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
#     "n_estimators":[10]
#     }
# rf = GradientBoostingClassifier()


clf = GridSearchCV(rf, parameters, cv=5, scoring=competition_metric_scorer, verbose=1, n_jobs=24)
clf.fit(df_train_preproc.iloc[:, 0:-1], df_train_preproc.iloc[:, -1])

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

y_true, y_pred = df_test_preproc.iloc[:, -1], clf.predict(df_test_preproc.iloc[:, 0:-1])
print(f"Val metric: {competition_metric(y_true, y_pred):.3f}")
print()

Fitting 5 folds for each of 672 candidates, totalling 3360 fits


[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.9s
[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    3.1s
[Parallel(n_jobs=24)]: Done 402 tasks      | elapsed:    6.5s
[Parallel(n_jobs=24)]: Done 752 tasks      | elapsed:   11.4s
[Parallel(n_jobs=24)]: Done 1202 tasks      | elapsed:   17.7s
[Parallel(n_jobs=24)]: Done 1752 tasks      | elapsed:   25.3s
[Parallel(n_jobs=24)]: Done 2402 tasks      | elapsed:   35.8s
[Parallel(n_jobs=24)]: Done 3152 tasks      | elapsed:   48.0s
[Parallel(n_jobs=24)]: Done 3360 out of 3360 | elapsed:   51.6s finished


Best parameters set found on development set:

{'bootstrap': True, 'max_depth': 50, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 12, 'n_estimators': 100}

Grid scores on development set:

0.889 (+/-0.027) for {'bootstrap': True, 'max_depth': 35, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 10}
0.901 (+/-0.020) for {'bootstrap': True, 'max_depth': 35, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 50}
0.900 (+/-0.014) for {'bootstrap': True, 'max_depth': 35, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 100}
0.900 (+/-0.010) for {'bootstrap': True, 'max_depth': 35, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 150}
0.902 (+/-0.012) for {'bootstrap': True, 'max_depth': 35, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 200}
0.902 (+/-0.013) for {'bootstrap': True, 

In [10]:
best_clf = clf.best_estimator_
list(sorted(zip(best_clf.feature_importances_, df_test_preproc.columns[:-1]), reverse=True))

[(0.07428612950097722, 'pred_class3'),
 (0.06893124078437403, 'prob3_mean'),
 (0.0684068980909899, 'prob0_min'),
 (0.06525667974022563, 'prob2_max'),
 (0.05697057597403419, 'prob0_mean'),
 (0.054829240296012695, 'prob3_max'),
 (0.051123797613821784, 'prob1_min'),
 (0.04908709660089365, 'prob2_mean'),
 (0.04676302372587938, 'prob1_mean'),
 (0.03235076151467822, 'pred_class2'),
 (0.027297218549727928, 'prob3_std'),
 (0.02571325593096593, 'prob1_var'),
 (0.025692416169582892, 'prob3_var'),
 (0.024110610567442115, 'prob0_max'),
 (0.023814852680504598, 'prob2_var'),
 (0.023568607712024958, 't_w_mean'),
 (0.023330479095950096, 'prob1_std'),
 (0.023316440401135542, 'prob2_std'),
 (0.023078695785719715, 'prob1_max'),
 (0.02206352088907146, 't_w_max'),
 (0.020334301778226514, 't_h_mean'),
 (0.02031000715939124, 'pred_n'),
 (0.018774362309433534, 't_h_max'),
 (0.017224692597500202, 'prob0_std'),
 (0.016731255839093503, 'prob0_var'),
 (0.015414285307384866, 'pred_class1'),
 (0.012747640815941576,

In [9]:
#import pickle
#pickle.dump(clf.best_estimator_, open("models/rf-clf.pkl", 'wb'))
