# Setup

In [1]:
import gc
import pandas as pd
pd.set_option('display.max_columns', None)

from sklearn.metrics import median_absolute_error
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesRegressor

# Single seed shared by the row samples, the KFold shuffle and the model.
SEED = 55

In [2]:
# Competition files.
DATA_DIR = '/kaggle/input/playground-series-s3e25'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# External "original" datasets; both files live in the same dataset folder,
# so the folder is named once instead of being repeated in each path.
ORIGINAL_DIR = '/kaggle/input/prediction-of-mohs-hardness-with-machine-learning/jm79zfps6b-1'
original_train = pd.read_csv(f'{ORIGINAL_DIR}/Mineral_Dataset_Supplementary_Info.csv')
original_val = pd.read_csv(f'{ORIGINAL_DIR}/Artificial_Crystals_Dataset.csv')

# Data overview

In [3]:
# Peek at one pseudo-random training row (sample() draws a single row by default).
train.sample(random_state=SEED)

Unnamed: 0,id,allelectrons_Total,density_Total,allelectrons_Average,val_e_Average,atomicweight_Average,ionenergy_Average,el_neg_chi_Average,R_vdw_element_Average,R_cov_element_Average,zaratio_Average,density_Average,Hardness
6232,6232,106.0,8.337992,10.4,4.8,21.989312,11.33044,2.502,1.708,0.944,0.495936,0.82753,6.0


In [4]:
# Same peek for the test set; the output shows it has no Hardness column.
test.sample(random_state=SEED)

Unnamed: 0,id,allelectrons_Total,density_Total,allelectrons_Average,val_e_Average,atomicweight_Average,ionenergy_Average,el_neg_chi_Average,R_vdw_element_Average,R_cov_element_Average,zaratio_Average,density_Average
1805,12212,50.0,3.253996,10.0,4.8,20.016938,11.64556,2.774,1.714,0.882,0.499726,0.77105


In [5]:
# One row of the original training data (Mineral_Dataset_Supplementary_Info.csv).
original_train.sample(random_state=SEED)

Unnamed: 0.1,Unnamed: 0,Hardness,allelectrons_Total,density_Total,allelectrons_Average,val_e_Average,atomicweight_Average,ionenergy_Average,el_neg_chi_Average,R_vdw_element_Average,R_cov_element_Average,zaratio_Average,density_Average
63,64,2.3,110.0,23.0,36.666667,2.666667,82.598467,8.504133,2.146667,2.006667,1.253333,0.456803,7.666667


In [6]:
# One row of the original validation data; the target is named 'Hardness (Mohs)' here.
original_val.sample(random_state=SEED)

Unnamed: 0.1,Unnamed: 0,Formula,Crystal structure,Hardness (Mohs),allelectrons_Total,density_Total,allelectrons_Average,val_e_Average,atomicweight_Average,ionenergy_Average,el_neg_chi_Average,R_vdw_element_Average,R_cov_element_Average,zaratio_Average,density_Average
6,6,BiB3O6,monoclinic,5.75,146.0,16.864992,14.6,5.0,33.739258,11.38881,2.866,1.695,0.786,0.478464,1.686499


In [7]:
# Column dtypes and non-null counts of the competition training data.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10407 entries, 0 to 10406
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     10407 non-null  int64  
 1   allelectrons_Total     10407 non-null  float64
 2   density_Total          10407 non-null  float64
 3   allelectrons_Average   10407 non-null  float64
 4   val_e_Average          10407 non-null  float64
 5   atomicweight_Average   10407 non-null  float64
 6   ionenergy_Average      10407 non-null  float64
 7   el_neg_chi_Average     10407 non-null  float64
 8   R_vdw_element_Average  10407 non-null  float64
 9   R_cov_element_Average  10407 non-null  float64
 10  zaratio_Average        10407 non-null  float64
 11  density_Average        10407 non-null  float64
 12  Hardness               10407 non-null  float64
dtypes: float64(12), int64(1)
memory usage: 1.0 MB


In [8]:
# (rows, columns) of every loaded frame, in a fixed order.
tuple(
    frame.shape
    for frame in (original_train, original_val, train, test, sample_sub)
)

((622, 13), (52, 15), (10407, 13), (6939, 12), (6939, 2))

**Missing values**

In [9]:
# Per frame: the number of columns that contain at least one missing value.
tuple(
    frame.isna().any().sum()
    for frame in (original_train, original_val, train, test)
)

(0, 1, 0, 0)

In [10]:
# Show the rows of the original validation data that have any missing value.
original_val[original_val.isna().any(axis=1)]

Unnamed: 0.1,Unnamed: 0,Formula,Crystal structure,Hardness (Mohs),allelectrons_Total,density_Total,allelectrons_Average,val_e_Average,atomicweight_Average,ionenergy_Average,el_neg_chi_Average,R_vdw_element_Average,R_cov_element_Average,zaratio_Average,density_Average
15,15,BaWF8,,3.0,202.0,22.81264,20.2,6.0,47.315423,15.24581,3.443,1.662,0.836,0.460005,2.281264


We can keep this row: its only missing value is in `Crystal structure`, a column that the other datasets do not have and that we drop below.

**Fixing column names and removing irrelevant columns**

In [11]:
# Give the target the same name it has in the other datasets.
original_val = original_val.rename(columns={'Hardness (Mohs)': 'Hardness'})

In [12]:
# Drop identifier / metadata columns that are not model inputs.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op rather than a
# KeyError on the already-removed columns.
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')
original_train = original_train.drop(columns=['Unnamed: 0'], errors='ignore')
original_val = original_val.drop(
    columns=['Unnamed: 0', 'Formula', 'Crystal structure'],
    errors='ignore',
)

In [13]:
# Use the competition training layout as the canonical column order and
# re-order (and restrict) both original datasets to it.
column_order = list(train.columns)

original_train, original_val = (
    frame[column_order] for frame in (original_train, original_val)
)

**Target distribution**

In [14]:
TARGET = 'Hardness'

# Summary statistics of the target, one column per dataset. The series have
# different lengths, so each column's statistics cover only its own rows.
pd.DataFrame({
    name: frame[TARGET]
    for name, frame in (
        ('train', train),
        ('original_train', original_train),
        ('original_val', original_val),
    )
}).describe()

Unnamed: 0,train,original_train,original_val
count,10407.0,622.0,52.0
mean,4.647126,4.613987,5.448462
std,1.680525,1.729636,1.523433
min,1.0,1.0,2.5
25%,3.0,3.0,4.5
50%,5.5,5.5,5.5
75%,6.0,5.95,6.5
max,10.0,10.0,9.0


**Combining original train and validation datasets**

In [15]:
# Stack both original datasets row-wise into one frame with a fresh 0..n-1 index.
original = pd.concat(
    [original_train, original_val],
    ignore_index=True,
)

In [16]:
# Sanity check: expect 622 + 52 = 674 rows and the 12 columns shared with train.
original.shape

(674, 12)

# Cross-validation framework

In [17]:
def eval_metric(y_true, y_pred):
    """Return the median absolute error between `y_true` and `y_pred`."""
    error = median_absolute_error(y_true, y_pred)
    return error

In [18]:
def custom_cv(features, model, fit_params=None, folds=7, extend=False, seed=SEED, verbose=True):
    """Run shuffled K-fold cross-validation of `model` on the competition data.

    Reads the notebook-level frames `train` and `test` (and `original` when
    `extend` is set). The same `model` object is re-fitted on every fold.

    Parameters
    ----------
    features : column labels used as model inputs.
    model : estimator exposing `fit(X, y, **kwargs)` and `predict(X)`.
    fit_params : optional dict of extra keyword arguments for `model.fit`.
    folds : number of KFold splits.
    extend : if True, append the rows of `original` to each training fold
        (never to the validation fold).
    seed : random_state of the KFold shuffle.
    verbose : if True, print the score of every fold.

    Returns
    -------
    oof_preds : pd.Series of out-of-fold predictions, keyed by row position
        in `train` and sorted by that position.
    test_preds : pd.DataFrame with one `fold{i}` column of test predictions
        per fold plus a `mean` column averaging them row-wise.

    Note: the overall OOF score is printed regardless of `verbose`.
    """
    inputs, target = train[features], train[TARGET]
    test_inputs = test[features]

    oof_by_row = {}
    fold_test_preds = {}
    # Models such as XGBoost or LightGBM may need extra fit arguments;
    # most sklearn models are fitted without any.
    extra_fit_kwargs = fit_params if fit_params else {}

    splitter = KFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (fit_rows, holdout_rows) in enumerate(splitter.split(inputs, target)):
        fit_inputs, fit_target = inputs.iloc[fit_rows], target.iloc[fit_rows]
        holdout_inputs, holdout_target = inputs.iloc[holdout_rows], target.iloc[holdout_rows]

        if extend:
            # The original dataset is added to the training fold only.
            fit_inputs = pd.concat([fit_inputs, original[features]], axis=0, ignore_index=True)
            fit_target = pd.concat([fit_target, original[TARGET]], axis=0, ignore_index=True)

        model.fit(fit_inputs, fit_target, **extra_fit_kwargs)

        holdout_preds = model.predict(holdout_inputs)
        for row, pred in zip(holdout_rows, holdout_preds):
            oof_by_row[row] = pred
        fold_test_preds[f'fold{fold}'] = model.predict(test_inputs)

        if verbose:
            fold_score = eval_metric(holdout_target, holdout_preds)
            print(f'Fold #{fold}: {fold_score:.4f}', end = ' | ')

        gc.collect()

    test_preds = pd.DataFrame.from_dict(fold_test_preds)
    test_preds['mean'] = test_preds.mean(axis=1)  # mean of fold-wise predictions
    oof_preds = pd.Series(oof_by_row).sort_index()
    print(f'OOF score: {eval_metric(target, oof_preds):.4f}\n')

    return oof_preds, test_preds

In [19]:
# Baseline estimator: 250 extra-trees limited to depth 12.
model = ExtraTreesRegressor(
    random_state=SEED,
    n_estimators=250,
    max_depth=12,
    max_features=None,  # None = consider every feature at each split
    n_jobs=4,
)

# Every column still present in `test` is used as a model input.
features = test.columns

In [20]:
%%time
# Baseline: fit on the competition training data only.
oof_preds_trn, test_preds_trn = custom_cv(features, model)

Fold #0: 0.7319 | Fold #1: 0.6764 | Fold #2: 0.6463 | Fold #3: 0.6457 | Fold #4: 0.6415 | Fold #5: 0.7548 | Fold #6: 0.7277 | OOF score: 0.6883

CPU times: user 30.6 s, sys: 786 ms, total: 31.4 s
Wall time: 12.2 s


In [21]:
%%time
# Same model, with the original dataset appended to every training fold.
oof_preds_ext, test_preds_ext = custom_cv(features, model, extend=True)

Fold #0: 0.7171 | Fold #1: 0.6738 | Fold #2: 0.6497 | Fold #3: 0.6494 | Fold #4: 0.6401 | Fold #5: 0.7487 | Fold #6: 0.7159 | OOF score: 0.6785

CPU times: user 32.8 s, sys: 724 ms, total: 33.6 s
Wall time: 13.1 s


# Submission files

In [22]:
def create_submission_files(test_preds, config, notebook='00'):
    """Write one submission CSV per column of `test_preds`.

    Each file is a copy of `sample_sub` whose target column is set to that
    column's predictions rounded to 4 decimals. It is saved, without the
    index, as `{notebook}_{config}_{column}.csv` in the working directory.
    """
    for pred_name in test_preds.columns:
        rounded_preds = test_preds[pred_name].round(4)
        out_path = f'{notebook}_{config}_{pred_name}.csv'

        submission = sample_sub.copy()
        submission[TARGET] = rounded_preds
        submission.to_csv(out_path, index=False)

In [23]:
# One file per fold plus the fold mean, for both the train-only and the extended run.
create_submission_files(test_preds_trn, 'baseline_trn')
create_submission_files(test_preds_ext, 'baseline_ext')

In [24]:
# Spot-check the first lines of the fold-averaged train-only submission.
!head 00_baseline_trn_mean.csv

id,Hardness
10407,2.65
10408,2.9092
10409,5.6402
10410,4.7828
10411,5.2723
10412,4.7614
10413,3.7718
10414,5.4142
10415,3.3939


**Time to submit!**