In [1]:
from sklearn.utils.class_weight import compute_class_weight 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score 
from sklearn.base import clone
import pandas as pd
import numpy as np

In [2]:
# Load the competition data; the `id` column becomes the index of both frames.
train = pd.read_csv('/kaggle/input/ai-competition-africa/train.csv', index_col='id')
test = pd.read_csv('/kaggle/input/ai-competition-africa/test.csv', index_col='id')

# Cast the target to int once and shift it down by 3 with a vectorized
# subtraction; the submission cell adds the 3 back after prediction.
train['quality'] = train['quality'].astype(int) - 3

# Features are every column except the target.
X, y = train.drop('quality', axis=1), train['quality']

In [3]:
# Balanced per-class weights, keyed by the labels actually present in `y`.
# Each weight is paired with its own label rather than indexed through a fixed
# range(5), so the mapping stays correct when the number of classes is not 5
# or the labels are not exactly 0..4.
present_classes = np.unique(y)
balanced_weights = compute_class_weight(class_weight='balanced', classes=present_classes, y=y)
# .tolist() turns the numpy labels into plain Python scalars for the dict keys.
class_weights = dict(zip(present_classes.tolist(), balanced_weights))

In [4]:
# Random forest template used for every CV fold: seeded with a fixed
# random_state and weighted per class through `class_weights`.
model = RandomForestClassifier(
    n_estimators=1985,
    min_samples_split=3,
    min_samples_leaf=6,
    class_weight=class_weights,
    random_state=1807,
    n_jobs=-1,
)

In [5]:
# Stratified K-fold cross-validation: score every fold with quadratic weighted
# kappa (QWK) and average the per-fold test probabilities into one ensemble.
# N_SPLITS is the single source of truth for both the splitter and the
# averaging divisor, so the two cannot drift apart.
N_SPLITS = 10
classes = np.unique(y)  # sorted encoded labels; one probability column each

scores = []
test_pred_probs = np.zeros((test.shape[0], len(classes)))
skf = StratifiedKFold(n_splits=N_SPLITS, random_state=1807, shuffle=True)
for fold_idx, (train_index, val_index) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Fit a fresh, unfitted copy for each fold. `model` is rebound on purpose
    # so the last fold's fitted estimator remains available after the loop.
    model = clone(model).fit(X_train, y_train)

    # predict_proba columns follow model.classes_, so translate column
    # positions back to labels instead of assuming column j means label j.
    y_pred_probs = model.predict_proba(X_val)
    y_val_pred = model.classes_[np.argmax(y_pred_probs, axis=1)]

    # Accumulate this fold's share of the test probabilities into the columns
    # that belong to the classes this fold's model actually knows.
    class_columns = np.searchsorted(classes, model.classes_)
    test_pred_probs[:, class_columns] += model.predict_proba(test) / N_SPLITS

    score = cohen_kappa_score(y_val, y_val_pred, weights='quadratic')
    scores.append(score)

    print(f'Fold {fold_idx + 1} - QWK: {score:.4f}')

print(f'\nQWK: {np.mean(scores):.6f} ± {np.std(scores):.6f}')

Fold 1 - QWK: 0.4128
Fold 2 - QWK: 0.3987
Fold 3 - QWK: 0.4028
Fold 4 - QWK: 0.4320
Fold 5 - QWK: 0.4384
Fold 6 - QWK: 0.4213
Fold 7 - QWK: 0.4518
Fold 8 - QWK: 0.4300
Fold 9 - QWK: 0.4294
Fold 10 - QWK: 0.4169

QWK: 0.423416 ± 0.015424


In [6]:
# Build the submission from the fold-averaged test probabilities.
sub = pd.read_csv('/kaggle/input/ai-competition-africa/sample_submission.csv', index_col='id')

# Column j of test_pred_probs corresponds to the j-th sorted encoded label, so
# decode through np.unique(y) rather than treating the column index as the
# label; then undo the -3 shift applied to the target when it was loaded.
encoded_pred = np.unique(y)[np.argmax(test_pred_probs, axis=1)]
# NOTE(review): this assignment is positional -- it assumes the rows of
# sample_submission.csv are in the same order as `test`; confirm.
sub['quality'] = encoded_pred + 3

# The mean CV score is embedded in the file name to tell submissions apart.
sub.to_csv(f'sub_rf_{np.mean(scores):.4f}.csv')
sub.head()

Unnamed: 0_level_0,quality
id,Unnamed: 1_level_1
15000,5
15001,5
15002,6
15003,6
15004,5
