In [1]:
from sklearn.utils.class_weight import compute_class_weight 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score 
from sklearn.base import clone
import pandas as pd
import numpy as np

In [2]:
# Load the competition data; the `id` column becomes the index of both frames.
train = pd.read_csv('/kaggle/input/ai-competition-africa/train.csv', index_col='id')
test = pd.read_csv('/kaggle/input/ai-competition-africa/test.csv', index_col='id')

# Encode the target as an integer shifted down by 3, in a single vectorized
# pass (previously two identical casts followed by a per-row lambda).
# The submission cell adds the 3 back, so the two offsets must stay in sync.
train['quality'] = train['quality'].astype(int) - 3

# Feature matrix and encoded target used by the cells below.
X, y = train.drop('quality', axis=1), train['quality']

In [3]:
# Per-class weights from scikit-learn's 'balanced' setting, computed on the
# full training target and keyed by encoded label for the classifier.
classes = np.unique(y)
class_weights = compute_class_weight(y=y, classes=classes, class_weight='balanced')
class_weight_dict = {label: weight for label, weight in zip(classes, class_weights)}

In [4]:
# Unfitted random-forest template; the cross-validation cell clones it per fold.
# The fixed random_state keeps the forest repeatable between runs.
model = RandomForestClassifier(
    n_estimators=1320,
    min_samples_split=9,
    min_samples_leaf=4,
    class_weight=class_weight_dict,
    random_state=1807,
    n_jobs=-1,
)

In [5]:
# Cross-validate the forest with stratified folds.
# Each fold is scored by quadratic-weighted Cohen's kappa (QWK) on its
# validation split, and its test-set probabilities are averaged into
# `test_pred_probs` (one column per encoded class found in `y`).
skf = StratifiedKFold(n_splits=10, random_state=1807, shuffle=True)
n_splits = skf.get_n_splits()  # reused for averaging so it cannot drift from the splitter

all_classes = np.unique(y)
scores = []
test_pred_probs = np.zeros((test.shape[0], len(all_classes)))

for fold_idx, (train_index, val_index) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Fit a fresh clone each fold; the template `model` is left untouched
    # (and therefore unfitted) so this cell never depends on its own output.
    fold_model = clone(model).fit(X_train, y_train)

    # predict_proba columns are ordered like fold_model.classes_, so map the
    # winning column through classes_ instead of assuming column i == label i.
    y_pred_probs = fold_model.predict_proba(X_val)
    y_val_pred = fold_model.classes_[np.argmax(y_pred_probs, axis=1)]

    # Add this fold's share of the test probabilities to the columns that
    # match the classes it was trained on (all columns in the usual case).
    class_cols = np.searchsorted(all_classes, fold_model.classes_)
    test_pred_probs[:, class_cols] += fold_model.predict_proba(test) / n_splits

    score = cohen_kappa_score(y_val, y_val_pred, weights='quadratic')
    scores.append(score)

    print(f'Fold {fold_idx + 1} - QWK: {score:.4f}')

print(f'\nQWK: {np.mean(scores):.6f} ± {np.std(scores):.6f}')

Fold 1 - QWK: 0.4081
Fold 2 - QWK: 0.4017
Fold 3 - QWK: 0.3979
Fold 4 - QWK: 0.4302
Fold 5 - QWK: 0.4358
Fold 6 - QWK: 0.4229
Fold 7 - QWK: 0.4523
Fold 8 - QWK: 0.4251
Fold 9 - QWK: 0.4304
Fold 10 - QWK: 0.4210

QWK: 0.422525 ± 0.015634


In [6]:
# Turn the fold-averaged test probabilities into a submission file.
sub = pd.read_csv('/kaggle/input/ai-competition-africa/sample_submission.csv', index_col='id')

# Columns of test_pred_probs follow the sorted encoded labels in `y`; look the
# winning column up there rather than treating the column index as the label,
# then add back the 3 that was subtracted when the target was encoded.
encoded_labels = np.unique(y)
sub['quality'] = encoded_labels[np.argmax(test_pred_probs, axis=1)] + 3

# The mean CV score is embedded in the file name to identify the run.
sub.to_csv(f'sub_rf_{np.mean(scores):.4f}.csv')
sub.head()

Unnamed: 0_level_0,quality
id,Unnamed: 1_level_1
15000,5
15001,5
15002,6
15003,6
15004,5
