In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd
from sklearn.metrics import cohen_kappa_score
import xgboost as xgb
from tqdm import tqdm

In [2]:
all_train_embeds = np.load('/content/drive/MyDrive/all_train_embeds.npy')
all_test_embeds = np.load('/content/drive/MyDrive/all_test_embeds.npy')

In [3]:
print(f"all_train_embeds shape: {all_train_embeds.shape}, data structure: {type(all_train_embeds)}")
print(f"all_test_embeds shape: {all_test_embeds.shape}, data structure: {type(all_test_embeds)}")

all_train_embeds shape: (17307, 9216), data structure: <class 'numpy.ndarray'>
all_test_embeds shape: (3, 9216), data structure: <class 'numpy.ndarray'>


In [4]:
preprocessed_features = pd.read_csv('/content/drive/MyDrive/preprocessed_features .csv') ###Will calculate on the run for the test set
columns_to_drop = preprocessed_features.filter(regex='^tfidf_').columns
preprocessed_features = preprocessed_features.drop(columns=columns_to_drop)
preprocessed_features = preprocessed_features.drop(['essay_id'],axis=1)
print(f"preprocessed_features shape: {preprocessed_features.shape}, data structure: {type(preprocessed_features)}")




preprocessed_features shape: (17307, 89), data structure: <class 'pandas.core.frame.DataFrame'>


In [5]:
preprocessed_features_array = preprocessed_features.values

# Check if the number of rows match
if all_train_embeds.shape[0] == preprocessed_features_array.shape[0]:
    # Horizontally stack the numpy array and the converted DataFrame array
    train_features = np.hstack((all_train_embeds, preprocessed_features_array))
else:
    raise ValueError("The number of rows in all_train_embeds and preprocessed_features must match")

print(train_features.shape)

(17307, 9305)


In [6]:
test_features = train_features[-7:]#Temporary
train_features=train_features[:17300]

In [7]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]= "0,1"

In [8]:
train = pd.read_csv('/content/drive/MyDrive/train.csv')
test = pd.read_csv('/content/drive/MyDrive/test.csv')
test = train.tail(7)
train = train.head(17300)

In [9]:
def find_thresholds(true, pred, steps=50):

    # SAVE TRIALS FOR PLOTTING
    xs = [[],[],[],[],[]]
    ys = [[],[],[],[],[]]

    # COMPUTE BASELINE METRIC
    threshold = [1.5, 2.5, 3.5, 4.5, 5.5]
    pred2 = pd.cut(pred, [-np.inf] + threshold + [np.inf],
                    labels=[1,2,3,4,5,6]).astype('int32')
    best = cohen_kappa_score(true, pred2, weights="quadratic")

    # FIND FIVE OPTIMAL THRESHOLDS
    for k in range(5):
        for sign in [1,-1]:
            v = threshold[k]
            threshold2 = threshold.copy()
            stop = 0
            while stop<steps:

                # TRY NEW THRESHOLD
                v += sign * 0.001
                threshold2[k] = v
                pred2 = pd.cut(pred, [-np.inf] + threshold2 + [np.inf],
                                labels=[1,2,3,4,5,6]).astype('int32')
                metric = cohen_kappa_score(true, pred2, weights="quadratic")

                # SAVE TRIALS FOR PLOTTING
                xs[k].append(v)
                ys[k].append(metric)

                # EARLY STOPPING
                if metric<=best:
                    stop += 1
                else:
                    stop = 0
                    best = metric
                    threshold = threshold2.copy()

    # COMPUTE FINAL METRIC
    pred2 = pd.cut(pred, [-np.inf] + threshold + [np.inf],
                    labels=[1,2,3,4,5,6]).astype('int32')
    best = cohen_kappa_score(true, pred2, weights="quadratic")

    # RETURN RESULTS
    threshold = [np.round(t,3) for t in threshold]
    return best, threshold, xs, ys
def comp_score(self, y_true, y_pred):
    p = y_pred.clip(1, 6).round(0)
    m = cohen_kappa_score(y_true, p, weights='quadratic')
    return m

In [12]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold

class XGBoostModel:
    def __init__(self, train, test, train_features, test_features, find_thresholds, comp_score, FOLDS=3):
        self.train = train.copy()
        self.test = test.copy()
        self.train_features = train_features
        self.test_features = test_features
        self.find_thresholds = find_thresholds
        self.comp_score_func = comp_score
        self.FOLDS = FOLDS
        self.xgb_params = {
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'booster': 'gbtree',
            'max_depth': 6,
            'learning_rate': 0.01,
            'subsample': 0.8,
            'colsample_bytree': 0.6,
            'colsample_bylevel': 0.6,
            'n_estimators': 10000,
            'device': 'cuda',
            'early_stopping_rounds': 100,
            'gamma': 0.1,
            'min_child_weight': 1,
            'reg_alpha': 0.1,
            'reg_lambda': 1,
        }
        self.oof = np.zeros(len(self.train), dtype='float32')
        self.test_preds = np.zeros((len(self.test), self.FOLDS), dtype='float32')

    def load_kfold(self):
        self.train["fold"] = -1
        skf = StratifiedKFold(n_splits=self.FOLDS, shuffle=True, random_state=42)
        for fold, (train_index, val_index) in enumerate(skf.split(self.train, self.train["score"])):
            self.train.loc[val_index, "fold"] = fold

    def train_model(self):
        for fold in range(self.FOLDS):
            print('#'*25)
            print('### Fold', fold+1)
            print('#'*25)

            train_index = self.train["fold"] != fold
            valid_index = self.train["fold"] == fold

            X_train = self.train_features[train_index]
            y_train = self.train.loc[train_index, 'score'].values
            X_valid = self.train_features[valid_index]
            y_valid = self.train.loc[valid_index, 'score'].values

            model = xgb.XGBRegressor(**self.xgb_params)
            model.fit(
                X_train, y_train,
                eval_set=[(X_valid, y_valid)],
                verbose=100
            )

            self.oof[valid_index] = model.predict(X_valid)
            self.test_preds[:, fold] = model.predict(self.test_features)

            score = self.comp_score_func(y_valid, self.oof[valid_index])
            print(f"=> QWK score: {score}")
            print()

    def optimize_thresholds(self):
        print('#'*25)
        initial_score = self.comp_score_func(self.train['score'].values, self.oof)
        print('Overall CV QWK score before threshold optimization =', initial_score)

        self.best_score, self.best_thresholds, _, _ = self.find_thresholds(self.train['score'].values, self.oof)

        print(f"Best QWK score after threshold optimization: {self.best_score}")
        print(f"Best thresholds: {self.best_thresholds}")

        self.oof_optimized = pd.cut(self.oof, [-np.inf] + self.best_thresholds + [np.inf],
                                    labels=[1,2,3,4,5,6]).astype('int32')

        self.final_score = self.comp_score_func(self.train['score'].values, self.oof_optimized)
        print(f"Final QWK score after threshold optimization: {self.final_score}")

    def get_test_predictions(self):
        test_preds_avg = np.mean(self.test_preds, axis=1)
        return pd.cut(test_preds_avg, [-np.inf] + self.best_thresholds + [np.inf],
                      labels=[1,2,3,4,5,6]).astype('int32')

    def run(self):
        self.load_kfold()
        self.train_model()
        self.optimize_thresholds()
        return self.get_test_predictions()

In [14]:
xgb_model = XGBoostModel(train, test, train_features, test_features, find_thresholds, comp_score)
xgb_model.run()

#########################
### Fold 1
#########################
[0]	validation_0-rmse:1.03860
[100]	validation_0-rmse:0.68890
[200]	validation_0-rmse:0.59677
[300]	validation_0-rmse:0.56989
[400]	validation_0-rmse:0.55898
[500]	validation_0-rmse:0.55354
[600]	validation_0-rmse:0.54988
[700]	validation_0-rmse:0.54768
[800]	validation_0-rmse:0.54604
[900]	validation_0-rmse:0.54486
[1000]	validation_0-rmse:0.54393
[1100]	validation_0-rmse:0.54313
[1200]	validation_0-rmse:0.54265
[1300]	validation_0-rmse:0.54225
[1400]	validation_0-rmse:0.54189
[1500]	validation_0-rmse:0.54154
[1600]	validation_0-rmse:0.54132
[1700]	validation_0-rmse:0.54110
[1800]	validation_0-rmse:0.54097
[1900]	validation_0-rmse:0.54072
[2000]	validation_0-rmse:0.54062
[2100]	validation_0-rmse:0.54053
[2200]	validation_0-rmse:0.54036
[2300]	validation_0-rmse:0.54026
[2400]	validation_0-rmse:0.54017
[2500]	validation_0-rmse:0.54005
[2600]	validation_0-rmse:0.54000
[2700]	validation_0-rmse:0.53992
[2800]	validation_0-rmse:

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




TypeError: comp_score() missing 1 required positional argument: 'y_pred'

In [None]:
class GRUNet(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers):
        super(GRUNet, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.gru = nn.GRU(input_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        h0 = torch.zeros(self.n_layers, x.size(0), self.hidden_dim).to(x.device)
        out, _ = self.gru(x, h0)
        out = self.fc(out[:, -1, :])  # Get the last time step output
        return out

In [None]:
input_dim = features.shape[2]
hidden_dim = 64
output_dim = 1
n_layers = 2

model = GRUNet(input_dim, hidden_dim, output_dim, n_layers)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)