### If this notebook is helpful, please upvote [the original version](https://www.kaggle.com/code/abhishek/tez-for-feedback-v2-0)! (score: 0.627)

# 1. Imports, helper definitions, settings & data loading

In [None]:
import tree
import glob

import pandas as pd
import seaborn as sns

In [None]:
def show_gradient(df, n_row=None):
    if not n_row:
        n_row = 5

    return df.head(n_row) \
                .assign(all_mean=lambda x: x.mean(axis=1)) \
                    .style.background_gradient(cmap=cm, axis=1)

In [None]:
# Display floats in DataFrame output with a precision of 4.
pd.set_option('display.precision', 4)
# Light-green seaborn palette as a colormap; show_gradient() reads it as the global `cm`.
cm = sns.light_palette('green', as_cmap=True)
# CSS declarations (white bold text on a green background).
# NOTE(review): not referenced anywhere else in the visible notebook.
props_param = "color:white; font-weight:bold; background-color:green;"

# Number of rows shown by the preview cells further down.
N_ROW = 10

In [None]:
# Competition sample submission; its discourse_id column is reused later as the
# index of the per-class prediction tables and as the template for the final file.
submission_path = "../input/feedback-prize-effectiveness/sample_submission.csv"
submission_origin = pd.read_csv(submission_path)

In [None]:
# Quick look at the sample submission layout.
submission_origin.head()

# 2. Check Input / Output

In [None]:
# Print the ../input directory tree, at most 3 levels deep (-L 3);
# --filelimit=10 keeps tree from descending into directories with more than 10 entries.
!tree -L 3 --filelimit=10 ../input

In [None]:
# Working directory (./kaggle/working, the notebook output) before anything is copied or generated.
!tree .

# 3. Extract predictions

In [None]:
# Copy the tez-lib sources into the working directory and pip-install the package from that copy.
!cp -r ../input/tez-lib/ .
!cd tez-lib && pip install .
# Copy the Python scripts from the fb2debertav3large dataset next to the notebook
# (presumably including the main.py that the prediction cells run — verify against the dataset).
!cp ../input/fb2debertav3large/*.py .

In [None]:
# Working directory (./kaggle/working, the notebook output) after the copy/install step, 2 levels deep.
!tree -L 2 .

### How predictions are extracted

```
!python main.py \
--model ../input/deberta-v3-large/deberta-v3-large \
--output ../input/fb2debertav3large \
--input ../input/feedback-prize-effectiveness/ \
--batch_size 2 \
--fold 0 \  <<< [0, 1, 2, 3, 4]
--predict
```

### Description of the process logic (main.py)

```
if __name__ == "__main__":
    args = parse_args()

    if args.predict:
        predict(args)

    [...]
```

### Step 1: parse the command-line arguments (`args = parse_args()`)

```
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--fold", type=int, required=False, default=0)
    parser.add_argument("--model", type=str, required=False, default="microsoft/deberta-base")
    parser.add_argument("--lr", type=float, required=False, default=3e-5)
    parser.add_argument("--output", type=str, default=".", required=False)
    parser.add_argument("--input", type=str, default="../input", required=False)
    parser.add_argument("--max_len", type=int, default=1024, required=False)
    parser.add_argument("--batch_size", type=int, default=2, required=False)
    parser.add_argument("--valid_batch_size", type=int, default=16, required=False)
    parser.add_argument("--epochs", type=int, default=5, required=False)
    parser.add_argument("--accumulation_steps", type=int, default=1, required=False)
    parser.add_argument("--predict", action="store_true", required=False)
    return parser.parse_args()
```

### Step 2: run inference (`predict(args)`)

```
def predict(args):
    NUM_JOBS = 2
    seed_everything(42)
    df = pd.read_csv(os.path.join(args.input, "test.csv"))
    df.loc[:, "discourse_effectiveness"] = "Adequate"

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    samples = prepare_training_data(df, tokenizer, args, num_jobs=NUM_JOBS, is_train=False)
    samples = list(sorted(samples, key=lambda d: len(d["input_ids"])))

    dataset = FeedbackDataset(samples, args, tokenizer)
    num_train_steps = int(len(dataset) / args.batch_size / args.accumulation_steps * args.epochs)
    
    [...]

    preds = np.vstack(preds)

    sample_submission = pd.read_csv(os.path.join(args.input, "sample_submission.csv"))
    sample_submission.loc[:, "discourse_id"] = [x["discourse_id"] for x in samples]
    sample_submission.loc[:, "Ineffective"] = preds[:, 0]
    sample_submission.loc[:, "Adequate"] = preds[:, 1]
    sample_submission.loc[:, "Effective"] = preds[:, 2]
    sample_submission.to_csv(f"preds_{args.fold}.csv", index=False)
```

In [None]:
# Fold 0 inference. Per the main.py excerpt above, predict() writes preds_0.csv
# into the current working directory.
# NOTE(review): --output presumably points at the directory holding the trained fold weights — confirm in main.py.
!python main.py \
--model ../input/deberta-v3-large/deberta-v3-large \
--output ../input/fb2debertav3large \
--input ../input/feedback-prize-effectiveness/ \
--batch_size 2 \
--fold 0 \
--predict

In [None]:
# Fold 1 inference. Per the main.py excerpt above, predict() writes preds_1.csv
# into the current working directory.
# NOTE(review): --output presumably points at the directory holding the trained fold weights — confirm in main.py.
!python main.py \
--model ../input/deberta-v3-large/deberta-v3-large \
--output ../input/fb2debertav3large \
--input ../input/feedback-prize-effectiveness/ \
--batch_size 2 \
--fold 1 \
--predict

In [None]:
# Fold 2 inference. Per the main.py excerpt above, predict() writes preds_2.csv
# into the current working directory.
# NOTE(review): --output presumably points at the directory holding the trained fold weights — confirm in main.py.
!python main.py \
--model ../input/deberta-v3-large/deberta-v3-large \
--output ../input/fb2debertav3large \
--input ../input/feedback-prize-effectiveness/ \
--batch_size 2 \
--fold 2 \
--predict

In [None]:
# Fold 3 inference. Per the main.py excerpt above, predict() writes preds_3.csv
# into the current working directory.
# NOTE(review): --output presumably points at the directory holding the trained fold weights — confirm in main.py.
!python main.py \
--model ../input/deberta-v3-large/deberta-v3-large \
--output ../input/fb2debertav3large \
--input ../input/feedback-prize-effectiveness/ \
--batch_size 2 \
--fold 3 \
--predict

In [None]:
# Fold 4 inference. Per the main.py excerpt above, predict() writes preds_4.csv
# into the current working directory.
# NOTE(review): --output presumably points at the directory holding the trained fold weights — confirm in main.py.
!python main.py \
--model ../input/deberta-v3-large/deberta-v3-large \
--output ../input/fb2debertav3large \
--input ../input/feedback-prize-effectiveness/ \
--batch_size 2 \
--fold 4 \
--predict

### Inference takes about 1 hour per fold, so all 5 folds take about 5 hours.

## Score:
- fold 0: 0.650
- fold 1: 0.662
- fold 2: 0.652
- fold 3: 0.656
- fold 4: 0.648
 

In [None]:
# Working directory (./kaggle/working, the notebook output) after the prediction runs, 2 levels deep.
!tree -L 2 .

# 4. Collect & Check predictions

In [None]:
# Collect the per-fold predictions into one table per class. Each table is
# indexed by discourse_id and gets one column per prediction file.
ID_COL = "discourse_id"
empty_df = submission_origin[ID_COL].to_frame().set_index(ID_COL)

ineffective = empty_df.copy()
effective = empty_df.copy()
adequate = empty_df.copy()

# Match only the files main.py writes (preds_<fold>.csv). A bare "*.csv" would
# also pick up submission.csv when this cell is re-run after the final cell.
# Sorting fixes the column order, which matters for the optional per-fold weights.
csvs = sorted(glob.glob("preds_*.csv"))
assert csvs, "no preds_*.csv files found - run the prediction cells first"

for idx, csv_path in enumerate(csvs):
    fold_preds = pd.read_csv(csv_path).set_index(ID_COL)

    # Column assignment aligns on the discourse_id index, so the row order
    # inside each prediction file does not matter.
    ineffective[idx] = fold_preds['Ineffective']
    effective[idx] = fold_preds['Effective']
    adequate[idx] = fold_preds['Adequate']

In [None]:
# "Ineffective" probabilities: one column per prediction file, first N_ROW rows plus the row mean.
show_gradient(ineffective, N_ROW)

In [None]:
# "Effective" probabilities: one column per prediction file, first N_ROW rows plus the row mean.
show_gradient(effective, N_ROW)

In [None]:
# "Adequate" probabilities: one column per prediction file, first N_ROW rows plus the row mean.
show_gradient(adequate, N_ROW)

# 5. Create submission

In [None]:
# Build the final submission from the sample-submission template.
# *** Baseline ***: each class probability is the plain mean over all prediction files.
#
# *** Use weights *** (alternative): replace each `.mean(axis=1)` below with
# `.mul(weights_).sum(axis=1)`, e.g.
#     weights_ = [.2, .2, .2, .2, .2]
#     Ineffective=ineffective.mul(weights_).sum(axis=1)
#     Effective=effective.mul(weights_).sum(axis=1)
#     Adequate=adequate.mul(weights_).sum(axis=1)
submission = (
    submission_origin
    .copy()
    .set_index(ID_COL)
    # Values are aligned on the discourse_id index.
    .assign(
        Ineffective=ineffective.mean(axis=1),
        Effective=effective.mean(axis=1),
        Adequate=adequate.mean(axis=1),
    )
    .reset_index()
    # Order rows by discourse_id and renumber them from 0.
    .sort_values(ID_COL)
    .reset_index(drop=True)
)

In [None]:
# Preview the first N_ROW rows of the final submission.
submission.head(N_ROW)

In [None]:
# Reference values kept for comparison with the preview above.
# NOTE(review): columns are presumably row number, discourse_id, Ineffective,
# Adequate, Effective — confirm against the run that produced them.
# 0	2e214524dbe3	0.0345	0.7430	0.2225
# 1	5a88900e7dc1	0.0643	0.7839	0.1518
# 2	739a6d00f44a	0.0581	0.7537	0.1883
# 3	75ce6d68b67b	0.0691	0.7597	0.1712
# 4	84812fc2ab9f	0.0324	0.6514	0.3162
# 5	93578d946723	0.0491	0.7361	0.2148
# 6	9790d835736b	0.0316	0.7073	0.2611
# 7	a261b6e14276	0.0157	0.5161	0.4682
# 8	bcfae2c9a244	0.0457	0.7948	0.1594
# 9	c668ff840720	0.0363	0.7305	0.2331

In [None]:
# Write the final submission to the working directory, without the positional index column.
submission.to_csv("submission.csv", index=False)