In [1]:
import os
import random
import numpy as np
from operator import methodcaller


# --- Configuration ---------------------------------------------------------
seed = 42                     # seeds both `random` and NumPy's global RNG
old_data_path = "./datasets"  # directory scanned for input text files
new_save_path = './splits'    # receives <name>.train.txt and <name>.val.txt
split_per = 0.1               # per-line probability of landing in the val split

random.seed(seed)
np.random.seed(seed)
os.makedirs(new_save_path, exist_ok=True)

# --- Train / validation split ----------------------------------------------
# Every line of every input file is independently assigned to the validation
# split with probability `split_per`; the remaining lines form the train split.
#
# Files are visited in sorted order: os.listdir() returns entries in arbitrary
# order, and because all datasets draw from the same seeded global RNG, an
# unstable visiting order would give each dataset different random draws from
# run to run despite the fixed seed.
for dataset_file in sorted(os.listdir(old_data_path)):
    if "1b_benchmark" in dataset_file:
        continue
    data_path = os.path.join(old_data_path, dataset_file)
    if not os.path.isfile(data_path):
        # Sub-directories (e.g. Jupyter's .ipynb_checkpoints) cannot be opened
        # as text files; skip them instead of crashing.
        continue
    # NOTE(review): only the text before the first "." is kept, so two inputs
    # sharing that prefix would write to the same output files - confirm the
    # input directory has no such pairs.
    dataset_name = dataset_file.split(".")[0]

    # Iterate the file rather than using str.splitlines(): splitlines() also
    # breaks on form feeds, U+2028, etc., which would cut a single record into
    # several output lines.
    with open(data_path, encoding="utf-8") as file:
        data = [line.rstrip("\n") for line in file]

    split = np.random.random(len(data))
    condition = split < split_per
    train_split = np.flatnonzero(~condition)
    val_split = np.flatnonzero(condition)

    # Index the plain list directly; a NumPy string array would pad every
    # element to the longest line and drop trailing NUL characters.
    train_data = [data[i] for i in train_split]
    with open(os.path.join(new_save_path, f"{dataset_name}.train.txt"), "w", encoding="utf-8") as f:
        f.write("\n".join(train_data))

    val_data = [data[i] for i in val_split]
    with open(os.path.join(new_save_path, f"{dataset_name}.val.txt"), "w", encoding="utf-8") as f:
        f.write("\n".join(val_data))

In [5]:
# Names of the datasets to fine-tune on.
datasets = ["4dim", "news", "products", "questions"]

# Build one ready-to-paste fine-tuning shell command per dataset, then print
# them one per line.
finetune_commands = [
    f"CUDA_VISIBLE_DEVICES=1 python train.py -t finetune -i datasets/{name}.train.txt -p best.pretrain.model -o best.{name}.model"
    for name in datasets
]
print("\n".join(finetune_commands))

CUDA_VISIBLE_DEVICES=1 python train.py -t finetune -i datasets/4dim.train.txt -p best.pretrain.model -o best.4dim.model
CUDA_VISIBLE_DEVICES=1 python train.py -t finetune -i datasets/news.train.txt -p best.pretrain.model -o best.news.model
CUDA_VISIBLE_DEVICES=1 python train.py -t finetune -i datasets/products.train.txt -p best.pretrain.model -o best.products.model
CUDA_VISIBLE_DEVICES=1 python train.py -t finetune -i datasets/questions.train.txt -p best.pretrain.model -o best.questions.model


In [7]:
# Print one evaluation shell command per dataset; relies on the `datasets`
# list defined in an earlier cell.
for name in datasets:
    evaluate_command = f"CUDA_VISIBLE_DEVICES=1 python evaluate.py -m best.{name}.model -i datasets/{name}.train.txt -o outputs/outputs_{name}.txt -t classification -l inline"
    print(evaluate_command)

CUDA_VISIBLE_DEVICES=1 python evaluate.py -m best.4dim.model -i datasets/4dim.train.txt -o outputs/outputs_4dim.txt -t classification -l inline
CUDA_VISIBLE_DEVICES=1 python evaluate.py -m best.news.model -i datasets/news.train.txt -o outputs/outputs_news.txt -t classification -l inline
CUDA_VISIBLE_DEVICES=1 python evaluate.py -m best.products.model -i datasets/products.train.txt -o outputs/outputs_products.txt -t classification -l inline
CUDA_VISIBLE_DEVICES=1 python evaluate.py -m best.questions.model -i datasets/questions.train.txt -o outputs/outputs_questions.txt -t classification -l inline


In [15]:
# Names of the datasets to fine-tune on.
datasets = ["4dim", "news", "products", "questions"]

# Print one fine-tuning shell command per dataset, pointing at the train and
# validation files under splits/.
print(
    *(
        f"CUDA_VISIBLE_DEVICES=3 python train.py -t finetune -i splits/{name}.train.txt -val-path splits/{name}.val.txt -p best.lm.model -o best.{name}.model -epoch 50 -batch-size 128 -max-len 200 -lr 1e-3"
        for name in datasets
    ),
    sep="\n",
)

CUDA_VISIBLE_DEVICES=3 python train.py -t finetune -i splits/4dim.train.txt -val-path splits/4dim.val.txt -p best.lm.model -o best.4dim.model -epoch 50 -batch-size 128 -max-len 200 -lr 1e-3
CUDA_VISIBLE_DEVICES=3 python train.py -t finetune -i splits/news.train.txt -val-path splits/news.val.txt -p best.lm.model -o best.news.model -epoch 50 -batch-size 128 -max-len 200 -lr 1e-3
CUDA_VISIBLE_DEVICES=3 python train.py -t finetune -i splits/products.train.txt -val-path splits/products.val.txt -p best.lm.model -o best.products.model -epoch 50 -batch-size 128 -max-len 200 -lr 1e-3
CUDA_VISIBLE_DEVICES=3 python train.py -t finetune -i splits/questions.train.txt -val-path splits/questions.val.txt -p best.lm.model -o best.questions.model -epoch 50 -batch-size 128 -max-len 200 -lr 1e-3
