In [3]:
!pip install datasets
!pip install scikit-learn
!pip install openpyxl
!pip install transformers
!pip install pandas

Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m973.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas
  Downloading pandas-2.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting fsspec[http]>=2021.11.1
  Downloading fsspec-2023.5.0-py3-none-any.whl (160 k

In [5]:
import torch
import random
import numpy as np
import pandas as pd
import os
import json
from sklearn import preprocessing
from typing import List, Dict
from datasets import load_dataset
import numpy as np
import random
import collections
import os
import json
from typing import List, Dict

def set_random_seed(random_seed):
    """Seed the Python, NumPy and PyTorch RNGs and set the cuDNN flags.

    Args:
        random_seed: Seed value passed to every generator below.
    """
    # Python's and NumPy's global generators.
    random.seed(random_seed)
    np.random.seed(random_seed)

    # PyTorch: the global seed, then the current CUDA device explicitly.
    # (torch.cuda.manual_seed_all(random_seed) is the multi-GPU variant.)
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(random_seed)

    # cuDNN: deterministic mode on, benchmark mode off.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True


set_random_seed(42)

### Generate 10 versions of each dataset by varying the random seed.

In [10]:
# Dataset: banking77 (fetched below with load_dataset("banking77"))

# Root directory for generated files and the sub-directory name for this dataset.
OUTPUT_DIR = './data'
DATA_NM = 'bank'

def get_jsonl_data(jsonl_path: str):
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        jsonl_path: Path of the file to read as UTF-8; must end with ".jsonl".

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        AssertionError: If ``jsonl_path`` does not end with ".jsonl".
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    assert jsonl_path.endswith(".jsonl")
    out = list()
    with open(jsonl_path, 'r', encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            # Blank lines (e.g. an extra newline at the end of the file) hold
            # no record, and json.loads("") would raise -- skip them.
            if not line:
                continue
            out.append(json.loads(line))
    return out

def raw_data_to_dict(data, shuffle=True):
    """Bucket records by their ``'label'`` value.

    Args:
        data: Iterable of mappings, each providing a ``'label'`` key.
        shuffle: When true, every bucket is shuffled in place with
            ``random.shuffle``.

    Returns:
        A plain dict mapping each label to the list of records carrying it,
        with labels in order of first appearance.
    """
    buckets = {}
    for record in data:
        buckets.setdefault(record['label'], []).append(record)
    if not shuffle:
        return buckets
    for records in buckets.values():
        random.shuffle(records)
    return buckets

def dir_check(path):
    """Ensure ``path`` and its ``full`` sub-directory both exist.

    Args:
        path: Directory to create (including missing parents) if absent.
    """
    # exist_ok=True replaces the exists()-then-makedirs() pattern, which can
    # fail if the directory appears between the check and the creation.
    os.makedirs(path, exist_ok=True)
    os.makedirs(os.path.join(path, 'full'), exist_ok=True)


def get_pandas_data(df, text_col='text', label_col='label'):
    """Convert DataFrame rows into record dicts ready for JSONL export.

    Args:
        df: DataFrame holding the text and label columns.
        text_col: Name of the column with the sentence text.
        label_col: Name of the column with the class label.

    Returns:
        A list with one dict per row, in row order, with keys ``"global_ix"``
        (the row's index value), ``"sentence"`` and ``"label"`` (the label
        converted with ``str``).
    """
    # Walking the columns in parallel avoids one ``.loc`` lookup per cell and
    # stays correct when the index contains duplicate values (where
    # ``df.loc[idx, col]`` would return a Series instead of a scalar).
    return [
        {"global_ix": idx, "sentence": sentence, "label": str(label)}
        for idx, sentence, label in zip(df.index, df[text_col], df[label_col])
    ]

def write_jsonl_data(jsonl_data: List[Dict], jsonl_path: str, force=False):
    """Write records to ``jsonl_path`` as JSON Lines (one object per line).

    Args:
        jsonl_data: Records to serialize with ``json.dumps``.
        jsonl_path: Destination file path.
        force: Overwrite an existing file when true.

    Raises:
        FileExistsError: If ``jsonl_path`` exists and ``force`` is false.
    """
    if os.path.exists(jsonl_path) and not force:
        raise FileExistsError(jsonl_path)
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # is pinned to UTF-8 rather than left to the platform default.
    with open(jsonl_path, 'w', encoding='utf-8') as file:
        for line in jsonl_data:
            file.write(json.dumps(line, ensure_ascii=False) + '\n')

def write_full_txt_data(jsonl_data: List[Dict], txt_path: str, force=False):
    """Write the ``'sentence'`` field of every record, one per line.

    Args:
        jsonl_data: Records, each providing a string under ``'sentence'``.
        txt_path: Destination text file path.
        force: Overwrite an existing file when true.

    Raises:
        FileExistsError: If ``txt_path`` exists and ``force`` is false.
    """
    if os.path.exists(txt_path) and not force:
        raise FileExistsError(txt_path)
    # Sentences may contain non-ASCII characters; pin the encoding to UTF-8
    # rather than relying on the platform default.
    with open(txt_path, 'w', encoding='utf-8') as file:
        for line in jsonl_data:
            file.write(line['sentence'] + '\n')

            

dataset = load_dataset("banking77")

train_df = pd.DataFrame(dataset["train"])
test_df = pd.DataFrame(dataset["test"])
# Pool the dataset's train and test splits: the split made below is by class.
df = pd.concat([train_df, test_df], ignore_index=True)

# Hold out 30 randomly sampled class ids (out of 0..76) for evaluation;
# the remaining ids stay in the training set.
train_classes = list(range(77))
eval_classes = random.sample(train_classes, 30)
train_classes = [c for c in train_classes if c not in eval_classes]

train_df = df.loc[df['label'].isin(train_classes)]
eval_df = df.loc[df['label'].isin(eval_classes)]

# Sanity check: no label may appear on both sides of the split.
train_cls = set(train_df['label'])
eval_cls = set(eval_df['label'])
print(f"# of train classes is {len(train_cls)}")
print(f"# of test classes is {len(eval_cls)}")

if train_cls.isdisjoint(eval_cls):
    print("OK: train dataset and test dataset are disjoint")
else:
    print("WARN: train dataset and test dataset are not disjoint")

# Create the directory the files below are written to
# (OUTPUT_DIR/DATA_NM and its 'full' sub-directory).
dir_check(OUTPUT_DIR + '/' + DATA_NM)

out = get_pandas_data(train_df)
write_jsonl_data(out, OUTPUT_DIR + '/' + DATA_NM + '/train.jsonl', force=True)
write_full_txt_data(out, OUTPUT_DIR + '/' + DATA_NM + '/full/full-train.txt', force=True)
print("Making train data is done")


out = get_pandas_data(eval_df)
write_jsonl_data(out, OUTPUT_DIR + '/' + DATA_NM + '/test.jsonl', force=True)
write_full_txt_data(out, OUTPUT_DIR + '/' + DATA_NM + '/full/full-test.txt', force=True)
print("Making test data is done")

    


Found cached dataset banking77 (/home/qonejung/.cache/huggingface/datasets/banking77/default/1.1.0/9898c11f6afa9521953d2ef205667b527bad14ef9cab445d470f16240c8c8ec4)
100%|██████████| 2/2 [00:00<00:00, 517.50it/s]


# of train classes is 47
# of test classes is 30
OK: train dataset and test dataset are disjoint
Making train data is done
Making test data is done


In [80]:
# Dataset: Medium post titles (93 categories)
# source: ./raw_data/medium_post_titles.csv -- TODO: record where this file was obtained
OUTPUT_DIR = './data'
DATA_NM = 'medium'

def dir_check(path):
    """Ensure ``path`` and its ``full`` sub-directory both exist.

    Args:
        path: Directory to create (including missing parents) if absent.
    """
    # exist_ok=True replaces the exists()-then-makedirs() pattern, which can
    # fail if the directory appears between the check and the creation.
    os.makedirs(path, exist_ok=True)
    os.makedirs(os.path.join(path, 'full'), exist_ok=True)


def get_pandas_data(df, text_col='title', label_col='category'):
    """Convert DataFrame rows into record dicts ready for JSONL export.

    Args:
        df: DataFrame holding the text and label columns.
        text_col: Name of the column with the sentence text.
        label_col: Name of the column with the class label.

    Returns:
        A list with one dict per row, in row order, with keys ``"global_ix"``
        (the row's index value), ``"sentence"`` and ``"label"`` (the label
        converted with ``str``).
    """
    # Walking the columns in parallel avoids one ``.loc`` lookup per cell and
    # stays correct when the index contains duplicate values (where
    # ``df.loc[idx, col]`` would return a Series instead of a scalar).
    return [
        {"global_ix": idx, "sentence": sentence, "label": str(label)}
        for idx, sentence, label in zip(df.index, df[text_col], df[label_col])
    ]

def write_jsonl_data(jsonl_data: List[Dict], jsonl_path: str, force=False):
    """Write records to ``jsonl_path`` as JSON Lines (one object per line).

    Args:
        jsonl_data: Records to serialize with ``json.dumps``.
        jsonl_path: Destination file path.
        force: Overwrite an existing file when true.

    Raises:
        FileExistsError: If ``jsonl_path`` exists and ``force`` is false.
    """
    if os.path.exists(jsonl_path) and not force:
        raise FileExistsError(jsonl_path)
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # is pinned to UTF-8 rather than left to the platform default.
    with open(jsonl_path, 'w', encoding='utf-8') as file:
        for line in jsonl_data:
            file.write(json.dumps(line, ensure_ascii=False) + '\n')

def write_full_txt_data(jsonl_data: List[Dict], txt_path: str, force=False):
    """Write the ``'sentence'`` field of every record, one per line.

    Args:
        jsonl_data: Records, each providing a string under ``'sentence'``.
        txt_path: Destination text file path.
        force: Overwrite an existing file when true.

    Raises:
        FileExistsError: If ``txt_path`` exists and ``force`` is false.
    """
    if os.path.exists(txt_path) and not force:
        raise FileExistsError(txt_path)
    # Sentences may contain non-ASCII characters; pin the encoding to UTF-8
    # rather than relying on the platform default.
    with open(txt_path, 'w', encoding='utf-8') as file:
        for line in jsonl_data:
            file.write(line['sentence'] + '\n')

            
df = pd.read_csv('./raw_data/medium_post_titles.csv')

# Replace the category values with integer ids 0..n_classes-1.
le = preprocessing.LabelEncoder()
df['category'] = le.fit_transform(df.category)

# Take the class ids from the fitted encoder instead of hard-coding the
# count, so the split stays correct if the CSV's category set changes.
# 30 randomly sampled ids are held out for evaluation.
train_classes = list(range(len(le.classes_)))
eval_classes = random.sample(train_classes, 30)
train_classes = [c for c in train_classes if c not in eval_classes]

train_df = df.loc[df['category'].isin(train_classes)]
eval_df = df.loc[df['category'].isin(eval_classes)]

# Sanity check: no category may appear on both sides of the split.
train_cls = set(train_df['category'])
eval_cls = set(eval_df['category'])
print(f"# of train classes is {len(train_cls)}")
print(f"# of test classes is {len(eval_cls)}")
if train_cls.isdisjoint(eval_cls):
    print("OK: train dataset and test dataset are disjoint")
else:
    print("WARN: train dataset and test dataset are not disjoint")

dir_check(OUTPUT_DIR + '/' + DATA_NM)

out = get_pandas_data(train_df)
write_jsonl_data(out, OUTPUT_DIR + '/' + DATA_NM + '/train.jsonl', force=True)
write_full_txt_data(out, OUTPUT_DIR + '/' + DATA_NM + '/full/full-train.txt', force=True)
print("Making train data is done")


out = get_pandas_data(eval_df)
write_jsonl_data(out, OUTPUT_DIR + '/' + DATA_NM + '/test.jsonl', force=True)
write_full_txt_data(out, OUTPUT_DIR + '/' + DATA_NM + '/full/full-test.txt', force=True)
print("Making test data is done")

# of train classes is 63
# of train classes is 30
WARN: train dataset and test dataset are not disjoint
Making train data is done
Making test data is done


In [120]:
# Dataset: 2020 R&D project dataset (133 research-area classes after filtering)
# source: NTIS (./raw_data/sh_pjt_en_2020.xlsx)
OUTPUT_DIR = './data'
DATA_NM = 'rnd'

def dir_check(path):
    """Ensure ``path`` and its ``full`` sub-directory both exist.

    Args:
        path: Directory to create (including missing parents) if absent.
    """
    # exist_ok=True replaces the exists()-then-makedirs() pattern, which can
    # fail if the directory appears between the check and the creation.
    os.makedirs(path, exist_ok=True)
    os.makedirs(os.path.join(path, 'full'), exist_ok=True)


def get_pandas_data(df, text_col='ENG_PJT_NM', label_col='RSCH_AREA_CLS1_CD'):
    """Convert DataFrame rows into record dicts ready for JSONL export.

    Args:
        df: DataFrame holding the text and label columns.
        text_col: Name of the column with the sentence text.
        label_col: Name of the column with the class label.

    Returns:
        A list with one dict per row, in row order, with keys ``"global_ix"``
        (the row's index value), ``"sentence"`` and ``"label"`` (the label
        converted with ``str``).
    """
    # Walking the columns in parallel avoids one ``.loc`` lookup per cell and
    # stays correct when the index contains duplicate values (where
    # ``df.loc[idx, col]`` would return a Series instead of a scalar).
    return [
        {"global_ix": idx, "sentence": sentence, "label": str(label)}
        for idx, sentence, label in zip(df.index, df[text_col], df[label_col])
    ]

def write_jsonl_data(jsonl_data: List[Dict], jsonl_path: str, force=False):
    """Write records to ``jsonl_path`` as JSON Lines (one object per line).

    Args:
        jsonl_data: Records to serialize with ``json.dumps``.
        jsonl_path: Destination file path.
        force: Overwrite an existing file when true.

    Raises:
        FileExistsError: If ``jsonl_path`` exists and ``force`` is false.
    """
    if os.path.exists(jsonl_path) and not force:
        raise FileExistsError(jsonl_path)
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # is pinned to UTF-8 rather than left to the platform default.
    with open(jsonl_path, 'w', encoding='utf-8') as file:
        for line in jsonl_data:
            file.write(json.dumps(line, ensure_ascii=False) + '\n')

def write_full_txt_data(jsonl_data: List[Dict], txt_path: str, force=False):
    """Write the ``'sentence'`` field of every record, one per line.

    Args:
        jsonl_data: Records, each providing a string under ``'sentence'``.
        txt_path: Destination text file path.
        force: Overwrite an existing file when true.

    Raises:
        FileExistsError: If ``txt_path`` exists and ``force`` is false.
    """
    if os.path.exists(txt_path) and not force:
        raise FileExistsError(txt_path)
    # Sentences may contain non-ASCII characters; pin the encoding to UTF-8
    # rather than relying on the platform default.
    with open(txt_path, 'w', encoding='utf-8') as file:
        for line in jsonl_data:
            file.write(line['sentence'] + '\n')

            
df = pd.read_excel('./raw_data/sh_pjt_en_2020.xlsx', engine='openpyxl')
# Keep only rows whose research-area code occurs more than 50 times.
# .copy() makes the filtered frame independent, so the column assignment
# below is not a write into a slice of the frame that was read.
df = df[df['RSCH_AREA_CLS1_CD'].map(df['RSCH_AREA_CLS1_CD'].value_counts()) > 50].copy()

# Replace the research-area codes with integer ids 0..n_classes-1.
le = preprocessing.LabelEncoder()
df['RSCH_AREA_CLS1_CD'] = le.fit_transform(df.RSCH_AREA_CLS1_CD)

# Take the class ids from the fitted encoder instead of hard-coding the
# count, so the split stays correct if the filtered class set changes.
# 30 randomly sampled ids are held out for evaluation.
train_classes = list(range(len(le.classes_)))
eval_classes = random.sample(train_classes, 30)
train_classes = [c for c in train_classes if c not in eval_classes]

train_df = df.loc[df['RSCH_AREA_CLS1_CD'].isin(train_classes)]
eval_df = df.loc[df['RSCH_AREA_CLS1_CD'].isin(eval_classes)]

# Sanity check: no class may appear on both sides of the split.
train_cls = set(train_df['RSCH_AREA_CLS1_CD'])
eval_cls = set(eval_df['RSCH_AREA_CLS1_CD'])
print(f"# of train classes is {len(train_cls)}")
print(f"# of test classes is {len(eval_cls)}")
if train_cls.isdisjoint(eval_cls):
    print("OK: train dataset and test dataset are disjoint")
else:
    print("WARN: train dataset and test dataset are not disjoint")

dir_check(OUTPUT_DIR + '/' + DATA_NM)

out = get_pandas_data(train_df)
write_jsonl_data(out, OUTPUT_DIR + '/' + DATA_NM + '/train.jsonl', force=True)
write_full_txt_data(out, OUTPUT_DIR + '/' + DATA_NM + '/full/full-train.txt', force=True)
print("Making train data is done")


out = get_pandas_data(eval_df)
write_jsonl_data(out, OUTPUT_DIR + '/' + DATA_NM + '/test.jsonl', force=True)
write_full_txt_data(out, OUTPUT_DIR + '/' + DATA_NM + '/full/full-test.txt', force=True)
print("Making test data is done")

# of train classes is 103
# of train classes is 30
OK: train dataset and test dataset are disjoint
Making train data is done
Making test data is done


In [None]:
# Dataset: Web of Science
# (fetched below with load_dataset("web_of_science", 'WOS46985'))

OUTPUT_DIR = './data'
DATA_NM = 'wos'

def dir_check(path):
    """Ensure ``path`` and its ``full`` sub-directory both exist.

    Args:
        path: Directory to create (including missing parents) if absent.
    """
    # exist_ok=True replaces the exists()-then-makedirs() pattern, which can
    # fail if the directory appears between the check and the creation.
    os.makedirs(path, exist_ok=True)
    os.makedirs(os.path.join(path, 'full'), exist_ok=True)


def get_pandas_data(df, text_col='input_data', label_col='label'):
    """Convert DataFrame rows into record dicts ready for JSONL export.

    Args:
        df: DataFrame holding the text and label columns.
        text_col: Name of the column with the sentence text.
        label_col: Name of the column with the class label.

    Returns:
        A list with one dict per row, in row order, with keys ``"global_ix"``
        (the row's index value), ``"sentence"`` and ``"label"`` (the label
        converted with ``str``).
    """
    # Walking the columns in parallel avoids one ``.loc`` lookup per cell and
    # stays correct when the index contains duplicate values (where
    # ``df.loc[idx, col]`` would return a Series instead of a scalar).
    return [
        {"global_ix": idx, "sentence": sentence, "label": str(label)}
        for idx, sentence, label in zip(df.index, df[text_col], df[label_col])
    ]

def write_jsonl_data(jsonl_data: List[Dict], jsonl_path: str, force=False):
    """Write records to ``jsonl_path`` as JSON Lines (one object per line).

    Args:
        jsonl_data: Records to serialize with ``json.dumps``.
        jsonl_path: Destination file path.
        force: Overwrite an existing file when true.

    Raises:
        FileExistsError: If ``jsonl_path`` exists and ``force`` is false.
    """
    if os.path.exists(jsonl_path) and not force:
        raise FileExistsError(jsonl_path)
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # is pinned to UTF-8 rather than left to the platform default.
    with open(jsonl_path, 'w', encoding='utf-8') as file:
        for line in jsonl_data:
            file.write(json.dumps(line, ensure_ascii=False) + '\n')

def write_full_txt_data(jsonl_data: List[Dict], txt_path: str, force=False):
    """Write the ``'sentence'`` field of every record, one per line.

    Args:
        jsonl_data: Records, each providing a string under ``'sentence'``.
        txt_path: Destination text file path.
        force: Overwrite an existing file when true.

    Raises:
        FileExistsError: If ``txt_path`` exists and ``force`` is false.
    """
    if os.path.exists(txt_path) and not force:
        raise FileExistsError(txt_path)
    # Sentences may contain non-ASCII characters; pin the encoding to UTF-8
    # rather than relying on the platform default.
    with open(txt_path, 'w', encoding='utf-8') as file:
        for line in jsonl_data:
            file.write(line['sentence'] + '\n')

            

dataset = load_dataset("web_of_science", 'WOS46985')

df = pd.DataFrame(dataset["train"])


# Sort the distinct labels before sampling: set iteration order is not
# guaranteed, and random.sample picks by position, so an unordered
# population would make the seeded split non-reproducible.
# 30 randomly sampled labels are held out for evaluation.
train_classes = sorted(set(df.label))
eval_classes = random.sample(train_classes, 30)
train_classes = [c for c in train_classes if c not in eval_classes]

train_df = df.loc[df['label'].isin(train_classes)]
eval_df = df.loc[df['label'].isin(eval_classes)]

# Sanity check: no label may appear on both sides of the split.
train_cls = set(train_df['label'])
eval_cls = set(eval_df['label'])
print(f"# of train classes is {len(train_cls)}")
print(f"# of test classes is {len(eval_cls)}")
if train_cls.isdisjoint(eval_cls):
    print("OK: train dataset and test dataset are disjoint")
else:
    print("WARN: train dataset and test dataset are not disjoint")

dir_check(OUTPUT_DIR + '/' + DATA_NM)

out = get_pandas_data(train_df)
write_jsonl_data(out, OUTPUT_DIR + '/' + DATA_NM + '/train.jsonl', force=True)
write_full_txt_data(out, OUTPUT_DIR + '/' + DATA_NM + '/full/full-train.txt', force=True)
print("Making train data is done")


out = get_pandas_data(eval_df)
write_jsonl_data(out, OUTPUT_DIR + '/' + DATA_NM + '/test.jsonl', force=True)
write_full_txt_data(out, OUTPUT_DIR + '/' + DATA_NM + '/full/full-test.txt', force=True)
print("Making test data is done")