# ChildGPT-3_BiClActPsv
https://www.kaggle.com/baekseungyun/gpt-2-with-huggingface-pytorch<br>


***troubleshooting***

https://github.com/jeffheaton/t81_558_deep_learning

https://www.youtube.com/watch?v=VEDy-c5Sk8Y

https://www.youtube.com/watch?v=o4-bI_iZKPA

https://github.com/jeffheaton/app_deep_learning/blob/main/install/pytorch-install-aug-2023.ipynb

https://www.linkedin.com/pulse/how-use-gpu-tensorflow-pytorch-libraries-macbook-pro-m2apple-kashyap

https://medium.com/mlearning-ai/mac-m1-m2-gpu-support-in-pytorch-a-step-forward-but-slower-than-conventional-nvidia-gpu-40be9293b898

https://developer.apple.com/metal/pytorch/

https://www.youtube.com/watch?v=Zx2MHdRgAIc

https://youtube.com/watch?v=mS2X1QmIUCI


In [None]:
import tensorflow as tf
import torch
import pandas as pd
import numpy as np
import random
import time
import datetime
import re
import os

from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import set_seed, GPT2LMHeadModel, PreTrainedTokenizerFast, GPT2ForSequenceClassification, GPT2Config
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from accelerate import init_empty_weights

In [None]:
### GPU check

# Report the PyTorch build and which accelerator backends it can see.
print(torch.__version__)

print("mps available?", torch.backends.mps.is_available())
print("mps built?", torch.backends.mps.is_built())
print("CUDA enabled?", torch.cuda.is_available())

if torch.backends.mps.is_available():

    # Let PyTorch use the MPS backend and prove it works by allocating a tensor there.
    mps_device = torch.device("mps")
    # Bind `device` in this branch too: previously only the CPU branch defined it,
    # so the name did not exist after this cell on an MPS machine.
    device = mps_device
    x = torch.ones(1, device = mps_device)
    print (x)

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Independently report what TensorFlow sees (GPU device name and GPU count).
device_name = tf.test.gpu_device_name()
print(device_name)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
### Parameter setting

# --- Fixed for every run ---------------------------------------------------
setSeed = 42          # not referenced by the visible training code
setEpsilon = 1e-8     # passed to the optimizer as `eps`
setEpoch = 10         # number of epochs per run
labelNumber = 2       # not referenced by the visible training code
setTry = 31           # exclusive upper bound of range(1, setTry), i.e. tries 1..30

# --- Grid: every combination of the values below is trained ----------------
setLearningRate = [1e-3, 1e-4]
setBatch = [16, 64]
setMaxLength = [64, 256]                  # tokenizer max_length
setPatching = [0, 0.25, 0.5, 0.75, 1]     # fraction of CHILDES lines sampled for vocabulary patching
setFineTuning = [0.25, 0.5, 0.75, 1]      # fraction of training rows sampled

In [None]:
#print(torch.mps.current_allocated_memory)
#print(torch.mps.driver_allocated_memory)

In [None]:
#PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0

In [None]:
### Modelling

for currentLearningRate in setLearningRate:

    for currentBatch in setBatch:

        for currentMaxLength in setMaxLength:

            for currentPatching in setPatching:

                for currentFineTuning in setFineTuning:

                    for currentTry in range(1,setTry):

                        ### 1. load model & tokeniser (https://github.com/SKT-AI/KoGPT2)

                        # The tokenizer and model are re-loaded on every iteration, so each
                        # run starts from the same pretrained checkpoint.
                        tokenizer = AutoTokenizer.from_pretrained(
                            'kakaobrain/kogpt', revision='KoGPT6B-ryan1.5b', #revision='KoGPT6B-ryan1.5b-float16',  # or float32 version: revision=KoGPT6B-ryan1.5b
                            bos_token='[BOS]', eos_token='[EOS]', unk_token='[UNK]', pad_token='[PAD]', mask_token='[MASK]'
                        )
                        # Fall back to the CPU when MPS is unavailable instead of failing on a
                        # hard-coded 'mps' target (same rule the run section uses later).
                        loadDevice = 'mps' if torch.backends.mps.is_available() else 'cpu'
                        model = AutoModelForSequenceClassification.from_pretrained(
                            'kakaobrain/kogpt', revision='KoGPT6B-ryan1.5b', #revision='KoGPT6B-ryan1.5b-float16',  # or float32 version: revision=KoGPT6B-ryan1.5b
                            pad_token_id=tokenizer.eos_token_id, torch_dtype='auto', low_cpu_mem_usage=True).to(device=loadDevice, non_blocking=True)
                        _ = model.eval()

                        tokenizer.padding_side = "left" # Very Important
                        # The EOS token doubles as the padding token.
                        tokenizer.pad_token = tokenizer.eos_token

                        print("Before patching: ",len(tokenizer.get_vocab()))

                        # Read the CHILDES corpus; its lines are sampled further down to
                        # extend the tokenizer vocabulary ("patching").
                        entireCorpus = "./Data/CHILDES/CHILDES_ALL.txt"
                        # Context manager closes the file even if reading fails; the explicit
                        # encoding keeps the Korean text independent of the platform default.
                        with open(entireCorpus, 'r', encoding='utf-8') as entireFr:
                            entireContents = entireFr.readlines()


                        def RandomSelect(dataList, prob):
                            """Return a random subset of ``dataList`` drawn without replacement.

                            Args:
                                dataList: list of items to sample from. It is left unchanged
                                    (the previous implementation deleted the chosen items
                                    from the caller's list).
                                prob: fraction of the list to keep; ``int(len(dataList) * prob)``
                                    items are selected. Values below 0 or above 1 are clamped
                                    so that between 0 and ``len(dataList)`` items are returned
                                    (previously ``prob > 1`` raised ``ValueError``).

                            Returns:
                                A new list with the selected items in random order.
                            """
                            dataSize = int(len(dataList) * prob)
                            dataSize = max(0, min(len(dataList), dataSize))

                            # random.sample draws without replacement in one call, replacing the
                            # quadratic "pick an index, then delete it" loop.
                            return random.sample(dataList, dataSize)


                        # --- Vocabulary patching -------------------------------------------
                        # Sample a fraction of the corpus lines, count the Hangul words in the
                        # sample, and add every word seen more than once to the tokenizer.
                        entireContentRD = RandomSelect(entireContents, float(currentPatching))

                        # Compiled once, outside the per-line loop.
                        nonHangul = re.compile('[^가-힣]')

                        wordDic = {}
                        for entireContent in entireContentRD:
                            # Every character that is not a Hangul syllable (whitespace and
                            # newlines included) becomes a space; split() then yields only
                            # the non-empty words, so no whitespace-collapsing pass is needed.
                            for each in nonHangul.sub(' ', entireContent).split():
                                wordDic[each] = wordDic.get(each, 0) + 1

                        # Most frequent first. The sort is stable, so ties keep first-seen order
                        # (before, tie order depended on set iteration order).
                        wordDicSorted = dict(sorted(wordDic.items(), key=lambda x: x[1], reverse=True))

                        # Only words occurring more than once are added, in a single call
                        # instead of one add_tokens call per word.
                        newTokens = [key for key, value in wordDicSorted.items() if value > 1]
                        if newTokens:
                            tokenizer.add_tokens(newTokens)

                        print("After patching: ",len(tokenizer.get_vocab()))

                        # Grow the embedding matrix to cover the added tokens.
                        model.resize_token_embeddings(len(tokenizer))
                        model.config.pad_token_id = model.config.eos_token_id


                        ### 2. Build Dataset
                        # Both CSV files are split on "," by hand, so a field that itself
                        # contains a comma would produce extra columns.
                        trainCorpus = "./Data/trainBi_20231107.csv"
                        # Context manager closes the file even if reading fails; the explicit
                        # encoding keeps the Korean text independent of the platform default.
                        with open(trainCorpus, 'r', encoding='utf-8') as trainFr:
                            trainContents = trainFr.readlines()

                        # Remove the header row (column names) before sampling the data rows.
                        trainVar = trainContents[0]
                        del trainContents[0]
                        # Keep only a random fraction of the training rows for this run.
                        trainContentRD = RandomSelect(trainContents, float(currentFineTuning))

                        train_List = [eachContent.replace("\n","").split(",") for eachContent in trainContentRD]

                        trainPD = pd.DataFrame(train_List)
                        trainPD.columns = trainVar.replace("\n","").split(",")



                        testCorpus = "./Data/testBi_20231107.csv"
                        with open(testCorpus, 'r', encoding='utf-8') as testFr:
                            testContents = testFr.readlines()

                        # Remove the header row (column names); the test set is used in full.
                        testVar = testContents[0]
                        del testContents[0]

                        test_List = [eachContent.replace("\n","").split(",") for eachContent in testContents]

                        testPD = pd.DataFrame(test_List)
                        testPD.columns = testVar.replace("\n","").split(",")


                        ### 2. Build Dataset

                        class Dataset(torch.utils.data.Dataset):
                            """Map-style dataset over a DataFrame with 'Sentence' and 'Label' columns.

                            The base class is named explicitly. This statement runs again on
                            every loop iteration, and ``class Dataset(Dataset)`` would inherit
                            from whatever ``Dataset`` currently refers to -- after the first
                            iteration, the previous copy of this class -- adding one inheritance
                            level per run. The class name is kept, so existing
                            ``Dataset("train", ...)`` calls are unaffected.

                            Args:
                                Dtype: "train" to return the real label; any other value makes
                                    every item carry the placeholder label '0'.
                                dataIn: DataFrame indexed positionally via ``.iloc``.
                            """

                            def __init__(self, Dtype, dataIn):
                                self.Dtype = Dtype
                                self.data = dataIn

                            def __len__(self):
                                return len(self.data)

                            def __getitem__(self, index):
                                """Return ``{'Sentence': ..., 'label': ...}`` for the row at ``index``."""
                                record = self.data.iloc[index]
                                # Non-train items get the placeholder '0', not the file's label.
                                label = record['Label'] if self.Dtype == "train" else '0'
                                return {'Sentence': record['Sentence'], 'label': label}

                        # Wrap both frames: "train" items carry the label from the frame,
                        # "test" items always carry the placeholder label '0'.
                        train_dataset = Dataset("train", trainPD)
                        test_dataset = Dataset("test", testPD)



                        ### 3. Data Collator

                        class Gpt3ClassificationCollator:
                            """Collate ``{'Sentence', 'label'}`` items into one tokenized batch.

                            Args:
                                tokenizer: callable invoked with ``text=``, ``return_tensors='pt'``,
                                    ``padding=True``, ``truncation=True`` and ``max_length=``.
                                max_seq_len: forwarded to the tokenizer as ``max_length``.
                            """

                            def __init__(self, tokenizer, max_seq_len=None):
                                self.max_seq_len = max_seq_len
                                self.tokenizer = tokenizer

                            def __call__(self, sequences):
                                """Tokenize the sentences and attach the integer labels as 'labels'."""
                                texts = []
                                for item in sequences:
                                    texts.append(item['Sentence'])

                                # Labels may arrive as strings; they are converted with int().
                                labels = []
                                for item in sequences:
                                    labels.append(int(item['label']))

                                encoded = self.tokenizer(
                                    text=texts,
                                    return_tensors='pt',
                                    padding=True,
                                    truncation=True,
                                    max_length=self.max_seq_len,
                                )
                                label_tensor = torch.tensor(labels)
                                encoded.update({'labels': label_tensor})
                                return encoded

                        gpt3classificationcollator = Gpt3ClassificationCollator(tokenizer=tokenizer, max_seq_len=currentMaxLength)


                        ### 4. DataLoader

                        # Hold out 20% of the sampled training rows for validation.
                        total_rows = len(train_dataset)
                        train_size = int(total_rows * 0.8)
                        val_size = total_rows - train_size
                        train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])

                        # All three loaders share batch size and collator; only the training
                        # loader shuffles.
                        loader_options = {'batch_size': currentBatch, 'collate_fn': gpt3classificationcollator}
                        train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, **loader_options)
                        val_dataloader = DataLoader(dataset=val_dataset, shuffle=False, **loader_options)
                        test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, **loader_options)


                        ### 5. Optimiser & Lr Scheduler

                        total_epochs = setEpoch

                        # Parameters whose name contains one of these substrings get no weight decay.
                        # NOTE(review): verify these substrings match this model's parameter names.
                        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
                        decayed_params = []
                        undecayed_params = []
                        for n, p in model.named_parameters():
                            if any(nd in n for nd in no_decay):
                                undecayed_params.append(p)
                            else:
                                decayed_params.append(p)

                        optimizer_grouped_parameters = [
                            {'params': decayed_params, 'weight_decay': 0.01},
                            {'params': undecayed_params, 'weight_decay': 0.0},
                        ]
                        optimizer = AdamW(optimizer_grouped_parameters, lr=currentLearningRate, eps=setEpsilon)

                        # One scheduler step per batch; the first 10% of the steps are warm-up.
                        num_train_steps = len(train_dataloader) * total_epochs
                        num_warmup_steps = int(num_train_steps * 0.1)

                        lr_scheduler = get_cosine_schedule_with_warmup(
                            optimizer,
                            num_warmup_steps=num_warmup_steps,
                            num_training_steps=num_train_steps,
                        )


                        ###6. Train & Validation

                        def train(dataloader, optimizer, scheduler, device_):
                            """Train the global ``model`` for one pass over ``dataloader``.

                            Args:
                                dataloader: yields dict batches that include a 'labels' tensor.
                                optimizer: stepped once per batch.
                                scheduler: learning-rate scheduler, stepped once per batch.
                                device_: device every batch tensor is moved to.

                            Returns:
                                ``(true_labels, prediction_labels, total_loss)``: flat lists of gold
                                labels and arg-max predictions, and the list of per-batch losses.
                            """
                            global model
                            model.train()

                            gold, guessed, batch_losses = [], [], []

                            for batch in dataloader:
                                # Gold labels are read before the batch is moved to the device.
                                gold.extend(batch['labels'].numpy().flatten().tolist())
                                # Every tensor in the batch is cast to long and moved to the device.
                                batch = {name: tensor.type(torch.long).to(device_) for name, tensor in batch.items()}

                                outputs = model(**batch)
                                loss, logits = outputs[:2]
                                batch_losses.append(loss.item())
                                logits = logits.detach().cpu().numpy()

                                optimizer.zero_grad()
                                loss.backward()
                                # Clip the gradient norm to 1.0 to guard against exploding gradients.
                                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                                optimizer.step()
                                scheduler.step()

                                guessed.extend(logits.argmax(axis=-1).flatten().tolist())

                            return gold, guessed, batch_losses

                        def validation(dataloader, device_):
                            """Evaluate the global ``model`` on ``dataloader`` without gradient tracking.

                            Args:
                                dataloader: yields dict batches that include a 'labels' tensor.
                                device_: device every batch tensor is moved to.

                            Returns:
                                ``(true_labels, prediction_labels, total_loss, outputs,
                                embedding_outputs)``: gold labels, arg-max predictions, per-batch
                                losses, the model output of the last batch (an empty list when
                                the dataloader yields nothing), and the logits of every example
                                as nested lists.
                            """
                            global model
                            model.eval()

                            gold, guessed, batch_losses, all_logits = [], [], [], []
                            outputs = []

                            for batch in dataloader:
                                # Gold labels are read before the batch is moved to the device.
                                gold.extend(batch['labels'].numpy().flatten().tolist())
                                batch = {name: tensor.type(torch.long).to(device_) for name, tensor in batch.items()}

                                with torch.no_grad():
                                    outputs = model(**batch)

                                loss, logits = outputs[:2]
                                logits = logits.detach().cpu().numpy()
                                batch_losses.append(loss.item())

                                guessed.extend(logits.argmax(axis=-1).flatten().tolist())
                                all_logits.extend(logits.tolist())

                            return gold, guessed, batch_losses, outputs, all_logits

                        def outreault(guess):
                            """Translate a predicted class id into its construction name.

                            ``guess`` is converted with ``int()``; 0 maps to "agent-first",
                            1 to "theme-first", and any other integer to the empty string.
                            """
                            constructions = {0: "agent-first", 1: "theme-first"}
                            return constructions.get(int(guess), "")


                        ### 7. Run

                        # One result file per hyper-parameter combination and try.
                        outDir = "./Output/GPT/gpt3Bi_actpsv_LR" + str(currentLearningRate) + "_Batch" + str(currentBatch) + "_SL" + str(currentMaxLength) + "_PC" + str(currentPatching) + "_FT" + str(currentFineTuning) + "_T" + str(currentTry) + ".csv"

                        device = 'mps' if torch.backends.mps.is_available() else 'cpu'
                        model.to(device)

                        # Per-batch losses and per-epoch accuracies, accumulated across epochs.
                        all_loss = {'train_loss': [], 'val_loss': []}
                        all_acc = {'train_acc': [], 'val_acc': []}
                        outputs = []

                        # The test file does not change between epochs, so it is read, parsed
                        # and cleaned once here rather than inside the epoch loop.
                        testFileDir = fileDir = "./Data/testBi_20231107.csv"
                        with open(testFileDir, 'r', encoding='utf-8') as testFr:
                            testContents = testFr.readlines()

                        # Skip the header row; keep only the first two comma-separated fields.
                        testRows = []
                        for content in testContents[1:]:
                            infos = content.split(",")
                            testRows.append([int(infos[0]), infos[1].replace("\n", "")])
                        # Built in one step (index 1..n, as before) instead of growing the
                        # frame row by row with .loc.
                        test = pd.DataFrame(testRows, columns=['Label', 'Sentence'], index=range(1, len(testRows) + 1))

                        # Replace punctuation, tabs and newlines with a space. The patterns now
                        # use \n: in a raw string the former \\n matched a backslash or the
                        # letter "n" instead of a newline. Backslashes are still matched by the
                        # earlier \\ in the first character class.
                        test['Sentence'] = test['Sentence'].str.replace(r'[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》\n\t]+', " ", regex=True)
                        test['Sentence'] = test['Sentence'].str.replace(r'\t+', " ", regex=True)
                        test['Sentence'] = test['Sentence'].str.replace(r'\n+', " ", regex=True)

                        # import test sents
                        testSentences = test['Sentence']
                        testLabels = test['Label'].tolist()
                        testTexts = testSentences.tolist()

                        # The context manager closes the result file even if training raises.
                        with open(outDir, 'w', encoding='utf-8') as f:
                            f.write("epoch,sentence,originalLabel,predictedLabel,predictedConstruction,result"+"\n")

                            for epoch in range(total_epochs):

                                y, y_pred, train_loss = train(train_dataloader, optimizer, lr_scheduler, device)
                                train_acc = accuracy_score(y, y_pred)

                                y, y_pred, val_loss, outputs, logits_labels = validation(val_dataloader, device)
                                val_acc = accuracy_score(y, y_pred)

                                all_loss['train_loss'] += train_loss
                                all_loss['val_loss'] += val_loss

                                all_acc['train_acc'].append(train_acc)
                                all_acc['val_acc'].append(val_acc)

                                print('======== Epoch {:} / {:} ========'.format(epoch + 1, total_epochs))

                                print(f'Epoch: {epoch}, train_loss: {torch.tensor(train_loss).mean():.3f}, train_acc: {train_acc:.3f}, val_loss: {torch.tensor(val_loss).mean():.3f}, val_acc: {val_acc:.3f}')

                                # Predict the test set after every epoch. The test loader does
                                # not shuffle, so y_pred[i] belongs to the i-th test row.
                                y, y_pred, val_loss, outputs, logits_labels = validation(test_dataloader, device)

                                for each, (label, sentence) in enumerate(zip(testLabels, testTexts)):
                                    guess = str(y_pred[each])
                                    # Last column: 1 when the prediction equals the file's label.
                                    result = "1" if guess == str(label) else "0"
                                    f.write(",".join([str(epoch + 1), sentence, str(label), guess, outreault(guess), result]) + "\n")

                        print("Training complete!")