In [None]:
!pip install transformers datasets peft trl pandas torch accelerate scikit-learn bitsandbytes==0.39.0

import os
import torch
import pandas as pd
from huggingface_hub import HfFolder
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline, logging
from datasets import load_dataset, Dataset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
from sklearn.model_selection import train_test_split

# Hugging Face login: login() asks for the access token interactively.
# Never paste a token into the notebook source. The token that used to be written in
# this comment is exposed and must be revoked at https://huggingface.co/settings/tokens.
from huggingface_hub import login
login()


In [None]:
# Base checkpoint and the dtype used for 4-bit compute
model_id = "distilgpt2"
compute_dtype = torch.float16

# 4-bit NF4 quantization settings (double quantization disabled)
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

# Load the model with the quantization config; the device map sends everything to device 0
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    quantization_config=quant_config,
    device_map={"": 0},
)

# Tokenizer for the same checkpoint; the EOS token is reused as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Mount Google Drive, then read the ShareGPT JSON file and convert it to a DataFrame
from google.colab import drive
drive.mount('/content/gdrive')
sharegpt_path = "/content/gdrive/My Drive/shareGPT/ShareGPT_V3_unfiltered_cleaned_split.json"
dataset = load_dataset("json", data_files=sharegpt_path)
df = dataset["train"].to_pandas()


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Mounted at /content/gdrive


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Preview the first rows of the raw DataFrame
df.head(n=5)

Unnamed: 0,conversations,id
0,"[{'from': 'human', 'markdown': None, 'text': N...",QWJhYvA_0
1,"[{'from': 'human', 'markdown': None, 'text': N...",i6IyJda_0
2,"[{'from': 'human', 'markdown': None, 'text': N...",A5AbcES_0
3,"[{'from': 'human', 'markdown': None, 'text': N...",hRPPgZT_0
4,"[{'from': 'gpt', 'markdown': None, 'text': Non...",hRPPgZT_11


In [None]:
# Data preprocessing function
def processData(data):
    """Flatten one conversation record into a single training string.

    Args:
        data: A mapping-like row (dict or DataFrame row) with a 'conversations'
            entry: an iterable of dicts carrying the speaker under 'from' and
            the message under 'value'.

    Returns:
        One "From: <speaker>\\nContent: <message>\\n\\n" section per turn, concatenated
        in order. Returns "" when there are no turns.
    """
    turns = data['conversations']
    # `is None` instead of truthiness: the column may hold array-like objects
    # whose truth value is ambiguous.
    if turns is None:
        return ""

    sections = []
    for turn in turns:
        talker = turn.get('from')
        talking = turn.get('value')
        # dict.get's default only covers *missing* keys; a key that is present with a
        # None value would otherwise break the string building below.
        if talker is None:
            talker = 'Unknown'
        if talking is None:
            talking = 'No content'
        sections.append(f"From: {talker}\nContent: {talking}\n\n")

    # Join once instead of repeatedly growing a string inside the loop
    return "".join(sections)

# Build the flattened "text" column (one string per conversation) and wrap it in a Dataset
df["text"] = df.apply(processData, axis=1)
processed_data = Dataset.from_pandas(df.loc[:, ["text"]])

# Show the first flattened conversation
print(df["text"].iloc[0])

From: human
Content: Summarize the main ideas of Jeff Walker's Product Launch Formula into bullet points as it pertains to a growth marketing agency implementing these strategies and tactics for their clients...

From: gpt
Content: Here are the main ideas of Jeff Walker's Product Launch Formula that can be applied by a growth marketing agency for their clients:

1. Identify the target audience and their needs: Understand the ideal customer for the product or service, and create a messaging that resonates with them.
2. Pre-launch: Build anticipation and excitement for the launch by creating buzz, gathering testimonials and case studies, and using social media to create awareness.
3. Launch: Use a well-crafted launch sequence to maximize sales and conversions. This can include offering bonuses, creating scarcity, and using a deadline to create urgency.
4. Post-launch: Follow up with customers, gather feedback, and continue to provide value to keep them engaged and loyal.
5. Create a prod

In [None]:
# Compare the dataset structure before and after preprocessing
print("데이터 구조")
print(dataset)
print("전처리 후")
print(processed_data)

# Split the full data into training and evaluation sets.
# A fixed seed keeps the held-out 10% identical across kernel restarts, so
# validation losses from different runs are comparable.
final_data = processed_data.train_test_split(test_size=0.1, seed=42)

데이터 구조
DatasetDict({
    train: Dataset({
        features: ['conversations', 'id'],
        num_rows: 94145
    })
})
전처리 후
Dataset({
    features: ['text'],
    num_rows: 94145
})


In [None]:
# Model training parameters

# LoRA settings
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn", "c_proj", "lm_head"],
    lora_dropout=0.05,
    bias="lora_only",
    task_type="CAUSAL_LM"
)

# The model was loaded with a 4-bit quantization config, so run PEFT's k-bit
# preparation step before attaching the adapters (it was imported but never called).
# Gradient checkpointing is left off to keep the training loop otherwise unchanged.
# NOTE(review): the logged run shows training loss 0.0 and no validation loss from
# step 1400 on, which looks like numerical instability. Confirm this step resolves it;
# if not, revisit learning_rate / precision settings below.
model = get_peft_model(
    prepare_model_for_kbit_training(base_model, use_gradient_checkpointing=False),
    config,
)

# Training arguments
training_arguments = TrainingArguments(
    output_dir="sohui/nlpmodel",
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    learning_rate=5e-4,
    lr_scheduler_type="constant",
    weight_decay=0.01,
    logging_dir="sohui/nlpmodel/logs",
    logging_steps=200,
    push_to_hub=True,
    report_to="tensorboard",
    do_train=True,
    do_eval = True,
    # Evaluate and checkpoint on the same 200-step cadence so the best
    # checkpoint can be restored at the end of training.
    evaluation_strategy="steps",
    eval_steps=200,
    save_steps=200,
    load_best_model_at_end=True,
    fp16=False,
)


# Print the number of trainable parameters
model.print_trainable_parameters()


trainable params: 813,704 || all params: 82,726,280 || trainable%: 0.9836100450787827


In [None]:
from transformers import EarlyStoppingCallback

# Early-stopping callback with a patience of 3
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Supervised fine-tuning trainer reading the "text" column of both splits
trainer = SFTTrainer(
    model=model,
    peft_config=config,
    args=training_arguments,
    train_dataset=final_data["train"],
    eval_dataset=final_data["test"],
    dataset_text_field="text",
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=500,
    callbacks=[early_stopping],
)

# Run training; the returned TrainOutput is displayed as the cell result
trainer.train()


Map:   0%|          | 0/84730 [00:00<?, ? examples/s]

Map:   0%|          | 0/9415 [00:00<?, ? examples/s]

You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
200,3.8324,3.126953
400,3.4036,2.986328
600,3.3137,2.941406
800,3.2844,2.888672
1000,3.2258,2.851562
1200,3.2166,2.828125
1400,3.1544,
1600,0.0,
1800,0.0,


TrainOutput(global_step=1800, training_loss=2.603435736762153, metrics={'train_runtime': 574.4539, 'train_samples_per_second': 147.497, 'train_steps_per_second': 36.875, 'total_flos': 936044482958784.0, 'train_loss': 2.603435736762153, 'epoch': 0.08})

In [None]:
# Upload the model to the Hugging Face Hub

# Never hard-code an access token in the notebook: read it from the environment.
# The token that was previously written here is exposed and must be revoked.
# NOTE(review): when HF_TOKEN is unset this passes None, which is expected to fall
# back to the credentials stored by login() — confirm for the installed versions.
TOKEN = os.environ.get("HF_TOKEN")

trainer.push_to_hub()
model.push_to_hub("sohui/nlpmodel", use_temp_dir=True, use_auth_token=TOKEN)
tokenizer.push_to_hub("sohui/nlpmodel", use_temp_dir=True, use_auth_token=TOKEN)




CommitInfo(commit_url='https://huggingface.co/sohui/nlpmodel/commit/45025ef24259c0963e151326b96ed170f5cfea97', commit_message='Upload tokenizer', commit_description='', oid='45025ef24259c0963e151326b96ed170f5cfea97', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Model test function
def conversation_with_model(model_name, tokenizer, max_turns=3, max_length=1000, history_chars=1000):
    """Hold a short interactive chat with a causal language model.

    Each turn reads a line from stdin, appends it to a running "User:/AI:" transcript,
    generates a continuation and prints only the newly generated text.

    Args:
        model_name: Despite its name, a loaded model object exposing ``generate``.
        tokenizer: Tokenizer exposing ``encode``, ``decode`` and ``eos_token_id``.
        max_turns: Maximum number of user turns (typing "quit" ends the chat early).
        max_length: Total token budget (prompt + generated) passed to ``generate``.
        history_chars: The transcript is cut to its last ``history_chars`` characters
            after every turn.

    Returns:
        None. Replies are printed to stdout.
    """
    conversation_history = "User: Hello, how are you? \nAI: I'm good, thank you! How can I help you today?"
    # Inputs must sit on the same device as the model; objects without a
    # ``device`` attribute are left to handle placement themselves.
    device = getattr(model_name, "device", None)

    for _ in range(max_turns):
        # Read the user's input
        user_input = input("User: ")
        if user_input.lower() == "quit":
            break

        # Append the new turn to the transcript
        conversation_history += f"\nUser: {user_input} \nAI: "

        # Tokenize the transcript
        input_ids = tokenizer.encode(conversation_history, return_tensors='pt')
        # max_length counts the prompt too: keep only the most recent tokens when the
        # prompt alone would use up the budget, leaving room for a reply.
        if input_ids.shape[-1] >= max_length:
            input_ids = input_ids[:, -max(1, max_length // 2):]
        if device is not None:
            input_ids = input_ids.to(device)

        # Generate the reply. The mask is explicit because pad and EOS share one id,
        # so it cannot be inferred from the ids.
        output = model_name.generate(
            input_ids,
            attention_mask=input_ids.new_ones(input_ids.shape),
            max_length=max_length,
            pad_token_id=tokenizer.eos_token_id,
            no_repeat_ngram_size=2,
        )

        # Decode only the tokens produced after the prompt. Splitting the whole text on
        # "AI:" and taking the last piece would print a later, model-invented turn
        # whenever the continuation itself contains "AI:".
        prompt_len = input_ids.shape[-1]
        reply = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True).strip()
        print("AI:", reply)

        # Carry the reply into the transcript for the next turn
        conversation_history += reply

        # Trim the transcript when it grows too long
        if len(conversation_history) > history_chars:
            conversation_history = conversation_history[-history_chars:]

In [None]:
# distilgpt2 baseline test
# get_peft_model() injected the LoRA layers into `base_model` in place, so chatting
# with `base_model` here would exercise the fine-tuned weights rather than the
# original checkpoint. Load a fresh, untouched copy for the baseline instead.
baseline_model = AutoModelForCausalLM.from_pretrained(model_id)
conversation_with_model(baseline_model, tokenizer)

User: how are you gpt




AI: I am a gt 

AI, I have a new gts,  i have an old gss. I want to help me to get me a good gty  to do my work.

I want you to be happy, i want me  and i can help my  self  in my task.  my i am happy to have me in the task  the  you want  is a  gtk  that I can do  for me. i  want my self to  be  happy  if i do the work  it is  a Gtk that i could do for you. my I would  like to give me an  Gts that  can  do a work for  me, if I do it  an i would like me and  would want a bg  with  s  of  b g  (  {  }  )  //  `
 `
From: gx
Content: What is the gtz? What are the Gtz? How are they ?
``` ```.
`From gz
The gdt :
1. The gd  are a small  small, small and small. They are  different  from the other  types of ggts. It is an example of a different type of Gty, a type that is different from a other type. This type is called a `gtz`. The ` gds  have different types, and they are different in their type, but they have the same type and the type they do. These types are called ``gds`. These type are 

In [None]:
# Fine-tuned model test: reload the model and tokenizer from the Hub repository
finetuned_repo = "sohui/nlpmodel"
my_model = AutoModelForCausalLM.from_pretrained(finetuned_repo)
my_tokenizer = AutoTokenizer.from_pretrained(finetuned_repo, trust_remote_code=True)

conversation_with_model(my_model, my_tokenizer)



User: how are you model
AI: I am a model, I have a new model. I want to help me in this case. 


From: gpt
Content: Here's a list of the models I can model:

1. The model I created is a simple model that is designed to be used for the model you are using.
2. It is an example of a type of model in a class that can be applied to a specific type. You can use the example to create a function that will use a different type to use it. Here is the list:

1
3. A function called a constructor that returns a string of parameters that you can specify to the constructor. This function is called to return a String of parameter parameters you could specify. These parameters are used to specify the type and type you would specify in the function. For example, you might want a parameter parameter to include a value in your function, such as a number of values.

2
4. An example that I could use to define a method that takes a variable of input and returns an array of variables that are not available to