In [None]:
import transformers
import torch
import os

# Hugging Face access token, read from the HF_TOKEN environment variable so the
# secret never has to be stored in the notebook. An unset or empty variable
# becomes None, i.e. no explicit token is passed to the pipeline.
MY_HF_TOKEN = os.environ.get("HF_TOKEN") or None

model_id = "HumanLLMs/Human-Like-LLama3-8B-Instruct"

# Build a text-generation pipeline for the model, requesting bfloat16 weights and
# automatic device placement.
# NOTE(review): the recorded run warns that `torch_dtype` is deprecated in favour of
# `dtype`; switch the key once the minimum transformers version is pinned.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
    token=MY_HF_TOKEN,
)

# Minimal chat used as a smoke test: one system instruction and one user turn.
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

# Generation stops at either the tokenizer's EOS token or the "<|eot_id|>" token.
# This list is reused by the batch-generation cell further down.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

outputs = pipeline(
    messages,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)

# LLM test: the last entry of the returned conversation is the assistant's reply.
print(outputs[0]["generated_text"][-1])

config.json:   0%|          | 0.00/748 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]



tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

Device set to use cuda:0


{'role': 'assistant', 'content': "Arrrr, me hearty! Me name be Captain Chatbot, the scurviest pirate to ever sail the Seven Seas! 🏴\u200d☠️ I've got a treasure trove of tales and jokes to share, so hoist the sails and join me crew! 😄 What be bringin' ye to these waters? 🌊"}


In [None]:
import kagglehub
import pandas as pd
import os

# Fetch the IELTS scored-essays dataset via kagglehub and report where it landed.
path = kagglehub.dataset_download("mazlumi/ielts-writing-scored-essays-dataset")
print(f"dataset path: {path}")

# Show what the download folder contains.
file_list = os.listdir(path)
print(f"file list: {file_list}")

# Read the essay CSV from the download folder into a dataframe.
csv_file_name = 'ielts_writing_dataset.csv'
csv_file_path = os.path.join(path, csv_file_name)
df = pd.read_csv(csv_file_path)

# Keep only the essay prompt and the essay text, then preview the first rows.
df_new = df[['Question', 'Essay']]
print(df_new.head())

Downloading from https://www.kaggle.com/api/v1/datasets/download/mazlumi/ielts-writing-scored-essays-dataset?dataset_version_number=1...


100%|██████████| 674k/674k [00:00<00:00, 748kB/s]

Extracting files...
데이터셋이 저장된 폴더 경로: /root/.cache/kagglehub/datasets/mazlumi/ielts-writing-scored-essays-dataset/versions/1
폴더 안의 파일 목록: ['ielts_writing_dataset.csv']

--- 선택된 열만 남은 데이터프레임 (After) ---
                                            Question  \
0  The bar chart below describes some changes abo...   
1  Rich countries often give money to poorer coun...   
2  The bar chart below describes some changes abo...   
3  Rich countries often give money to poorer coun...   
4  The graph below shows the number of overseas v...   

                                               Essay  
0  Between 1995 and 2010, a study was conducted r...  
1  Poverty represents a worldwide crisis. It is t...  
2  The left chart shows the population change hap...  
3  Human beings are facing many challenges nowada...  
4  Information about the thousands of visits from...  





In [None]:
# Keep the first row for every distinct question and renumber the rows from 0.
df_unique = (
    df_new
    .drop_duplicates(subset=['Question'], keep='first')
    .reset_index(drop=True)
)

# Report how many rows remain after removing repeated questions.
print(f"origin length of dataset: {df.shape[0]}")
print(f"new length of dataset: {df_unique.shape[0]}")

원본 데이터 길이: 1435
중복 제거 후 데이터 길이: 402


In [None]:
# Mount Google Drive at /content/drive so the result file can be saved there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Batched generation needs a padding token: reuse the model config's EOS id and pad
# on the left so every prompt ends right where generation starts.
# NOTE(review): assumes config.eos_token_id is a single int, not a list -- confirm.
pipeline.tokenizer.pad_token_id = pipeline.model.config.eos_token_id
pipeline.tokenizer.padding_side = 'left'

# Half-open row range [start_index, end_index) of df_unique to write essays for.
start_index = 0
# Clamp to the table size so a range past the last row cannot raise a lookup error.
end_index = min(200, len(df_unique))

# The system prompt is the same for every essay, so it is defined once.
system_prompt = """You are an advanced AI writer. Your task is to compose a high-quality, well-structured argumentative essay."""

# Build one chat (system + user message) per question in the selected range.
next_messages = []
for i in range(start_index, end_index):
    # Positional access: independent of whatever index labels df_unique carries.
    topic = df_unique['Question'].iloc[i]
    user_prompt = f"""
    Topic: {topic}

    Essay Requirements:
    - Length: The essay must be between 200 and 300 words.
    - Structure: The essay must include an introduction, body, and conclusion.
    - Tone: The tone should be formal, objective, and analytical.
    - Content: You must develop a clear thesis and build a coherent argument.
    - Format: Do not include a title. Begin directly with the first sentence of the introduction.

    Begin writing the essay now.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    next_messages.append(messages)

# Batch processing: sampled generation over all prompts, stopping at `terminators`.
outputs = pipeline(
    next_messages,
    max_new_tokens=512,
    eos_token_id=terminators,
    do_sample=True,
    temperature=1,
    top_p=1,
    batch_size=8 # appropriate batch size for free colab account
)

# Each output holds the full conversation; its last message is the generated essay.
generated_essays = [
    output[0]["generated_text"][-1]['content']
    for output in outputs
]
results_df = pd.DataFrame({'Generated_Essay_humanlike': generated_essays})

print(f"{len(results_df)}: number of essays written")
print(results_df)

# Persist to the mounted Drive; utf-8-sig writes a BOM at the start of the file.
results_df.to_csv('/content/drive/MyDrive/generated_essays_humanlike.csv', index=False, encoding='utf-8-sig')




--- 최종 데이터프레임 ---
총 96개의 에세이가 저장되었습니다.
                         Generated_Essay_LLama_3_8B_2
0   Shopping online has become a norm for many of ...
1   The graph showing the number of university gra...
2   When it comes to dress code policies in the wo...
3   The population figures in Japan, as presented ...
4   The impact of colour on human emotions has bee...
..                                                ...
91  The debate about using physical force to disci...
92  The tourism industry has grown exponentially i...
93  Many argue that a national system of unemploym...
94  With the increasing reliance on technology, I ...
95  Many people think that a high salary is the ke...

[96 rows x 1 columns]
