In [3]:
###############################################
# Extract the circumstance from the image
################################################

import requests
import ollama
# Ollama 서버 설정
ollama_host = "http://sg003:11434"
client = ollama.Client(host=ollama_host)  # 클라이언트 인스턴스 생성
prompt = "Don't guess it is drama or tv show. Analyze the given image and describe only the specific actions and interactions of the people in the scene. Focus on what they are doing, their gestures, expressions, and interactions, rather than general details about the environment or objects. Avoid assumptions beyond what is explicitly shown in the image."
# "Analyze the given image and describe only the specific actions and interactions of the people in the scene. Focus on what they are doing, their gestures, expressions, and interactions, rather than general details about the environment or objects. Avoid assumptions beyond what is explicitly shown in the image."

try:
    response = requests.get(ollama_host)
    print("Server connected")
    print(response.text)
    
except requests.ConnectionError:
    print("Not connected")

Server connected
Ollama is running


In [4]:
###############################################
# Test llama vision working
# Must choose a100:2, for llama3.2-vision:90b
# a100:1 works for llama3.2-vision
################################################

import base64

with open("./image.png", "rb") as img_file:
    base64_image = base64.b64encode(img_file.read()).decode("utf-8")
    
response = client.chat(
    model='llama3.2-vision', # need a100:1 gpu
    # model='llama3.2-vision:90b', # need a100:2 gpu
    # model='llama3.2-vision', # need a100:1 gpu
    messages=[{
        'role': 'user',
        'content': prompt,
        'images': [base64_image]  # Base64 문자열 전달
    }]
)

print(response['message']['content'])

In this image, a woman sits across from a man in an office setting.

The woman, with dark hair pulled back, wears a black turtleneck under a gray jacket and sits in a large, high-backed chair at a desk. Her hands are clasped together on the surface in front of her. The older gentleman, wearing a dark suit and white shirt, faces away from the camera towards the woman.

The background features beige walls with windows dressed in sheer curtains, contributing to a formal atmosphere.


In [6]:
import os
import json
import base64
import ollama
import time
import traceback
from pathlib import Path
from datetime import datetime

# 경로 설정
image_dir = Path('/scratch/jsong132/Can_LLM_Learn_New_Language/cropped_img')
output_file = Path('/scratch/jsong132/Can_LLM_Learn_New_Language/result.json')
used_model = ""

# 출력 디렉토리 생성
output_file.parent.mkdir(parents=True, exist_ok=True)

# 로깅 설정
def log(message, level="INFO"):
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{timestamp}] [{level}] {message}")

# 초기화
results = []
VALID_EXTENSIONS = ('.png', '.jpg', '.jpeg')
total_images = len([f for f in image_dir.iterdir() if f.suffix.lower() in VALID_EXTENSIONS])
processed = 0

log(f"Starting image processing for {total_images} images")

# 이미지 처리
for image_path in image_dir.iterdir():
    if not (image_path.is_file() and image_path.suffix.lower() in VALID_EXTENSIONS):
        continue

    processed += 1
    log(f"Processing image ({processed}/{total_images}): {image_path.name}")
    start_time = time.time()
    
    try:
        # 이미지 인코딩
        encode_start = time.time()
        with open(image_path, "rb") as img_file:
            base64_image = base64.b64encode(img_file.read()).decode("utf-8")
        encode_time = time.time() - encode_start
        log(f"Image encoded in {encode_time:.2f}s")

        # API 요청
        api_start = time.time()
        used_model = "llama3.2-vision"
        response = client.chat(
        ###################### Choose Model ###################
            model="llama3.2-vision",
            # "llama3.2-vision:90b"
            # llama3.2-vision"
            messages=[{
                'role': 'user',
                'content': prompt,
                'images': [base64_image]
            }]
        )
        api_time = time.time() - api_start
        log(f"API response received in {api_time:.2f}s")

        # 결과 저장
        results.append({
            'image': str(image_path),
            'response': response['message']['content'],
            'processing_time': {
                'encoding': encode_time,
                'api_call': api_time,
                'total': time.time() - start_time
            },
            'status': 'success'
        })

    except Exception as e:
        error_msg = f"Error processing {image_path.name}: {str(e)}"
        error_trace = traceback.format_exc()
        log(error_msg, "ERROR")
        log(f"Error details:\n{error_trace}", "DEBUG")
        
        results.append({
            'image': str(image_path),
            'error': error_msg,
            'error_trace': error_trace,
            'status': 'failed'
        })

# 결과 저장
save_start = time.time()
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)
save_time = time.time() - save_start

log(f"Results saved to {output_file} in {save_time:.2f}s")
log(f"Processing completed. Success: {len([x for x in results if x['status']=='success'])}, Failed: {len([x for x in results if x['status']=='failed'])}")

[2025-03-04 19:12:16] [INFO] Starting image processing for 824 images
[2025-03-04 19:12:16] [INFO] Processing image (1/824): cropped_00_37_23.png
[2025-03-04 19:12:16] [INFO] Image encoded in 0.00s
[2025-03-04 19:12:32] [INFO] API response received in 15.39s
[2025-03-04 19:12:32] [INFO] Processing image (2/824): cropped_00_34_42.png
[2025-03-04 19:12:32] [INFO] Image encoded in 0.18s
[2025-03-04 19:12:35] [INFO] API response received in 3.08s
[2025-03-04 19:12:35] [INFO] Processing image (3/824): cropped_00_14_58.png
[2025-03-04 19:12:35] [INFO] Image encoded in 0.07s
[2025-03-04 19:12:38] [INFO] API response received in 2.79s
[2025-03-04 19:12:38] [INFO] Processing image (4/824): cropped_00_03_16.png
[2025-03-04 19:12:38] [INFO] Image encoded in 0.06s
[2025-03-04 19:12:41] [INFO] API response received in 2.59s
[2025-03-04 19:12:41] [INFO] Processing image (5/824): cropped_00_50_59.png
[2025-03-04 19:12:41] [INFO] Image encoded in 0.07s
[2025-03-04 19:12:43] [INFO] API response receive

In [8]:
# Sort by the timestamp

import json
from pathlib import Path

output = 'result_origin_v8.json'
output_pretty = 'result_pretty_v8.json'

input_file = Path('result.json')
output_file = Path(output)

with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

def process_item(item):
    try:
        filename = Path(item['image']).name
        
        # 파일명 구조: cropped_00_02_45.png → ['cropped', '00', '02', '45.png']
        parts = filename.split('_')
        
        # 시간 정보 추출 (시, 분, 초)
        hours = int(parts[1])          # 첫 번째 숫자 (00)
        mins = int(parts[2])           # 두 번째 숫자 (02)
        secs = int(parts[3].split('.')[0])  # 세 번째 숫자 (45.png → 45)

        return {
            'image': filename,
            'time_seconds': hours*3600 + mins*60 + secs,
            'response': item['response'],
            'status': item['status']
        }
    except Exception as e:
        print(f"파일명 형식 오류: {filename} → {str(e)}")
        return None

# 데이터 처리 (오류 항목 필터링)
processed_data = [item for item in (process_item(i) for i in data) if item is not None]

# 시간 순 정렬
sorted_data = sorted(processed_data, key=lambda x: x['time_seconds'])

# 최종 출력 형식
final_data = [
    {
        'used_model': used_model,
        'image': sorted_data[0]['image'],
        'response': sorted_data[0]['response'],
        'status': sorted_data[0]['status']
    }
] + [
    {
        'image': item['image'],
        'response': item['response'],
        'status': item['status']
    }
    for item in sorted_data[1:]
]

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(final_data, f, ensure_ascii=False, indent=4)

print(f"정렬 완료! 결과 파일: {output_file}")

정렬 완료! 결과 파일: result_origin_v8.json


In [9]:
# merge_subtitle_w_vision_result.PY
import json

# result_origin.json 파일 로드
with open(output, "r", encoding="utf-8") as f:
    result_data = json.load(f)

# subtitle.json 파일 로드
with open("subtitle.json", "r", encoding="utf-8") as f:
    subtitle_data = json.load(f)

# dataset.json으로 저장할 데이터 리스트 초기화
dataset = []

# 두 파일의 데이터를 매칭하여 dataset 생성
for result_item, subtitle_item in zip(result_data, subtitle_data):
    input_text = subtitle_item.get("context", "")  # subtitle.json의 "context"를 input으로
    output_text = result_item.get("response", "")  # result_origin_v4.json의 "response"를 output으로
    
    # input과 output이 모두 비어있지 않은 경우만 추가
    if input_text and output_text:
        dataset.append({"input": input_text, "output": output_text})

# dataset.json 파일로 저장
with open("dataset.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)

print("dataset.json 파일이 생성되었습니다.")

dataset.json 파일이 생성되었습니다.


In [10]:
###############################################
# json_pretty.py
# Make json file readible.
################################################

import json
import textwrap

# JSON 파일 로드
with open(output, 'r', encoding='utf-8') as f:
    data = json.load(f)

# response 값을 50자씩 나누어 배열로 분할
for item in data:
    if 'response' in item:
        # 50자씩 나누기
        wrapped_text = textwrap.wrap(item['response'], width=100)
        item['response'] = wrapped_text

# 수정된 JSON 파일 저장
with open(output_pretty, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

print("JSON 파일이 수정되어 저장되었습니다.")


JSON 파일이 수정되어 저장되었습니다.


In [3]:
###############################################
# Model train
################################################

from datasets import Dataset
import json

# dataset.json 파일 로드
with open("dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Hugging Face Dataset으로 변환
dataset = Dataset.from_dict({
    "input": [item["input"] for item in data],
    "output": [item["output"] for item in data]
})

dataset = dataset.train_test_split(test_size=0.1)  # 90% 학습, 10% 검증

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

base_model = "allenai/OLMo-7B"

# 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained(base_model)

# 모델 로드
model = AutoModelForCausalLM.from_pretrained(base_model)

# 모델을 bfloat16로 캐스팅
model = model.to(torch.bfloat16)

# 모델을 GPU로 이동 (옵션)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# 모델이 GPU로 이동되었는지 확인
print(f"Model is on device: {model.device}")
print(f"Model dtype: {model.dtype}")

The repository for allenai/OLMo-7B contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/allenai/OLMo-7B.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  ㅛ
The repository for allenai/OLMo-7B contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/allenai/OLMo-7B.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Model is on device: cuda:0
Model dtype: torch.bfloat16


In [5]:
def preprocess_function(examples):
    # 입력과 출력을 결합하여 토큰화
    inputs = [f"Input: {input}\nOutput: {output}" for input, output in zip(examples["input"], examples["output"])]
    return tokenizer(inputs, truncation=True, padding="max_length", max_length=512)

# 데이터셋 전처리
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/741 [00:00<?, ? examples/s]

Map:   0%|          | 0/83 [00:00<?, ? examples/s]

In [6]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# 학습 하이퍼파라미터 설정
model.gradient_checkpointing_enable()

training_args = TrainingArguments(
    output_dir="./fine-tuned-models/results",
    eval_strategy="epoch",
    learning_rate=1e-4,  # 안정적인 학습을 위해 감소
    per_device_train_batch_size=16,  # VRAM 허용 범위 내에서 증가
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,  # 배치 크기 증가로 인해 줄임
    num_train_epochs=15,  # 좀 더 충분한 학습
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    fp16=False, # 사양 좋다면 이걸 켜보는것도
    bf16=True, # 주로 안정적인 학습051,  # 학습 초반 안정성 증가
    lr_scheduler_type="cosine",  # 코사인 스케줄러 적용
    adam_beta1=0.9,
    adam_beta2=0.999,
)

# 데이터 콜레이터 초기화
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Masked Language Modeling 사용 여부 (False로 설정)
)

# Trainer 초기화
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,  # 데이터 콜레이터 추가
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [7]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,5.2573,4.426455
2,3.2562,3.128247
3,2.6514,2.671775
4,2.3391,2.494674
5,1.9073,2.430858
6,1.6087,2.454844
7,1.1853,2.573993
8,0.7678,2.797199
9,0.4895,3.033847
10,0.3203,3.158739


TrainOutput(global_step=705, training_loss=1.46280672135928, metrics={'train_runtime': 2998.1424, 'train_samples_per_second': 3.707, 'train_steps_per_second': 0.235, 'total_flos': 2.281604873453568e+17, 'train_loss': 1.46280672135928, 'epoch': 15.0})

In [8]:
model.save_pretrained("./fine-tuned-models/fine-tuned-olmo-v10") # OLMoE-1B-7B-0924
tokenizer.save_pretrained("./fine-tuned-models/fine-tuned-olmo-v10") 

('./fine-tuned-models/fine-tuned-olmo-v10/tokenizer_config.json',
 './fine-tuned-models/fine-tuned-olmo-v10/special_tokens_map.json',
 './fine-tuned-models/fine-tuned-olmo-v10/tokenizer.json')