In [39]:
import requests
import ollama
# Ollama 서버 설정
ollama_host = "http://sg040:11435"
client = ollama.Client(host=ollama_host)  # 클라이언트 인스턴스 생성
prompt = "Analyze the given image and describe only the specific actions and interactions of the people in the scene. Focus on what they are doing, their gestures, expressions, and interactions, rather than general details about the environment or objects. Avoid assumptions beyond what is explicitly shown in the image."
# "Analyze the given image and describe only the specific actions and interactions of the people in the scene. Focus on what they are doing, their gestures, expressions, and interactions, rather than general details about the environment or objects. Avoid assumptions beyond what is explicitly shown in the image."

try:
    response = requests.get(ollama_host)
    print("Server connected")
    print(response.text)
    
except requests.ConnectionError:
    print("Not connected")

Server connected
Ollama is running


In [40]:
import base64

with open("./image.png", "rb") as img_file:
    base64_image = base64.b64encode(img_file.read()).decode("utf-8")
    
response = client.chat(
    model='llama3.2-vision', # need a100:1 gpu
    # model='llama3.2-vision:90b', # need a100:2 gpu
    messages=[{
        'role': 'user',
        'content': prompt,
        'images': [base64_image]  # Base64 문자열 전달
    }]
)

print(response['message']['content'])

This image presents a two-shot collage of the same scene, with the top shot featuring an empty desk and chair, while the bottom shot captures a woman seated behind the desk.

In the top frame, a dark wood table sits on a tan carpeted floor, accompanied by a matching wooden chair. The room is illuminated by natural light pouring in from large windows, which are adorned with sheer white curtains. A stack of papers sits on the left side of the desk, and a small leather bag rests on the right.

In contrast, the bottom frame reveals a woman, dressed in a dark gray blazer over a black turtleneck, sitting behind the desk with her hands clasped together. Her face is blurred, but her attire suggests she may be an attorney or businesswoman. The background of this image is identical to that of the top shot.

Overall, this image appears to capture a professional setting, possibly a law office or meeting room, where someone has left their belongings and perhaps taken a short break.


In [42]:
import os
import json
import base64
import ollama
import time
import traceback
from pathlib import Path
from datetime import datetime

# 경로 설정
image_dir = Path('/scratch/jsong132/Can_LLM_Learn_New_Language/cropped_img')
output_file = Path('/scratch/jsong132/Can_LLM_Learn_New_Language/result.json')

# 출력 디렉토리 생성
output_file.parent.mkdir(parents=True, exist_ok=True)

# 로깅 설정
def log(message, level="INFO"):
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{timestamp}] [{level}] {message}")

# 초기화
results = []
VALID_EXTENSIONS = ('.png', '.jpg', '.jpeg')
total_images = len([f for f in image_dir.iterdir() if f.suffix.lower() in VALID_EXTENSIONS])
processed = 0

log(f"Starting image processing for {total_images} images")

# 이미지 처리
for image_path in image_dir.iterdir():
    if not (image_path.is_file() and image_path.suffix.lower() in VALID_EXTENSIONS):
        continue

    processed += 1
    log(f"Processing image ({processed}/{total_images}): {image_path.name}")
    start_time = time.time()
    
    try:
        # 이미지 인코딩
        encode_start = time.time()
        with open(image_path, "rb") as img_file:
            base64_image = base64.b64encode(img_file.read()).decode("utf-8")
        encode_time = time.time() - encode_start
        log(f"Image encoded in {encode_time:.2f}s")

        # API 요청
        api_start = time.time()
        response = client.chat(
            model="llama3.2-vision",
            # model='llama3.2-vision:90b',
            messages=[{
                'role': 'user',
                'content': prompt,
                'images': [base64_image]
            }]
        )
        api_time = time.time() - api_start
        log(f"API response received in {api_time:.2f}s")

        # 결과 저장
        results.append({
            'image': str(image_path),
            'response': response['message']['content'],
            'processing_time': {
                'encoding': encode_time,
                'api_call': api_time,
                'total': time.time() - start_time
            },
            'status': 'success'
        })

    except Exception as e:
        error_msg = f"Error processing {image_path.name}: {str(e)}"
        error_trace = traceback.format_exc()
        log(error_msg, "ERROR")
        log(f"Error details:\n{error_trace}", "DEBUG")
        
        results.append({
            'image': str(image_path),
            'error': error_msg,
            'error_trace': error_trace,
            'status': 'failed'
        })

# 결과 저장
save_start = time.time()
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)
save_time = time.time() - save_start

log(f"Results saved to {output_file} in {save_time:.2f}s")
log(f"Processing completed. Success: {len([x for x in results if x['status']=='success'])}, Failed: {len([x for x in results if x['status']=='failed'])}")

[2025-03-03 19:08:41] [INFO] Starting image processing for 824 images
[2025-03-03 19:08:41] [INFO] Processing image (1/824): cropped_00_37_23.png
[2025-03-03 19:08:41] [INFO] Image encoded in 0.02s
[2025-03-03 19:08:45] [INFO] API response received in 4.03s
[2025-03-03 19:08:45] [INFO] Processing image (2/824): cropped_00_34_42.png
[2025-03-03 19:08:45] [INFO] Image encoded in 0.01s
[2025-03-03 19:08:48] [INFO] API response received in 2.70s
[2025-03-03 19:08:48] [INFO] Processing image (3/824): cropped_00_14_58.png
[2025-03-03 19:08:48] [INFO] Image encoded in 0.05s
[2025-03-03 19:08:51] [INFO] API response received in 2.97s
[2025-03-03 19:08:51] [INFO] Processing image (4/824): cropped_00_03_16.png
[2025-03-03 19:08:51] [INFO] Image encoded in 0.04s
[2025-03-03 19:08:54] [INFO] API response received in 2.77s
[2025-03-03 19:08:54] [INFO] Processing image (5/824): cropped_00_50_59.png
[2025-03-03 19:08:54] [INFO] Image encoded in 0.06s
[2025-03-03 19:08:57] [INFO] API response received

In [45]:
# Sort by the timestamp

import json
from pathlib import Path

input_file = Path('result.json')
output_file = Path('result_origin.json')

with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

def process_item(item):
    try:
        filename = Path(item['image']).name
        
        # 파일명 구조: cropped_00_02_45.png → ['cropped', '00', '02', '45.png']
        parts = filename.split('_')
        
        # 시간 정보 추출 (시, 분, 초)
        hours = int(parts[1])          # 첫 번째 숫자 (00)
        mins = int(parts[2])           # 두 번째 숫자 (02)
        secs = int(parts[3].split('.')[0])  # 세 번째 숫자 (45.png → 45)

        return {
            'image': filename,
            'time_seconds': hours*3600 + mins*60 + secs,
            'response': item['response'],
            'status': item['status']
        }
    except Exception as e:
        print(f"파일명 형식 오류: {filename} → {str(e)}")
        return None

# 데이터 처리 (오류 항목 필터링)
processed_data = [item for item in (process_item(i) for i in data) if item is not None]

# 시간 순 정렬
sorted_data = sorted(processed_data, key=lambda x: x['time_seconds'])

# 최종 출력 형식
final_data = [
    {
        'image': item['image'],
        'response': item['response'],
        'status': item['status']
    }
    for item in sorted_data
]

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(final_data, f, ensure_ascii=False, indent=4)

print(f"정렬 완료! 결과 파일: {output_file}")

정렬 완료! 결과 파일: result_origin.json


In [20]:
# OLMo Test

In [None]:
# From HF
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# 모델 & 토크나이저 로드 (OLMo는 trust_remote_code 필요)
model = AutoModelForCausalLM.from_pretrained(
    "allenai/OLMo-7B-hf",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-7B-hf")

# 추론 파라미터 설정
question = """Hello My name is Jun,
My idea to make LLM learn new language is giving them a circumstance information with the sentences of language.
Answer:"""

inputs = tokenizer(
    question,
    return_tensors="pt",
    max_length=256,
    truncation=True
)

# 생성 설정
outputs = model.generate(
    inputs.input_ids.to(model.device),
    max_new_tokens=150,
    temperature=0.3,  # 창의성 ↓ → 논리적 답변 ↑
    top_p=0.95,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id  # OLMo 토크나이저 이슈 방지
)

# 결과 디코딩
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(answer)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [7]:
# From Ollmo
import ollama

prompt = "한국말 할줄 알아?"

# 요청 전송
response = client.chat(
    model='darkmoon/olmo',
    messages=[{
        'role': 'user',
        'content': prompt,
    }]
)

print(response['message']['content'])


English translation: How to say what time is it in Korean.
How do you say 'where is the airport' in Korean?
English translation: Where is the airport in Korean?
How do you say 'what's your name' in Korean?
English translation: What's your name in Korean?
