In [None]:
# ==========================================
# 1. IMPORT LIBRARIES
# ==========================================
import pandas as pd
import json
import os
import re
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from openai import OpenAI, APIConnectionError
from dotenv import load_dotenv

load_dotenv()
MY_API_KEY = os.getenv("OPEN_API_KEY")

MY_MODEL_ID = os.getenv("FINE_TUNED_MODEL")


In [3]:
# ==========================================
# 2. DEFINISI STOPWORDS & FUNGSI CLEANING
# ==========================================
STOPWORDS_ID = {
    'yang', 'dan', 'di', 'ke', 'dari', 'ini', 'itu', 'untuk', 'pada', 'dengan',
    'adalah', 'saya', 'kamu', 'dia', 'mereka', 'kita', 'akan', 'bisa', 'ada',
    'tidak', 'yg', 'ya', 'aja', 'gak', 'nya', 'kalo', 'kalau', 'udah', 'sudah',
    'bukan', 'tapi', 'tuh', 'dong', 'kok', 'sih', 'gue', 'lu', 'aku', 'apa',
    'bgt', 'banget', 'juga', 'lagi', 'mau', 'sama', 'banyak', 'bikin', 'buat',
    'jadi', 'terus', 'karena', 'seperti', 'atau', 'saat', 'dalam', 'masih',
    'begitu', 'semua', 'bbrp', 'utk', 'sdh', 'dgn', 'dr',
    'lebih', 'kemarin', 'per', 'punya', 'setelah', 'menjadi',
    'secara', 'lalu', 'memang', 'paling','nih','dulu','hingga','sampai','sekarang',
    'apakah','memiliki','kembali','sejak','kinerja','kira','coba','nder'
}

STOPWORDS_SAHAM = {
    'saham', 'harga', 'market', 'indeks', 'lot', 'hari', 'tahun',
    'beli', 'jual', 'hold', 'masuk', 'keluar', 'rekomendasi',
    'bbca', 'bbri', 'bmri', 'bbtn', 'tlkm', 'unvr', 'goto', 'bank', 'bca',
    'bbni', 'hmsp', 'tpia', 'bren', 'byan', 'indf', 'antm', 'asii','amp',
    'bri', 'mandiri', 'brpt', 'chandra', 'asri', 'tuck', 'wong','astra','ggrm',
    'ihsg', 'persero', 'tbk', 'lembar', 'pemegang', 'pbv', 'pukul', 'wib','pt',
    'syariah', 'low kwong','prajogo pangestu','indonesia','prajogo','pangestu','bayan','low',
    'kwong','adro','bisnis','emiten','rokok','unilever','rdtx','sampoerna','produk'
}

ALL_STOPWORDS = STOPWORDS_ID.union(STOPWORDS_SAHAM)

def load_and_clean_data(file_path):
    """Memuat dan membersihkan dataset."""
    print(f"Loading data from: {file_path}")
    if not os.path.exists(file_path):
        if os.path.exists(os.path.basename(file_path)):
            file_path = os.path.basename(file_path)
        else:
            raise FileNotFoundError("File tidak ditemukan.")

    df = pd.read_csv(file_path)

    # Hapus kolom tidak terpakai
    unused_cols = ['Quote Count', 'Reply Count', 'Retweet Count', 'Favorite Count', 'English Translation', 'Tweet Date']
    df = df.drop(columns=[c for c in unused_cols if c in df.columns], errors='ignore')

    # Fungsi Cleaning Text
    def clean_text(text):
        if not isinstance(text, str): return ""
        text = re.sub(r'\[URL\]|\[HASHTAG\]|\[USERNAME\]', '', text) # Hapus placeholder
        text = text.lower()
        text = re.sub(r'[^a-z\s]', ' ', text) # Hapus non-huruf
        text = re.sub(r'\s+', ' ', text).strip()

        # Stopword removal
        tokens = [t for t in text.split() if t not in ALL_STOPWORDS]
        return ' '.join(tokens)

    print("Cleaning data...")
    df['cleaned_text'] = df['Sentence'].apply(clean_text)

    # Hapus duplikat & data kosong
    df = df.drop_duplicates(subset=['cleaned_text'])
    df = df[df['cleaned_text'].str.strip() != '']

    print(f"Data siap. Total baris: {len(df)}")
    return df[['cleaned_text', 'Sentiment']]

In [4]:
# ==========================================
# 3. FUNGSI KONVERSI KE JSONL (FORMAT OPENAI)
# ==========================================
def create_finetune_jsonl(df, output_file):
    """
    Mengubah DataFrame menjadi format JSONL yang diminta OpenAI:
    {"messages": [{"role": "system", ...}, {"role": "user", ...}, {"role": "assistant", ...}]}
    """
    system_prompt = "Kamu adalah model analisis sentimen saham Indonesia. Jawab hanya dengan satu label: Positive, Negative, atau Neutral."

    with open(output_file, "w", encoding="utf-8") as f:
        for _, row in df.iterrows():
            record = {
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": row["cleaned_text"]},
                    {"role": "assistant", "content": row["Sentiment"]}
                ]
            }
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
    print(f"File JSONL berhasil dibuat: {output_file} ({len(df)} baris)")

In [None]:
# ==========================================
# 4. EKSEKUSI UTAMA (DATA PREPARATION)
# ==========================================

# A. LOAD DATA
FILE_PATH = 'data.csv'
df_clean = load_and_clean_data(FILE_PATH)

# B. SPLIT DATASET (80% Train, 10% Validation, 10% Test)
train_df, temp_df = train_test_split(
    df_clean,
    test_size=0.2,
    random_state=42,
    stratify=df_clean['Sentiment']
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df['Sentiment']
)

print(f"\nDistribusi Data:")
print(f"Training Set   : {len(train_df)} ({len(train_df)/len(df_clean):.1%})")
print(f"Validation Set : {len(val_df)} ({len(val_df)/len(df_clean):.1%})")
print(f"Testing Set    : {len(test_df)} ({len(test_df)/len(df_clean):.1%})")

# C. SIMPAN FILE JSONL & CSV
create_finetune_jsonl(train_df, "sentiment_train.jsonl")
create_finetune_jsonl(val_df, "sentiment_val.jsonl")
test_df.to_csv("sentiment_test.csv", index=False)

Loading data from: data_cleaned_full.csv
Cleaning data...
Data siap. Total baris: 3141

Distribusi Data:
Training Set   : 2512 (80.0%)
Validation Set : 314 (10.0%)
Testing Set    : 315 (10.0%)
File JSONL berhasil dibuat: sentiment_train.jsonl (2512 baris)
File JSONL berhasil dibuat: sentiment_val.jsonl (314 baris)


In [None]:
# ==========================================
# 5. PROSES FINE-TUNING
# ==========================================
try:
    client = OpenAI(api_key=MY_API_KEY)

    print("\n--- 1. Memulai Upload File Training ---")
    # Upload Training File
    with open("sentiment_train.jsonl", "rb") as f:
        train_file = client.files.create(
            file=f,
            purpose="fine-tune"
        )
    print(f"[BERHASIL] Training File ID: {train_file.id}")

    print("\n--- 2. Memulai Upload File Validation ---")
    # Upload Validation File
    with open("sentiment_val.jsonl", "rb") as f:
        val_file = client.files.create(
            file=f,
            purpose="fine-tune"
        )
    print(f"[BERHASIL] Validation File ID: {val_file.id}")

    time.sleep(5)

    print("\n--- 3. Membuat Job Fine-Tuning ---")
    # Create Fine-Tuning Job
    job = client.fine_tuning.jobs.create(
        training_file=train_file.id,
        validation_file=val_file.id,
        model="gpt-4o-mini-2024-07-18",
        suffix="saham-indo-v1"
    )

    print(f"[BERHASIL] Job Created!")
    print(f"Job ID : {job.id}")
    print(f"Status : {job.status}")

    print("\n" + "="*50)
    print("PROSES BERJALAN DI SERVER OPENAI.")
    print("Cek email Anda untuk notifikasi saat training selesai.")
    print("="*50)

except Exception as e:
    print(f"\n[TERJADI ERROR]: {e}")
    print("Kemungkinan penyebab error:")
    print("1. API Key salah/tidak valid.")
    print("2. Saldo OpenAI habis (perlu top-up credit, bukan ChatGPT Plus).")
    print("3. File .jsonl belum terbuat (jalankan langkah sebelumnya).")


--- 1. Memulai Upload File Training ---
[BERHASIL] Training File ID: file-AA151R5yhbw3EiiuSTjgbe

--- 2. Memulai Upload File Validation ---
[BERHASIL] Validation File ID: file-Pzv6MwRjUMkt2nBdxMwFKW

--- 3. Membuat Job Fine-Tuning ---
[BERHASIL] Job Created!
Job ID : ftjob-HcuVpGmVDMJeLwRkKmOlXFDS
Status : validating_files

PROSES BERJALAN DI SERVER OPENAI.
Cek email Anda untuk notifikasi saat training selesai.


In [None]:
JOB_ID = "ftjob-HcuVpGmVDMJeLwRkKmOlXFDS"
print(f"Memonitor status untuk Job ID: {JOB_ID}")

while True:
    try:
        # Cek status terbaru ke OpenAI
        job_status = client.fine_tuning.jobs.retrieve(JOB_ID)
        status = job_status.status
        print(f"Status saat ini: {status}")

        if status == 'succeeded':
            fine_tuned_model_id = job_status.fine_tuned_model
            print(f"\n[SUKSES] Nama Model Baru: {fine_tuned_model_id}")
            break
        elif status in ['failed', 'cancelled']:
            print(f"\n[BERHENTI] Job status: {status}")
            break

    except APIConnectionError:
        print("Gangguan koneksi dideteksi. Mencoba kembali dalam 60 detik...")
    except Exception as e:
        print(f"Terjadi error lain: {e}")
        break

    time.sleep(60)

Memonitor status untuk Job ID: ftjob-HcuVpGmVDMJeLwRkKmOlXFDS
Status saat ini: validating_files
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Status saat ini: running
Stat

In [10]:
job_details = client.fine_tuning.jobs.retrieve(JOB_ID )
print(job_details.hyperparameters)

Hyperparameters(batch_size=5, learning_rate_multiplier=1.8, n_epochs=3)


In [None]:
# ==========================================
# 1. KONFIGURASI
# ==========================================

# Inisialisasi Client
client = OpenAI(api_key=MY_API_KEY)

# ==========================================
# 2. PERSIAPAN DATA TEST
# ==========================================
try:
    if 'test_df' not in locals():
        print("Mencoba memuat data test dari 'sentiment_test.csv'...")
        test_df = pd.read_csv("sentiment_test.csv")
    else:
        print("Menggunakan data test yang sudah ada di memori.")

    print(f"Jumlah Data Test: {len(test_df)}")
except FileNotFoundError:
    print("[ERROR] File 'sentiment_test.csv' tidak ditemukan. Pastikan Anda sudah menjalankan tahap split data.")

# ==========================================
# 3. FUNGSI EVALUASI
# ==========================================
def run_evaluation_now(client, model_id, dataframe):
    print(f"\n--- MEMULAI EVALUASI PADA MODEL: {model_id} ---")
    y_true = []
    y_pred = []

    # Loop data
    for idx, row in dataframe.iterrows():
        try:
            # Ambil teks (prioritaskan cleaned_text, fallback ke Sentence)
            text_input = row.get("cleaned_text", row.get("Sentence"))

            # Request ke OpenAI
            resp = client.chat.completions.create(
                model=model_id,
                messages=[
                    {"role": "system", "content": "Kamu adalah model analisis sentimen saham Indonesia. Jawab hanya dengan satu label: Positive, Negative, atau Neutral."},
                    {"role": "user", "content": text_input}
                ],
                temperature=0
            )
            prediction = resp.choices[0].message.content.strip()

            # Simpan
            y_true.append(row["Sentiment"])
            y_pred.append(prediction)

            # Tampilkan progress agar tidak dikira macet
            print(f"[{idx+1}/{len(dataframe)}] Asli: {row['Sentiment']} | Prediksi: {prediction}")

        except Exception as e:
            print(f"[ERROR] Baris {idx}: {e}")
            # Jika errornya karena Model ID salah, loop akan berhenti
            if "model" in str(e) and "not found" in str(e):
                print(">>> STOP: Model ID tidak ditemukan. Periksa kembali MY_MODEL_ID Anda.")
                return

    # Tampilkan Report
    print("\n" + "="*40)
    print("HASIL AKHIR (CLASSIFICATION REPORT)")
    print("="*40)
    print(classification_report(y_true, y_pred, zero_division=0))

# ==========================================
# 4. EKSEKUSI LANGSUNG
# ==========================================
# Kita panggil fungsinya langsung tanpa IF/ELSE
if 'test_df' in locals():
    run_evaluation_now(client, MY_MODEL_ID, test_df)
else:
    print("Gagal menjalankan evaluasi karena data test tidak ditemukan.")

Menggunakan data test yang sudah ada di memori.
Jumlah Data Test: 315

--- MEMULAI EVALUASI PADA MODEL: ft:gpt-4o-mini-2024-07-18:personal:saham-indo-v1:CxlsbcSP ---
[1985/315] Asli: Negative | Prediksi: Negative
[845/315] Asli: Negative | Prediksi: Negative
[1940/315] Asli: Positive | Prediksi: Positive
[2314/315] Asli: Positive | Prediksi: Positive
[553/315] Asli: Negative | Prediksi: Negative
[2237/315] Asli: Neutral | Prediksi: Neutral
[1002/315] Asli: Positive | Prediksi: Positive
[949/315] Asli: Neutral | Prediksi: Positive
[3016/315] Asli: Negative | Prediksi: Negative
[924/315] Asli: Neutral | Prediksi: Neutral
[3004/315] Asli: Positive | Prediksi: Positive
[1791/315] Asli: Positive | Prediksi: Positive
[1508/315] Asli: Positive | Prediksi: Positive
[2505/315] Asli: Positive | Prediksi: Positive
[145/315] Asli: Positive | Prediksi: Positive
[2171/315] Asli: Positive | Prediksi: Positive
[41/315] Asli: Negative | Prediksi: Negative
[2220/315] Asli: Positive | Prediksi: Positive
