In [None]:
# Cell 1: Connect Google Drive
from google.colab import drive

# Mount point under which the Drive contents are exposed; the zip path used
# by the next cell lives below this directory.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Cell 2: Unzip the dataset
import os
import zipfile

# ==========================================================
# !!! CHANGE THE PATH BELOW !!!
# Replace it with the path you copied from your own Google Drive
zip_path = '/content/drive/MyDrive/iris kaggle clash/kaggle-clash-4.zip'
# ==========================================================

extract_dir = "/content/dataset/"

# exist_ok=True keeps the cell safe to re-run when the folder already exists.
os.makedirs(extract_dir, exist_ok=True)

# Extract with the standard library instead of `!unzip`:
#   * a missing or corrupt archive raises immediately instead of only printing
#     a shell error while the cell carries on and reports success;
#   * re-running the cell overwrites the files rather than stopping at the
#     interactive "replace?" prompt that `unzip` shows without `-o`.
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(extract_dir)

# Global paths used by the following cells
train_csv_path = os.path.join(extract_dir, "train.csv")
test_csv_path = os.path.join(extract_dir, "test.csv")
images_dir = os.path.join(extract_dir, "images")

print(f"Data siap di {extract_dir}")
print(f"Folder images ada di: {images_dir}")

Data siap di /content/dataset/
Folder images ada di: /content/dataset/images


In [None]:
# Cell 3: Instalasi Library
!pip install transformers tensorflow pillow scikit-learn tqdm



In [None]:
# Cell 4: Import Semua Library
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
import tensorflow as tf

# Untuk Data Splitting
from sklearn.model_selection import train_test_split

# Untuk Model CLIP (Ekstraksi Fitur)
from transformers import TFCLIPModel, CLIPProcessor
from PIL import Image

# Untuk Model Kepala (Klasifikasi)
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Untuk Evaluasi
from sklearn.metrics import classification_report

In [None]:
# Cell 5: Load the data & split it
print("--- Memuat & Membagi Data ---")

# Read the training table and replace missing sentences with empty strings.
df_train = pd.read_csv(train_csv_path).assign(
    sentence=lambda frame: frame['sentence'].fillna('')
)

# Split the whole dataframe: 20% goes to validation, the seed is fixed for
# repeatability, and the split is stratified on the label because the
# classes are imbalanced.
split_options = dict(
    test_size=0.2,
    random_state=42,
    stratify=df_train['label'],
)
train_df, val_df = train_test_split(df_train, **split_options)

# Target arrays for the classifier head
y_train_array = train_df['label'].to_numpy(copy=True)
y_val_array = val_df['label'].to_numpy(copy=True)

print(f"Data latih (train_df): {len(train_df)} baris")
print(f"Data validasi (val_df): {len(val_df)} baris")

--- Memuat & Membagi Data ---
Data latih (train_df): 444 baris
Data validasi (val_df): 111 baris


In [None]:
# Cell 6: Load the CLIP model & the extraction function
print("--- Memuat Model CLIP & Fungsi ---")
# Hugging Face checkpoint name shared by the processor and the model below.
clip_model_name = "openai/clip-vit-base-patch32"

# The processor handles text tokenization AND image preprocessing
processor = CLIPProcessor.from_pretrained(clip_model_name)

# Load the main model (TensorFlow variant of CLIP).
# NOTE(review): use_safetensors=False presumably forces loading from the
# tf_model.h5 weights rather than a safetensors file - confirm it is still needed.
clip_model = TFCLIPModel.from_pretrained(clip_model_name, use_safetensors=False)

# Freeze CLIP: it is only used as a feature extractor, never fine-tuned here
clip_model.trainable = False

# Define the extraction function (it is reused three times: train, validation, test)
def extract_clip_features(df):
    """Embed each row's sentence and image with the global CLIP model.

    Reads the ``sentence`` and ``image_path`` columns of ``df``. Image paths
    are resolved relative to the global ``images_dir``. The function relies on
    the globals ``processor`` and ``clip_model`` created earlier in this cell.

    An image that cannot be opened or decoded is replaced by a black 224x224
    RGB image so that the row still gets an embedding; the number of such
    replacements is reported once the loop has finished.

    Args:
        df: DataFrame with a ``sentence`` (text) and an ``image_path`` column.

    Returns:
        Tuple ``(text_features, image_features)`` of 2-D NumPy arrays holding
        one row per row of ``df``, in iteration order. For an empty ``df``
        both arrays have zero rows.
    """
    # np.vstack([]) raises on an empty list, so answer the empty case directly.
    if len(df) == 0:
        embed_dim = clip_model.config.projection_dim
        return (np.empty((0, embed_dim), dtype=np.float32),
                np.empty((0, embed_dim), dtype=np.float32))

    text_features_list = []
    image_features_list = []
    unreadable_images = []

    # Show a progress bar with tqdm
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Ekstraksi CLIP"):
        text = row['sentence']
        image_path = os.path.join(images_dir, row['image_path'])

        try:
            # Image.open only reads the header; convert() forces the pixel data
            # to be decoded here, so a truncated/corrupt file is caught by this
            # handler instead of crashing later inside the processor. The
            # context manager closes the underlying file handle.
            with Image.open(image_path) as image_file:
                image = image_file.convert('RGB')
        except Exception:
            # Missing or damaged image: fall back to a black image, but remember it
            unreadable_images.append(image_path)
            image = Image.new('RGB', (224, 224))

        # The CLIP processor prepares both the text and the image
        inputs = processor(text=[text], images=image, return_tensors="tf", padding=True, truncation=True, max_length=77)

        # Run the model and keep the projected embeddings as NumPy arrays
        outputs = clip_model(inputs)
        text_features_list.append(np.asarray(outputs.text_embeds))
        image_features_list.append(np.asarray(outputs.image_embeds))

    # Make the fallback visible instead of silently training on black images
    if unreadable_images:
        print(f"Peringatan: {len(unreadable_images)} gambar tidak dapat dibaca dan diganti "
              f"dengan gambar hitam (contoh: {unreadable_images[0]})")

    return np.vstack(text_features_list), np.vstack(image_features_list)

print("Model CLIP dan fungsi 'extract_clip_features' siap.")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


--- Memuat Model CLIP & Fungsi ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/606M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All model checkpoint layers were used when initializing TFCLIPModel.

All the layers of TFCLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCLIPModel for predictions without further training.


Model CLIP dan fungsi 'extract_clip_features' siap.


In [None]:
# Cell 7: Feature extraction (train & validation)
print("Memulai ekstraksi fitur data LATIH (train_df)...")
train_embeddings = extract_clip_features(train_df)
text_features_train, image_features_train = train_embeddings

print("\nMemulai ekstraksi fitur data VALIDASI (val_df)...")
val_embeddings = extract_clip_features(val_df)
text_features_val, image_features_val = val_embeddings

print("Ekstraksi Fitur Latih & Validasi Selesai.")

Memulai ekstraksi fitur data LATIH (train_df)...


Ekstraksi CLIP:   0%|          | 0/444 [00:00<?, ?it/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.



Memulai ekstraksi fitur data VALIDASI (val_df)...


Ekstraksi CLIP:   0%|          | 0/111 [00:00<?, ?it/s]

Ekstraksi Fitur Latih & Validasi Selesai.


In [None]:
# Cell 8: Feature engineering (building the interaction features)
print("--- Membuat Fitur Super (Interaksi) ---")


def _stack_with_interaction(text_feats, image_feats):
    """Return ``(interaction, stacked)`` for one data split.

    ``interaction`` is the element-wise product of the text and image
    features; ``stacked`` is text, image and interaction joined column-wise.
    """
    interaction = text_feats * image_feats
    stacked = np.concatenate([text_feats, image_feats, interaction], axis=1)
    return interaction, stacked


# Training split
interaction_features_train, X_train_final = _stack_with_interaction(
    text_features_train, image_features_train
)

# Validation split
interaction_features_val, X_val_final = _stack_with_interaction(
    text_features_val, image_features_val
)

print(f"Bentuk Fitur Super Latih: {X_train_final.shape}")
print(f"Bentuk Fitur Super Validasi: {X_val_final.shape}")

--- Membuat Fitur Super (Interaksi) ---
Bentuk Fitur Super Latih: (444, 1536)
Bentuk Fitur Super Validasi: (111, 1536)


In [None]:
# Cell 9: Train the best "head" model
print("--- Melatih Kepala Model Terbaik (LogReg C=0.01) ---")

# The head is a two-step pipeline: feature standardisation, then a
# class-balanced logistic regression. Per the original author's note,
# C=0.01 was the strongest anti-overfitting setting found.
feature_scaler = StandardScaler()
logreg_head = LogisticRegression(
    C=0.01,
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
clf_head_best = Pipeline(steps=[
    ('scaler', feature_scaler),
    ('clf', logreg_head),
])

# Fit the head on the combined ("super") features
clf_head_best.fit(X_train_final, y_train_array)

print("Model 'clf_head_best' berhasil dilatih!")

--- Melatih Kepala Model Terbaik (LogReg C=0.01) ---
Model 'clf_head_best' berhasil dilatih!


In [None]:
# Cell 10: Evaluate on the validation data
print("--- Mengevaluasi Model di Data Validasi ---")
y_preds_val = clf_head_best.predict(X_val_final)

# Display names passed to the report for the two classes
class_display_names = ['Non-Offensive (0)', 'Offensive (1)']

print("\nLaporan Klasifikasi (di Data Validasi):")
validation_report = classification_report(
    y_val_array,
    y_preds_val,
    target_names=class_display_names,
)
print(validation_report)

--- Mengevaluasi Model di Data Validasi ---

Laporan Klasifikasi (di Data Validasi):
                   precision    recall  f1-score   support

Non-Offensive (0)       0.69      0.71      0.70        66
    Offensive (1)       0.56      0.53      0.55        45

         accuracy                           0.64       111
        macro avg       0.62      0.62      0.62       111
     weighted avg       0.64      0.64      0.64       111



In [None]:
# Cell 11: Prepare the test data
print("--- Mempersiapkan Data Tes ---")

# 1. Load the test table and blank out missing sentences
print("Memuat test.csv...")
df_test = pd.read_csv(test_csv_path).assign(
    sentence=lambda frame: frame['sentence'].fillna('')
)

# 2. Extract the CLIP features for the test rows
print("Mengekstrak fitur data TEST (df_test)...")
test_embeddings = extract_clip_features(df_test)
text_features_test, image_features_test = test_embeddings

# 3. Build the same text | image | text*image layout used for training
print("Membuat Fitur Super untuk data TEST...")
interaction_features_test = np.multiply(text_features_test, image_features_test)
test_feature_parts = (text_features_test, image_features_test, interaction_features_test)
X_test_final_submission = np.hstack(test_feature_parts)

print(f"Bentuk Fitur Super Tes: {X_test_final_submission.shape}")
print("Data tes siap untuk diprediksi.")

--- Mempersiapkan Data Tes ---
Memuat test.csv...
Mengekstrak fitur data TEST (df_test)...


Ekstraksi CLIP:   0%|          | 0/185 [00:00<?, ?it/s]

Membuat Fitur Super untuk data TEST...
Bentuk Fitur Super Tes: (185, 1536)
Data tes siap untuk diprediksi.


In [None]:
# Cell 12: Create the submission file
print("--- Membuat File Submission ---")

# Predict the test labels with the trained head
final_predictions = clf_head_best.predict(X_test_final_submission)

# Two-column table: the image path from the test set plus its predicted label
submission = df_test[['image_path']].assign(label=final_predictions)

# Write the table to CSV without the index column
file_submission = 'submission_MODEL_TERBAIK_0.60.csv'
submission.to_csv(file_submission, index=False)

print(f"\nFile submission FINAL berhasil dibuat: {file_submission}")
print("Selesai! File ini siap untuk dikirim.")
print("\nContoh isi file submission:")
submission_preview = submission.head()
print(submission_preview)