<a href="https://colab.research.google.com/github/revyellans/UAPML/blob/main/UAPML_pretrained_(BERT)_revy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import pandas as pd
import re
import numpy as np
import torch

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)

from torch.utils.data import Dataset, DataLoader

In [19]:
# ======================================================
# 1. LOAD DATA
# ======================================================

df = pd.read_csv("name_gender_dataset.csv")

# ======================================================
# 2. NORMALISASI KOLOM
# ======================================================

df.columns = df.columns.str.lower()
df = df.rename(columns={
    'nama': 'name',
    'jenis_kelamin': 'gender',
    'jk': 'gender'
})

df = df.dropna(subset=['name', 'gender'])
print("Data awal:", len(df))

Data awal: 147269


In [20]:
# ======================================================
# 3. PREPROCESSING NAMA (AMAN)
# ======================================================

def preprocess_name(text):
    text = str(text).lower().strip()
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

df['name'] = df['name'].apply(preprocess_name)
df = df[df['name'].str.len() >= 2]

In [21]:
# ======================================================
# 4. NORMALISASI GENDER (FLEKSIBEL)
# ======================================================

def normalize_gender(label):
    label = str(label).lower().strip()

    if (
        label.startswith('l') or
        label.startswith('m') or
        'male' in label or
        'pria' in label or
        'laki' in label
    ):
        return 'male'

    elif (
        label.startswith('p') or
        label.startswith('f') or
        'female' in label or
        'wanita' in label or
        'perempuan' in label
    ):
        return 'female'

    else:
        return None

df['gender'] = df['gender'].apply(normalize_gender)
df = df.dropna(subset=['gender'])

print("Data setelah preprocessing:", len(df))
print(df['gender'].value_counts())

Data setelah preprocessing: 147251
gender
female    89743
male      57508
Name: count, dtype: int64


In [22]:

# ======================================================
# 5. LABEL ENCODING
# ======================================================

le = LabelEncoder()
df['label'] = le.fit_transform(df['gender'])  # male/female â†’ 0/1

# ======================================================
# 6. SPLIT DATA
# ======================================================

X_train, X_test, y_train, y_test = train_test_split(
    df['name'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label']
)

In [23]:
# ======================================================
# 7. TOKENIZER BERT (PRETRAINED)
# ======================================================

MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

def tokenize(texts):
    return tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=16,
        return_tensors="pt"
    )

train_encodings = tokenize(X_train)
test_encodings = tokenize(X_test)

In [24]:
# ======================================================
# 8. DATASET TORCH
# ======================================================

class GenderDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels.reset_index(drop=True)

    def __getitem__(self, idx):
        item = {k: v[idx] for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = GenderDataset(train_encodings, y_train)
test_dataset = GenderDataset(test_encodings, y_test)

In [26]:
# ======================================================
# 9. LOAD MODEL BERT
# ======================================================

model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)

# ======================================================
# 10. FREEZE BERT (SUPER CEPAT)
# ======================================================

for param in model.bert.parameters():
    param.requires_grad = False

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:

# ======================================================
# 11. TRAINING ARGUMENTS (FAST)
# ======================================================

training_args = TrainingArguments(
    output_dir="./bert_output",
    eval_strategy="no",
    save_strategy="no",
    logging_strategy="steps",
    logging_steps=50,
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

# ======================================================
# 12. TRAINER
# ======================================================

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

trainer.train()



Step,Training Loss
50,0.6833
100,0.6677
150,0.6707
200,0.6653
250,0.66
300,0.6667
350,0.673
400,0.6743
450,0.666
500,0.6743


TrainOutput(global_step=7364, training_loss=0.6393364400464773, metrics={'train_runtime': 10909.4835, 'train_samples_per_second': 21.596, 'train_steps_per_second': 0.675, 'total_flos': 1452866358816000.0, 'train_loss': 0.6393364400464773, 'epoch': 2.0})

In [29]:
# ======================================================
# 13. EVALUASI MANUAL (ANTI BUG)
# ======================================================

model.eval()
test_loader = DataLoader(test_dataset, batch_size=32)

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"]
        )
        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())

print("Accuracy:", accuracy_score(all_labels, all_preds))
print(classification_report(all_labels, all_preds, target_names=le.classes_))


Accuracy: 0.6234423279345354
              precision    recall  f1-score   support

      female       0.62      0.99      0.76     17949
        male       0.80      0.05      0.09     11502

    accuracy                           0.62     29451
   macro avg       0.71      0.52      0.43     29451
weighted avg       0.69      0.62      0.50     29451



In [35]:
# ======================================================
# 14. SIMPAN MODEL & TOKENIZER
# ======================================================

model.save_pretrained("gender_name_bert_model")
tokenizer.save_pretrained("gender_name_bert_model")

print("Model & tokenizer berhasil disimpan")

Model & tokenizer berhasil disimpan


In [34]:
# ======================================================
# 15. CONTOH PREDIKSI
# ======================================================

def predict_gender(name):
    name = preprocess_name(name)
    inputs = tokenizer(name, return_tensors="pt", truncation=True, max_length=16)
    with torch.no_grad():
        outputs = model(**inputs)
    pred = torch.argmax(outputs.logits, dim=1).item()
    return le.inverse_transform([pred])[0]

contoh = ["aisyah", "william", "putri", "bara"]

for n in contoh:
    print(n, "->", predict_gender(n))

aisyah -> female
william -> female
putri -> female
bara -> female
