<a href="https://colab.research.google.com/github/revyellans/UAPML/blob/main/UAPML_non_pretrained_revy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
df = pd.read_csv("name_gender_dataset.csv")

# Normalisasi nama kolom
df.columns = df.columns.str.lower()
df = df.rename(columns={
    'nama': 'name',
    'jenis_kelamin': 'gender',
    'jk': 'gender'
})

print("Data awal:", len(df))


Data awal: 147269


In [3]:
df = df.dropna(subset=['name', 'gender'])
print("Setelah drop NA:", len(df))

Setelah drop NA: 147269


In [4]:
# =========================================
# 3. PREPROCESSING TEKS NAMA
# =========================================

def preprocess_name(text):
    text = str(text).lower().strip()
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)   # ganti simbol → spasi
    text = re.sub(r'\s+', ' ', text)           # rapikan spasi
    return text


df['name'] = df['name'].apply(preprocess_name)

# Hapus hanya jika BENAR-BENAR kosong
df = df[df['name'].str.len() >= 2]

print("Setelah preprocessing nama:", len(df))


Setelah preprocessing nama: 147251


In [5]:
# =========================================
# 4. NORMALISASI LABEL GENDER
# =========================================

def normalize_gender(label):
    label = str(label).lower().strip()

    if label.startswith('l') or label.startswith('m'):
        return 'male'
    elif label.startswith('p') or label.startswith('f'):
        return 'female'
    else:
        return None


In [6]:
# =========================================
# 5. ENCODING LABEL
# =========================================

le = LabelEncoder()
y = le.fit_transform(df['gender'])  # male=1, female=0 (atau sebaliknya)


In [7]:
# =========================================
# 6. TOKENISASI TEKS
# =========================================

VOCAB_SIZE = 100
MAX_LEN = 20

tokenizer = Tokenizer(char_level=True, lower=True)
tokenizer.fit_on_texts(df['name'])

X_seq = tokenizer.texts_to_sequences(df['name'])
X_pad = pad_sequences(X_seq, maxlen=MAX_LEN, padding='post')

In [8]:
print("Jumlah data setelah preprocessing:", len(df))
print(df['gender'].value_counts())

Jumlah data setelah preprocessing: 147251
gender
F    89743
M    57508
Name: count, dtype: int64


In [9]:
# =========================================
# 7. SPLIT DATA
# =========================================

X_train, X_test, y_train, y_test = train_test_split(
    X_pad, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [10]:
# =========================================
# 8. MODEL RNN (NON-PRETRAINED)
# =========================================

model = Sequential([
    Embedding(
        input_dim=VOCAB_SIZE,
        output_dim=32,
        input_length=MAX_LEN
    ),
    SimpleRNN(
        64,
        activation='tanh'
    ),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()



In [11]:
# =========================================
# 9. TRAINING
# =========================================

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

model.fit(
    X_train, y_train,
    epochs=30,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stop]
)

Epoch 1/30
[1m3314/3314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 8ms/step - accuracy: 0.6967 - loss: 0.5789 - val_accuracy: 0.7684 - val_loss: 0.4942
Epoch 2/30
[1m3314/3314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 7ms/step - accuracy: 0.7626 - loss: 0.4998 - val_accuracy: 0.7854 - val_loss: 0.4762
Epoch 3/30
[1m3314/3314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 7ms/step - accuracy: 0.7761 - loss: 0.4807 - val_accuracy: 0.7845 - val_loss: 0.4623
Epoch 4/30
[1m3314/3314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 7ms/step - accuracy: 0.7781 - loss: 0.4728 - val_accuracy: 0.7921 - val_loss: 0.4562
Epoch 5/30
[1m3314/3314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 7ms/step - accuracy: 0.7855 - loss: 0.4640 - val_accuracy: 0.7943 - val_loss: 0.4524
Epoch 6/30
[1m3314/3314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 7ms/step - accuracy: 0.7891 - loss: 0.4569 - val_accuracy: 0.7963 - val_loss: 0.4474
Epoch 7/30

<keras.src.callbacks.history.History at 0x7968eaa4f470>

In [15]:
# =========================================
# 10. EVALUASI
# =========================================

loss, acc = model.evaluate(X_test, y_test)
print("Test Accuracy:", acc)

[1m921/921[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7925 - loss: 0.4490
Test Accuracy: 0.7945061326026917


In [16]:
# =========================================
# 11. SIMPAN MODEL & TOKENIZER
# =========================================

model.save("gender_name_rnn_model.h5")

import pickle
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

print("Model & tokenizer berhasil disimpan")



Model & tokenizer berhasil disimpan


In [17]:
# =========================================
# 12. CONTOH PREDIKSI
# =========================================

def predict_gender(nama):
    nama = preprocess_name(nama)
    seq = tokenizer.texts_to_sequences([nama])
    pad = pad_sequences(seq, maxlen=MAX_LEN, padding='post')
    pred = model.predict(pad)[0][0]
    return "male" if pred >= 0.5 else "female"

contoh = ["aisyah", "fathur", "putri", "william"]

for n in contoh:
    print(n, "->", predict_gender(n))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
aisyah -> female
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
fathur -> male
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
putri -> female
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
william -> male
