In [3]:
# =====================================================
# CNN TEXT CLASSIFICATION — INDONESIAN NEWS DATASET
# =====================================================

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

# =====================================================
# 1. LOAD DATASET SAFELY
# =====================================================

file_path = "/content/data.csv"   # 📝 Change to your dataset path

# Shared parser options. Note that on_bad_lines='skip' silently drops rows the
# parser cannot split, so the loaded frame may hold fewer rows than the file.
read_options = dict(engine='python', on_bad_lines='skip')

# Try UTF-8 first and fall back to Latin-1 only when the bytes are not valid
# UTF-8. Any other failure (missing file, permission error, Ctrl-C) is allowed
# to propagate instead of being hidden behind a second read attempt.
try:
    df = pd.read_csv(file_path, encoding='utf-8', **read_options)
except UnicodeDecodeError:
    df = pd.read_csv(file_path, encoding='latin1', **read_options)

print("✅ Dataset loaded successfully!")
print("Columns:", list(df.columns))
print(df.head())

# =====================================================
# 2. RENAME COLUMNS MANUALLY
# =====================================================
# 👉 Update these names based on your dataset
# Example: if your file has columns ['judul_berita', 'kategori']
# then set: text_col = 'judul_berita', label_col = 'kategori'

text_col = 'title'   # <-- change this to your text column
label_col = 'source'      # <-- change this to your label column

# Fail early with an actionable message when the configured names do not match
# the loaded file, instead of a later "['text'] not in index" error.
missing_cols = [col for col in (text_col, label_col) if col not in df.columns]
if missing_cols:
    raise KeyError(
        f"Column(s) {missing_cols} not found in dataset. "
        f"Available columns: {list(df.columns)}"
    )

# Select the two source columns BEFORE renaming, so a pre-existing column that
# happens to be called 'text' or 'label' cannot produce duplicate column names.
# Rows with a missing text or label are dropped and the index is renumbered.
df = (
    df[[text_col, label_col]]
    .rename(columns={text_col: 'text', label_col: 'label'})
    .dropna()
    .reset_index(drop=True)
)

print("\n✅ Columns renamed:")
print(df.head())

# =====================================================
# 3. CLEAN & PREPROCESS TEXT
# =====================================================

def clean_text(text):
    """Normalize a raw text value for tokenization.

    The input is coerced to ``str`` and lower-cased, every character that is
    neither an ASCII letter nor whitespace is deleted (not replaced by a
    space), runs of whitespace are collapsed to a single space, and leading
    and trailing whitespace is stripped.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    cleaned = str(text).lower()
    # Substitutions are applied in order: strip non-letters, then squeeze spaces.
    substitutions = (
        (r'[^a-zA-Z\s]', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Run clean_text over every entry of the text column, then preview three
# randomly chosen rows to spot-check the result.
df['text'] = df['text'].map(clean_text)
print("\n🧹 Cleaned text sample:")
print(df.sample(3))

# =====================================================
# 4. ENCODE LABELS
# =====================================================

# Map each distinct label to an integer id; the id is the label's position in
# label_encoder.classes_.
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])
num_classes = len(label_encoder.classes_)

print("\n📊 Label Summary:")
# One pass over the encoded ids gives the per-class counts, indexed by id,
# which lines up with the order of label_encoder.classes_.
class_counts = np.bincount(df['label_encoded'], minlength=num_classes)
for label, count in zip(label_encoder.classes_, class_counts):
    print(f"{label:20s} -> {count} samples")

# =====================================================
# 5. TRAIN-TEST SPLIT
# =====================================================

# Hold out 20% of the rows for testing. The split is seeded for repeatability
# and stratified on the encoded label so both parts keep the class proportions.
split_options = dict(
    test_size=0.2,
    random_state=42,
    stratify=df['label_encoded'],
)
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label_encoded'], **split_options
)

# =====================================================
# 6. TOKENIZATION & PADDING
# =====================================================

max_words = 10000   # Vocabulary size
max_len = 200       # Sequence length

# The vocabulary is learned from the training texts only; the test texts are
# merely transformed with it.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Texts -> lists of integer word ids.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad/truncate at the END of each sequence so every row has exactly max_len ids.
pad_options = dict(maxlen=max_len, padding='post', truncating='post')
X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_test_pad = pad_sequences(X_test_seq, **pad_options)

# Integer class ids -> one-hot rows of width num_classes.
y_train_cat, y_test_cat = (
    to_categorical(labels, num_classes=num_classes) for labels in (y_train, y_test)
)

print("\n✅ Tokenization complete!")
print(f"Train shape: {X_train_pad.shape}, Test shape: {X_test_pad.shape}")

# =====================================================
# 7. BUILD CNN MODEL
# =====================================================

# Architecture: word embeddings -> one 1D convolution over 5-token windows ->
# max over the sequence axis -> dropout -> small dense head -> softmax over
# the num_classes labels.
# The Embedding layer no longer receives `input_length`: Keras 3 deprecates
# that argument and ignores it (emitting a warning). The sequence length is
# supplied through model.build() below instead.
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax')
])

# categorical_crossentropy matches the one-hot targets produced by to_categorical.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Build explicitly with (batch, max_len) inputs so summary() can report output
# shapes and parameter counts before the first call to fit().
model.build(input_shape=(None, max_len))
model.summary()

# =====================================================
# 8. TRAIN MODEL
# =====================================================

# Stop once val_loss has failed to improve for 2 consecutive epochs and roll
# the model back to the weights of the best epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# validation_split=0.2 reserves a fifth of the training arrays for validation;
# the remaining samples are used for the (at most 6) training epochs.
fit_options = dict(
    epochs=6,
    batch_size=64,
    validation_split=0.2,
    verbose=1,
    callbacks=[early_stop],
)
history = model.fit(X_train_pad, y_train_cat, **fit_options)

# =====================================================
# 9. EVALUATE MODEL
# =====================================================

# evaluate() returns the loss followed by the compiled metrics (accuracy only).
test_scores = model.evaluate(X_test_pad, y_test_cat, verbose=1)
loss, accuracy = test_scores
print(f"\n🎯 Test Accuracy: {accuracy * 100:.2f}%")

# =====================================================
# 10. TEST A SAMPLE SENTENCE
# =====================================================

sample_text = ["Jokowi meresmikan proyek pembangunan di Jakarta"]

# Inference must mirror the training pipeline exactly:
#   1. the same clean_text normalization that was applied to the training texts,
#   2. the same 'post' padding/truncation. pad_sequences defaults to 'pre',
#      which would place the tokens at the opposite end of the sequence from
#      where the model saw them during training.
seq = tokenizer.texts_to_sequences([clean_text(text) for text in sample_text])
pad = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')
pred = model.predict(pad)

# argmax per row, so this stays correct if sample_text holds several sentences.
pred_label = label_encoder.inverse_transform(np.argmax(pred, axis=1))

print(f"\n📰 Sample Prediction: {sample_text[0]}")
print(f"Predicted Label: {pred_label[0]}")

✅ Dataset loaded successfully!
Columns: ['id', 'source', 'title', 'image', 'url', 'content', 'date', 'embedding', 'created_at', 'updated_at', 'summary']
   id source                                              title  \
0  83  tempo  Depo Plumpang Terbakar, Anggota DPR Minta Pert...   
1  84  tempo  Jokowi Perintahkan Wapres Ma'ruf Amin Tinjau L...   
2  85  tempo  HNW Mendukung Jamaah Umroh First Travel Dapatk...   
3  86  tempo  Tim Dokkes Polri Telah Terima 14 Kantong Jenaz...   
4  87  tempo  Bamsoet Ajak Komunitas Otomotif Kembangkan Per...   

                                               image  \
0  https://statik.tempo.co/data/2023/03/04/id_118...   
1  https://statik.tempo.co/data/2023/03/04/id_118...   
2  https://statik.tempo.co/data/2023/03/04/id_118...   
3  https://statik.tempo.co/data/2023/03/04/id_118...   
4  https://statik.tempo.co/data/2023/03/04/id_118...   

                                                 url  \
0  https://nasional.tempo.co/read/1698528/depo-pl..



Epoch 1/6
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 277ms/step - accuracy: 0.3523 - loss: 1.5695 - val_accuracy: 0.5909 - val_loss: 1.3711
Epoch 2/6
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 187ms/step - accuracy: 0.6002 - loss: 1.2479 - val_accuracy: 0.5909 - val_loss: 1.1472
Epoch 3/6
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 213ms/step - accuracy: 0.6137 - loss: 1.0527 - val_accuracy: 0.5909 - val_loss: 1.1971
Epoch 4/6
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 311ms/step - accuracy: 0.5925 - loss: 1.0407 - val_accuracy: 0.5909 - val_loss: 1.1578
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.6301 - loss: 1.0316

🎯 Test Accuracy: 60.91%
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 161ms/step

📰 Sample Prediction: Jokowi meresmikan proyek pembangunan di Jakarta
Predicted Label: tempo
