# CBOW Berita

In [1]:
# Install gensim by running pip as a module of sys.executable (the Python
# interpreter executing this notebook), so the package is installed for the
# same interpreter that will later import it.
import sys
!{sys.executable} -m pip install gensim



In [2]:
# Install pandas with the same interpreter-targeted pip invocation.
# NOTE: relies on `sys` having been imported by the earlier install cell.
!{sys.executable} -m pip install pandas



In [3]:
import pandas as pd
import numpy as np
import ast # Library untuk mengubah string menjadi list
from gensim.models import Word2Vec
import logging

# --- 1. Corpus preparation ---
print("--- TAHAP 1: MEMPERSIAPKAN CORPUS ---")
# Reload the preprocessed data from disk (or continue from an existing DataFrame).
df = pd.read_csv('hasil_preprocessing_berita.csv')

# --- Convert the 'hasil_preprocessing' column from string to list ---
# This step matters: each cell is a string such as "['a', 'b']", and
# ast.literal_eval parses it back into the list ['a', 'b'].
token_strings = df['hasil_preprocessing']
df['tokens'] = token_strings.apply(ast.literal_eval)

# Build the training corpus: one token list per document.
corpus = df['tokens'].tolist()


print(
    "Proses persiapan corpus selesai.",
    "Berikut adalah contoh 1 dokumen (berita) yang sudah diubah menjadi daftar token:",
    sep="\n",
)
first_document = corpus[0]
print(first_document)

--- TAHAP 1: MEMPERSIAPKAN CORPUS ---


Proses persiapan corpus selesai.
Berikut adalah contoh 1 dokumen (berita) yang sudah diubah menjadi daftar token:
['sidoarjobangsaonlinecom', 'proses', 'cari', 'korban', 'santri', 'timbun', 'runtuh', 'musala', 'ambruk', 'pondok', 'pesantren', 'ponpes', 'alkhoziny', 'budur', 'sidoarjo', 'rabu', 'siang', 'santri', 'enggan', 'nama', 'aku', 'santri', 'cor', 'bangun', 'hukum', 'santri', 'ikut', 'giat', 'ambruk', 'pas', 'salat', 'jemaah', 'imam', 'selamat', 'temanteman', 'timpa', 'santri', 'rabu', 'tim', 'sar', 'gabung', 'jibaku', 'sisir', 'puingpuing', 'bangun', 'lantai', 'total', 'ratus', 'santri', 'hasil', 'evakuasi', 'cari', 'musibah', 'milu', 'selip', 'kisah', 'hari', 'santri', 'pondok', 'salah', 'satu', 'biasa', 'beri', 'hukum', 'ikut', 'giat', 'pesantren', 'hukum', 'bantu', 'cor', 'bangun', 'santri', 'tahu', 'bolos', 'hukum', 'bantu', 'ngecor', 'tukang', 'santri', 'wajib', 'suruh', 'bantu', 'kena', 'hukum', 'santri', 'enam', 'mondok', 'untung', 'musala', 'ambruk', 'lokasi', 'pondok', 

In [4]:
# --- 2. Train the Word2Vec model (with visible progress) ---

print("--- TAHAP 2: MELATIH MODEL WORD2VEC (CBOW) ---")

# Enable INFO-level logging so Gensim's training progress shows up in the cell output.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

embedding_dim = 150
print(f"Parameter: Dimensi vektor = {embedding_dim}, Arsitektur = CBOW")
print("Gensim akan menampilkan log proses training di bawah ini:")

model_cbow = Word2Vec(
    sentences=corpus,
    vector_size=embedding_dim,
    window=5,
    min_count=2,   # tokens seen fewer than 2 times are dropped from the vocabulary
    sg=0,          # sg=0 selects the CBOW architecture
    workers=4      # number of training worker threads
)

print("\nPelatihan model selesai!")
print("\n--- Melihat Hasil Pelatihan Model ---")
# Size of the vocabulary the model ended up with.
vocab_size = len(model_cbow.wv.index_to_key)
print(f"Model berhasil mempelajari {vocab_size} kata unik.")

# Show the nearest neighbours of a few probe words as a sanity check that the
# model picked up context. Each word is checked on its own, so one
# out-of-vocabulary word no longer prevents the remaining words from being
# queried, and no header is printed for a word that cannot be looked up.
for probe_word in ('polisi', 'surabaya'):
    if probe_word not in model_cbow.wv:
        print(f"\nKata '{probe_word}' tidak ditemukan di vocabulary (mungkin karena jarang muncul).")
        continue
    print(f"\nContoh kata yang paling mirip dengan '{probe_word}':")
    print(model_cbow.wv.most_similar(probe_word, topn=5))

print("\n" + "="*50 + "\n")

2025-10-02 10:34:14,725 : INFO : collecting all words and their counts


2025-10-02 10:34:14,725 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


2025-10-02 10:34:14,757 : INFO : collected 18096 word types from a corpus of 207077 raw words and 925 sentences


2025-10-02 10:34:14,759 : INFO : Creating a fresh vocabulary


2025-10-02 10:34:14,795 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 10268 unique words (56.74% of original 18096, drops 7828)', 'datetime': '2025-10-02T10:34:14.795250', 'gensim': '4.3.3', 'python': '3.10.6 (tags/v3.10.6:9c7b4bd, Aug  1 2022, 21:53:49) [MSC v.1932 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'prepare_vocab'}


2025-10-02 10:34:14,796 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 199249 word corpus (96.22% of original 207077, drops 7828)', 'datetime': '2025-10-02T10:34:14.796250', 'gensim': '4.3.3', 'python': '3.10.6 (tags/v3.10.6:9c7b4bd, Aug  1 2022, 21:53:49) [MSC v.1932 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'prepare_vocab'}


2025-10-02 10:34:14,846 : INFO : deleting the raw counts dictionary of 18096 items


2025-10-02 10:34:14,847 : INFO : sample=0.001 downsamples 16 most-common words


2025-10-02 10:34:14,848 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 196640.93131086108 word corpus (98.7%% of prior 199249)', 'datetime': '2025-10-02T10:34:14.848758', 'gensim': '4.3.3', 'python': '3.10.6 (tags/v3.10.6:9c7b4bd, Aug  1 2022, 21:53:49) [MSC v.1932 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'prepare_vocab'}


--- TAHAP 2: MELATIH MODEL WORD2VEC (CBOW) ---
Parameter: Dimensi vektor = 150, Arsitektur = CBOW
Gensim akan menampilkan log proses training di bawah ini:


2025-10-02 10:34:14,931 : INFO : estimated required memory for 10268 words and 150 dimensions: 17455600 bytes


2025-10-02 10:34:14,932 : INFO : resetting layer weights


2025-10-02 10:34:14,942 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2025-10-02T10:34:14.942546', 'gensim': '4.3.3', 'python': '3.10.6 (tags/v3.10.6:9c7b4bd, Aug  1 2022, 21:53:49) [MSC v.1932 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'build_vocab'}


2025-10-02 10:34:14,943 : INFO : Word2Vec lifecycle event {'msg': 'training model with 4 workers on 10268 vocabulary and 150 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2025-10-02T10:34:14.943542', 'gensim': '4.3.3', 'python': '3.10.6 (tags/v3.10.6:9c7b4bd, Aug  1 2022, 21:53:49) [MSC v.1932 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'train'}


2025-10-02 10:34:15,139 : INFO : EPOCH 0: training on 207077 raw words (196648 effective words) took 0.2s, 1019865 effective words/s


2025-10-02 10:34:15,299 : INFO : EPOCH 1: training on 207077 raw words (196662 effective words) took 0.2s, 1268102 effective words/s


2025-10-02 10:34:15,454 : INFO : EPOCH 2: training on 207077 raw words (196608 effective words) took 0.2s, 1297739 effective words/s


2025-10-02 10:34:15,616 : INFO : EPOCH 3: training on 207077 raw words (196688 effective words) took 0.2s, 1242379 effective words/s


2025-10-02 10:34:15,774 : INFO : EPOCH 4: training on 207077 raw words (196725 effective words) took 0.2s, 1283448 effective words/s


2025-10-02 10:34:15,774 : INFO : Word2Vec lifecycle event {'msg': 'training on 1035385 raw words (983331 effective words) took 0.8s, 1184470 effective words/s', 'datetime': '2025-10-02T10:34:15.774951', 'gensim': '4.3.3', 'python': '3.10.6 (tags/v3.10.6:9c7b4bd, Aug  1 2022, 21:53:49) [MSC v.1932 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'train'}


2025-10-02 10:34:15,775 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=10268, vector_size=150, alpha=0.025>', 'datetime': '2025-10-02T10:34:15.775961', 'gensim': '4.3.3', 'python': '3.10.6 (tags/v3.10.6:9c7b4bd, Aug  1 2022, 21:53:49) [MSC v.1932 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'created'}



Pelatihan model selesai!

--- Melihat Hasil Pelatihan Model ---
Model berhasil mempelajari 10268 kata unik.

Contoh kata yang paling mirip dengan 'polisi':
[('tindak', 0.9989293217658997), ('lari', 0.9984794855117798), ('keluh', 0.9983461499214172), ('polsek', 0.9983104467391968), ('tahap', 0.9979819059371948)]

Contoh kata yang paling mirip dengan 'surabaya':
[('mojokerto', 0.9912481307983398), ('pacet', 0.9809286594390869), ('pn', 0.9808470010757446), ('jombang', 0.9805688858032227), ('wakil', 0.9728435277938843)]




In [5]:
# --- 3. Dissecting how word vectors are aggregated into a document vector ---
print("--- TAHAP 3: MEMBEDAH PROSES AGREGRASI MENJADI VEKTOR DOKUMEN ---")

def create_document_vector(doc, model, num_features):
    """Average the word vectors of a document into a single vector.

    Args:
        doc: Iterable of tokens making up one document.
        model: Object whose ``wv`` attribute supports ``token in wv`` and
            ``wv[token]`` lookups.
        num_features: Length of the zero vector returned when no token of
            ``doc`` is found in ``model.wv``.

    Returns:
        The element-wise mean (``axis=0``) of the vectors of all tokens found
        in ``model.wv``, or ``np.zeros(num_features)`` if none were found.
    """
    found_vectors = []
    for token in doc:
        # Tokens missing from the model's vocabulary are skipped.
        if token in model.wv:
            found_vectors.append(model.wv[token])

    if len(found_vectors) == 0:
        return np.zeros(num_features)

    return np.mean(found_vectors, axis=0)

# Use the first document of the corpus as the worked example.
contoh_berita = corpus[0]
print("Kita akan menganalisis berita pertama:")
print(f"Isi berita (token): {contoh_berita}")

print("\nVektor untuk 3 kata pertama dalam berita:")
# Preview the first 5 dimensions of the vector for each of the first three
# tokens. The loop iterates the tokens directly; the index is not needed.
for word in contoh_berita[:3]:
    if word not in model_cbow.wv:
        print(f"  - Kata '{word}' tidak ada di vocabulary model.")
        continue
    print(f"  - Vektor kata '{word}': {model_cbow.wv[word][:5]}... (ditampilkan 5 dimensi pertama)")

# Aggregate the whole document into one vector by averaging its word vectors.
vektor_berita_contoh = create_document_vector(contoh_berita, model_cbow, embedding_dim)
print("\nHasil vektor dokumen (setelah dirata-ratakan):")
print(f"{vektor_berita_contoh[:10]}... (ditampilkan 10 dimensi pertama)")
print(f"Panjang vektor: {len(vektor_berita_contoh)} dimensi (sesuai yang kita tentukan).")

--- TAHAP 3: MEMBEDAH PROSES AGREGRASI MENJADI VEKTOR DOKUMEN ---
Kita akan menganalisis berita pertama:
Isi berita (token): ['sidoarjobangsaonlinecom', 'proses', 'cari', 'korban', 'santri', 'timbun', 'runtuh', 'musala', 'ambruk', 'pondok', 'pesantren', 'ponpes', 'alkhoziny', 'budur', 'sidoarjo', 'rabu', 'siang', 'santri', 'enggan', 'nama', 'aku', 'santri', 'cor', 'bangun', 'hukum', 'santri', 'ikut', 'giat', 'ambruk', 'pas', 'salat', 'jemaah', 'imam', 'selamat', 'temanteman', 'timpa', 'santri', 'rabu', 'tim', 'sar', 'gabung', 'jibaku', 'sisir', 'puingpuing', 'bangun', 'lantai', 'total', 'ratus', 'santri', 'hasil', 'evakuasi', 'cari', 'musibah', 'milu', 'selip', 'kisah', 'hari', 'santri', 'pondok', 'salah', 'satu', 'biasa', 'beri', 'hukum', 'ikut', 'giat', 'pesantren', 'hukum', 'bantu', 'cor', 'bangun', 'santri', 'tahu', 'bolos', 'hukum', 'bantu', 'ngecor', 'tukang', 'santri', 'wajib', 'suruh', 'bantu', 'kena', 'hukum', 'santri', 'enam', 'mondok', 'untung', 'musala', 'ambruk', 'lokasi',

In [6]:
# --- 4. Build the final DataFrame ---
print("--- TAHAP 4: MEMBUAT DATAFRAME AKHIR ---")

# Column labels dim_1 .. dim_<embedding_dim>, one per vector component.
dim_columns = [f'dim_{position}' for position in range(1, embedding_dim + 1)]

# One averaged vector per document in the corpus.
doc_vectors = [
    create_document_vector(tokens, model_cbow, embedding_dim)
    for tokens in corpus
]
cbow_df = pd.DataFrame(doc_vectors, columns=dim_columns)

# Attach the 'kategori' column positionally (via .values), ignoring df's index.
cbow_df['kategori'] = df['kategori'].values

print(
    "Proses pembuatan DataFrame selesai.",
    "Berikut adalah contoh hasil akhirnya:",
    sep="\n",
)
print(cbow_df.head())

--- TAHAP 4: MEMBUAT DATAFRAME AKHIR ---


Proses pembuatan DataFrame selesai.
Berikut adalah contoh hasil akhirnya:
      dim_1     dim_2     dim_3     dim_4     dim_5     dim_6     dim_7  \
0  0.069216 -0.086883 -0.211147 -0.273677 -0.079135 -0.503759  0.105948   
1  0.238639 -0.226196 -0.284154 -0.117963  0.003578 -0.368139  0.128672   
2  0.089199 -0.170814 -0.168784 -0.214527 -0.003419 -0.534771  0.114673   
3  0.277909 -0.224949 -0.320442 -0.066012  0.000749 -0.262456  0.083818   
4  0.201823 -0.180752 -0.297515 -0.148686 -0.025942 -0.332840  0.114246   

      dim_8     dim_9    dim_10  ...   dim_142   dim_143   dim_144   dim_145  \
0  0.278947  0.053410  0.331715  ...  0.306150  0.417316  0.243297  0.462066   
1  0.096986  0.029187  0.308865  ...  0.176685  0.320070  0.144839  0.334102   
2  0.125485 -0.060687  0.348521  ...  0.329579  0.411935  0.290931  0.445818   
3  0.167154  0.104912  0.233309  ...  0.081702  0.230552  0.116026  0.306610   
4  0.163369  0.076258  0.266791  ...  0.160711  0.298401  0.152861  0.32440