# CBOW Embedding Berita Preprocessed

In [1]:
%%capture
!pip install plotly
!pip install --upgrade gensim

## Ringkasan Alur
- Import pustaka yang dibutuhkan
- Baca dataset `hasil_preprocessing_berita.csv`
- Parsing teks hasil preprocessing menjadi token (lowercase; mendukung format list-string seperti `"['kata', 'kata2']"`)
- Tokenisasi dan pembuatan `corpus`
- Latih Word2Vec (CBOW) untuk embedding kata
- Hitung mean embedding per dokumen (rata-rata vektor kata)
- Bentuk DataFrame fitur `f1..f56` dan tambahkan label `kategori`
- (Opsional) Simpan hasil ke CSV


In [2]:
from gensim.models import Word2Vec, FastText
import pandas as pd
import re

from sklearn.decomposition import PCA

from matplotlib import pyplot as plt
import plotly.graph_objects as go

import numpy as np

import warnings
# Silence all warnings raised from this point on.
warnings.filterwarnings('ignore')

# Load the preprocessed news dataset from CSV.
# The file 'hasil_preprocessing_berita.csv' must be present in the notebook's working directory.

df = pd.read_csv('hasil_preprocessing_berita.csv')

In [3]:
from gensim.models import Word2Vec

In [4]:
import numpy as np

class MyTokenizer:
    """Minimal tokenizer: stringify, lowercase, then split on whitespace."""

    def fit_transform(self, texts):
        """Return one list of lowercase whitespace-separated tokens per item in ``texts``.

        Every item is passed through ``str()`` first, so non-string values
        (numbers, ``None``) are tokenized from their string representation.
        """
        tokenized = []
        for raw in texts:
            lowered = str(raw).lower()
            tokenized.append(lowered.split())
        return tokenized

class MeanEmbeddingVectorizer:
    """Turn documents into fixed-size vectors by averaging their word vectors.

    Parameters
    ----------
    word2vec_model :
        A trained model exposing ``.wv`` with ``vector_size``, membership
        testing (``word in wv``) and item lookup (``wv[word]``), as gensim >= 4.0
        models do.
    """

    def __init__(self, word2vec_model):
        self.word2vec = word2vec_model
        # gensim >= 4.0 exposes the embedding dimension as `vector_size`.
        self.dim = word2vec_model.wv.vector_size

    def fit(self, X, y=None):
        """No-op kept for scikit-learn style compatibility; returns ``self``."""
        return self

    def _tokenize(self, doc):
        """Return the lowercase tokens of one document.

        A list/tuple is treated as already tokenized (each token is only
        stringified and lowercased); anything else goes through ``MyTokenizer``.
        """
        if isinstance(doc, (list, tuple)):
            return [str(tok).lower() for tok in doc]
        return MyTokenizer().fit_transform([doc])[0]

    def transform(self, X):
        """Return an array of shape ``(len(X), self.dim)`` with one mean vector per document.

        Words missing from the model vocabulary are skipped. A document with
        no known word is represented by a zero vector. An empty ``X`` yields an
        empty array that still has ``self.dim`` columns.
        """
        keyed_vectors = self.word2vec.wv
        embeddings = []
        for doc in X:
            # Keep only the vectors of words that exist in the vocabulary.
            valid_vectors = [
                keyed_vectors[word] for word in self._tokenize(doc)
                if word in keyed_vectors
            ]
            if valid_vectors:
                embeddings.append(np.mean(valid_vectors, axis=0))
            else:
                embeddings.append(np.zeros(self.dim))
        if not embeddings:
            # np.array([]) would have shape (0,); keep the result 2-D.
            return np.zeros((0, self.dim))
        return np.array(embeddings)

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform(X)``; there is nothing to fit."""
        return self.transform(X)

In [5]:
# Bangun korpus token dari teks hasil preprocessing: setiap baris -> list kata
# Gunakan kolom 'hasil_preprocessing' jika ada, jika tidak cari alternatif umum
import ast

# Pick the text column: 'hasil_preprocessing' has priority, then the first
# commonly used alternative name that exists in the DataFrame.
candidate_text_cols = ('hasil_preprocessing', 'clean', 'text', 'preprocessed', 'kalimat', 'sentence')
text_col = None
for candidate in candidate_text_cols:
    if candidate in df.columns:
        text_col = candidate
        break

# Abort early when none of the known column names is available.
if text_col is None:
    raise RuntimeError("Kolom teks tidak ditemukan. Pastikan 'hasil_preprocessing' atau kolom teks lain tersedia di CSV.")

# Parser aman: dukung format list-string seperti "['kata', 'kata2']" atau string biasa
def to_tokens(value):
    """Convert one cell value into a list of lowercase tokens.

    Accepted inputs:
    - a list: each item is stringified and lowercased; items that are blank
      after stripping are dropped;
    - a string that looks like a Python list literal ("['a', 'b']"): parsed
      with ``ast.literal_eval`` and handled like a list;
    - any other string (or a list-like string that fails to parse):
      lowercased and split on whitespace.

    Every other type yields an empty list.
    """
    def _normalize(items):
        return [str(item).lower() for item in items if str(item).strip()]

    if isinstance(value, list):
        return _normalize(value)
    if not isinstance(value, str):
        return []

    stripped = value.strip()
    looks_like_list = stripped.startswith('[') and stripped.endswith(']')
    if looks_like_list:
        try:
            literal = ast.literal_eval(stripped)
            if isinstance(literal, list):
                return _normalize(literal)
        except Exception:
            # Best effort: on any parse failure fall back to whitespace splitting.
            pass
    return [piece for piece in stripped.lower().split() if piece]

# Raw cell values of the text column; missing values become empty strings.
texts = df[text_col].fillna("").tolist()

# Tokenize every document and keep only those that produced at least one token.
corpus = []
for raw_value in texts:
    doc_tokens = to_tokens(raw_value)
    if doc_tokens:
        corpus.append(doc_tokens)

if not corpus:
    raise RuntimeError("Corpus kosong: kolom teks ada tetapi tidak berisi token valid.")

# Show the tokens of the first document as an example
corpus[:1]

[['kompascom',
  'alam',
  'senang',
  'kabar',
  'alami',
  'keluarga',
  'timnas',
  'indonesia',
  'hadir',
  'langsung',
  'king',
  'abdullah',
  'sports',
  'city',
  'jeddah',
  'saksi',
  'juang',
  'skuad',
  'garuda',
  'lawan',
  'timnas',
  'arab',
  'saudi',
  'timnas',
  'indonesia',
  'hadap',
  'timnas',
  'arab',
  'saudi',
  'laga',
  'perdana',
  'grup',
  'b',
  'ronde',
  'empat',
  'kualifikasi',
  'piala',
  'dunia',
  'zona',
  'asia',
  'duel',
  'timnas',
  'indonesia',
  'vs',
  'arab',
  'saudi',
  'langsung',
  'king',
  'abdullah',
  'sports',
  'city',
  'jeddah',
  'rabu',
  'kamis',
  'wib',
  'lapang',
  'suporter',
  'indonesia',
  'berbondongbondong',
  'hadir',
  'stadion',
  'dukung',
  'langsung',
  'juang',
  'timnas',
  'indonesia',
  'kecuali',
  'keluarga',
  'main',
  'sayang',
  'lapor',
  'laku',
  'senang',
  'aman',
  'keluarga',
  'main',
  'kabar',
  'istri',
  'thom',
  'haye',
  'bibeche',
  'riva',
  'hamil',
  'enam',
  'alami',
  '

## Tokenisasi dan Pelatihan Word2Vec (CBOW)
- Tokenisasi setiap baris kolom `hasil_preprocessing` menjadi list kata → `corpus`
- Latih model Word2Vec dengan default CBOW (`sg=0`) dan `vector_size=56`
- Hasil: embedding vektor untuk setiap kata di vocabulary


In [6]:
# Inspect the (rows, columns) shape of the loaded DataFrame.
df.shape

(1432, 3)

In [7]:
# Train Word2Vec on the token corpus built from the preprocessed text column.
# - sg=0 selects CBOW (gensim's default), stated explicitly for the reader
# - vector_size=56 sets the embedding dimension
# - min_count=1 keeps every token in the vocabulary
# - seed=1 (gensim's default) with workers=1 removes thread-ordering jitter so
#   repeated runs give the same vectors; reproducibility across interpreter
#   launches additionally requires a fixed PYTHONHASHSEED.
model = Word2Vec(corpus, vector_size=56, min_count=1, sg=0, seed=1, workers=1)

In [8]:
# (Optional) examples for exploring the trained word embeddings.
# NOTE(review): the query words below look like leftovers from another dataset —
# confirm they exist in this model's vocabulary before uncommenting.
# model.wv.most_similar('eric')
# model.wv.most_similar_cosmul(positive=['phone', 'number'], negative=['call'])
# model.wv.doesnt_match("phone number prison cell".split())

# Save the trained word embeddings in word2vec text (non-binary) format
filename = 'berita_embd.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [9]:
# Mean embedding per document: the average of the word vectors of its tokens.
mean_embedding_vectorizer = MeanEmbeddingVectorizer(model)
# Build one whitespace-joined string per DataFrame row from `texts` rather than
# from `corpus`: `corpus` drops documents without tokens, which would make the
# result shorter than `df` and break row alignment. Rows without tokens become
# empty strings and therefore receive a zero vector.
joined_docs = [" ".join(to_tokens(v)) for v in texts]
mean_embedded = mean_embedding_vectorizer.fit_transform(joined_docs)
# Guard the invariant the following cells rely on: one vector per row of df.
assert len(mean_embedded) == len(df), "mean_embedded must have one vector per row of df"

In [10]:
# Store each document vector in the 'array' column.
# NOTE: mean_embedded must contain exactly one vector per row of df, in row
# order, for this assignment to be valid.
df['array']=list(mean_embedded)

## Rata-rata Embedding per Dokumen
- Gunakan `MeanEmbeddingVectorizer` untuk merata-ratakan vektor kata per dokumen
- Jika dokumen tidak punya kata di vocab, isi vektor nol berdimensi 56


In [11]:
# Preview the first five rows, including the 'array' column of document vectors.
df.head(5)

Unnamed: 0,isi,hasil_preprocessing,kategori,array
0,KOMPAS.com -Pengalaman tak menyenangkan kabarn...,"['kompascom', 'alam', 'senang', 'kabar', 'alam...",Bola,"[-0.116911605, 0.23120639, 1.2962577, -0.10626..."
1,"JAKARTA, KOMPAS.com –Tiktokers Figha Lesmana m...","['jakarta', 'kompascom', 'tiktokers', 'figha',...",Megapolitan,"[-0.1359653, 0.049750797, 0.6405434, 0.0864336..."
2,KOMPAS.com -Wakil Presiden Direktur PT Toyota ...,"['kompascom', 'wakil', 'presiden', 'direktur',...",Money,"[-0.044495445, -0.056558896, 0.59772676, 0.183..."
3,"JAKARTA, KOMPAS.com- Menteri Koordinator Bidan...","['jakarta', 'kompascom', 'menteri', 'koordinat...",Nasional,"[-0.1386374, 0.016200868, 0.6541829, 0.2042382..."
4,"JAKARTA, KOMPAS.com- Menteri Pertanian (Mentan...","['jakarta', 'kompascom', 'menteri', 'tani', 't...",Nasional,"[-0.05305485, -0.104273096, 0.6120534, 0.18971..."


In [12]:
# Record the length of each document vector as a sanity check on the embedding dimension.
df['embedding_length'] = df['array'].str.len()

In [13]:
# Print the per-row vector lengths.
print(df['embedding_length'])

0       56
1       56
2       56
3       56
4       56
        ..
1427    56
1428    56
1429    56
1430    56
1431    56
Name: embedding_length, Length: 1432, dtype: int64


## Bentuk DataFrame Fitur f1..f56 dan Tambah Label
- Ekstrak setiap dimensi embedding ke kolom `f1..f56`
- Tambahkan label `kategori` dari dataset asli


In [14]:
# Inspect the DataFrame shape after the 'array' and 'embedding_length' columns were added.
df.shape

(1432, 5)

In [15]:
# Expand each document vector in df['array'] into one column per dimension: f1..fN.
num_features = len(df['array'].iloc[0])  # every vector is expected to share this length
columns = [f'f{i+1}' for i in range(num_features)]

# Stack the per-document vectors into a 2-D (documents x dimensions) matrix in a
# single vectorized step instead of appending value by value in nested loops.
# np.vstack raises ValueError if the vectors do not all have the same length.
embedding_df = pd.DataFrame(np.vstack(df['array'].tolist()), columns=columns)

print(embedding_df)

            f1        f2        f3        f4        f5        f6        f7  \
0    -0.116912  0.231206  1.296258 -0.106261  0.427243 -1.280542 -0.269893   
1    -0.135965  0.049751  0.640543  0.086434  0.306998 -0.689782  0.181733   
2    -0.044495 -0.056559  0.597727  0.183589  0.328944 -0.694667  0.088894   
3    -0.138637  0.016201  0.654183  0.204238  0.331325 -0.545207 -0.022958   
4    -0.053055 -0.104273  0.612053  0.189716  0.302850 -0.667873  0.147173   
...        ...       ...       ...       ...       ...       ...       ...   
1427 -0.081487  0.061035  0.564245  0.058049  0.247273 -0.619038  0.038862   
1428 -0.003480 -0.200948  0.591297  0.149840  0.224736 -0.580101  0.280189   
1429 -0.044428  0.060837  0.819220 -0.014274  0.285019 -0.892346 -0.039137   
1430 -0.068518 -0.059819  0.701857  0.129806  0.362797 -0.872952  0.237042   
1431 -0.130802 -0.068001  0.644659  0.133280  0.368136 -0.750800  0.164573   

            f8        f9       f10  ...       f47       f48    

In [16]:
# Attach the 'kategori' label from the source DataFrame.
# `.values` assigns positionally, so row order (not the index) determines the match.
embedding_df['kategori'] = df['kategori'].values  

## Simpan Hasil ke CSV (Opsional)
Simpan `embedding_df` ke file CSV untuk digunakan di proses selanjutnya.


In [17]:
# Write the document feature table to CSV (optional step), without the index column.
output_csv = 'berita_doc_embeddings.csv'
embedding_df.to_csv(output_csv, index=False, encoding='utf-8')
print('Disimpan ke berita_doc_embeddings.csv')


Disimpan ke berita_doc_embeddings.csv


In [18]:
# Display the final feature table (embedding columns plus 'kategori').
embedding_df

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f48,f49,f50,f51,f52,f53,f54,f55,f56,kategori
0,-0.116912,0.231206,1.296258,-0.106261,0.427243,-1.280542,-0.269893,-0.577200,0.091246,-1.011330,...,-0.272076,-0.513972,-0.162168,0.354559,1.550771,-0.406580,-0.981929,1.131881,-0.449377,Bola
1,-0.135965,0.049751,0.640543,0.086434,0.306998,-0.689782,0.181733,-0.821080,-0.127204,-0.494855,...,-0.000499,-0.165605,0.119591,0.296642,0.805718,-0.408663,-0.223718,0.484952,-0.515359,Megapolitan
2,-0.044495,-0.056559,0.597727,0.183589,0.328944,-0.694667,0.088894,-0.763929,-0.229885,-0.531465,...,0.192950,-0.307309,0.017221,0.238884,0.911903,-0.056456,-0.182219,0.461012,-0.328963,Money
3,-0.138637,0.016201,0.654183,0.204238,0.331325,-0.545207,-0.022958,-0.877949,-0.263429,-0.609655,...,0.048318,-0.156326,0.200209,0.197752,0.910862,-0.156614,-0.266985,0.575988,-0.514337,Nasional
4,-0.053055,-0.104273,0.612053,0.189716,0.302850,-0.667873,0.147173,-0.782518,-0.246994,-0.460898,...,0.183645,-0.402595,-0.002016,0.254439,1.012126,0.031440,-0.146217,0.457434,-0.239330,Nasional
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1427,-0.081487,0.061035,0.564245,0.058049,0.247273,-0.619038,0.038862,-0.579501,-0.097029,-0.461802,...,0.020238,-0.191979,0.051564,0.202229,0.722982,-0.275617,-0.269077,0.446486,-0.360291,Entertainment
1428,-0.003480,-0.200948,0.591297,0.149840,0.224736,-0.580101,0.280189,-0.819842,-0.232213,-0.326542,...,0.145354,-0.441266,0.046999,0.290672,1.105285,0.121250,-0.121748,0.423123,-0.115669,Otomotif
1429,-0.044428,0.060837,0.819220,-0.014274,0.285019,-0.892346,-0.039137,-0.599804,-0.034336,-0.616169,...,-0.009217,-0.378604,-0.113331,0.318914,1.150121,-0.181005,-0.497643,0.640900,-0.263370,Tekno
1430,-0.068518,-0.059819,0.701857,0.129806,0.362797,-0.872952,0.237042,-0.917694,-0.213516,-0.599835,...,0.161272,-0.326153,0.081646,0.329683,0.985535,-0.274722,-0.189847,0.535395,-0.488574,Otomotif


In [19]:
# Shape of the final feature table: (documents, embedding columns + label column).
embedding_df.shape

(1432, 57)