# CBOW Embedding PTA Manajemen (Preprocessed)

In [33]:
%%capture
!pip install plotly
!pip install --upgrade gensim

## Ringkasan Alur
- Import pustaka yang dibutuhkan
- Baca dataset `hasil_preprocessing_pta_manajemen.csv`
- Pembersihan teks (lowercase, hapus tanda baca/tag/digit)
- Tokenisasi dan pembuatan `corpus`
- Latih Word2Vec (CBOW) untuk embedding kata
- Hitung mean embedding per dokumen (rata-rata vektor kata)
- Bentuk DataFrame fitur `f1..f56` dan tambahkan kolom `abstrak_id`
- (Opsional) Simpan hasil ke CSV


In [34]:
from gensim.models import Word2Vec, FastText
import pandas as pd
import re

from sklearn.decomposition import PCA

from matplotlib import pyplot as plt
import plotly.graph_objects as go

import numpy as np

import warnings
warnings.filterwarnings('ignore')

# Load the preprocessed PTA (manajemen) dataset from CSV.
# The path is relative, so 'hasil_preprocessing_pta_manajemen.csv' must be in the working directory.

df = pd.read_csv('hasil_preprocessing_pta_manajemen.csv')

In [35]:
from gensim.models import Word2Vec

In [36]:
import numpy as np

class MyTokenizer:
    """Minimal whitespace tokenizer."""

    def fit_transform(self, texts):
        """Lower-case every item of ``texts`` (coerced with ``str``) and split it on whitespace.

        Returns a list holding one token list per input item.
        """
        tokenized = []
        for item in texts:
            tokenized.append(str(item).lower().split())
        return tokenized

class MeanEmbeddingVectorizer:
    """Turn documents into fixed-length vectors by averaging their word embeddings.

    Exposes ``fit`` / ``transform`` / ``fit_transform`` but learns nothing itself:
    every vector comes from the already-trained model given to the constructor.
    """

    def __init__(self, word2vec_model):
        """Keep the trained model and read its embedding dimensionality.

        Args:
            word2vec_model: object whose ``.wv`` supports ``word in wv``, ``wv[word]``
                and has a ``vector_size`` attribute.
        """
        self.word2vec = word2vec_model
        # Gensim >= 4.0 exposes the dimensionality as `vector_size`.
        self.dim = word2vec_model.wv.vector_size

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return one mean word vector per document in ``X``.

        Each document is tokenized with ``MyTokenizer`` (lower-case + whitespace split).
        Words missing from the model's vocabulary are ignored; a document with no
        known word maps to the all-zero vector.

        Returns:
            Array of shape ``(len(X), self.dim)``; for empty ``X`` the shape is
            ``(0, self.dim)`` so the column count is always preserved.
        """
        keyed_vectors = self.word2vec.wv
        doc_vectors = []
        for words in MyTokenizer().fit_transform(X):
            # Only words present in the vocabulary contribute to the mean.
            known = [keyed_vectors[word] for word in words if word in keyed_vectors]
            if known:
                doc_vectors.append(np.mean(known, axis=0))
            else:
                # float32 zeros: a float64 fallback would upcast the whole result as soon as
                # one document has no known word. Wider word vectors still win on promotion.
                doc_vectors.append(np.zeros(self.dim, dtype=np.float32))
        if not doc_vectors:
            # np.array([]) would be 1-D with shape (0,); keep the result 2-D.
            return np.empty((0, self.dim), dtype=np.float32)
        return np.array(doc_vectors)

    def fit_transform(self, X, y=None):
        """Same as ``transform(X)``; ``y`` is ignored."""
        return self.transform(X)

In [37]:
# Bangun korpus token dari teks hasil preprocessing: setiap baris -> list kata
# Gunakan kolom 'hasil_preprocessing' jika ada, jika tidak cari alternatif umum
import ast

# Resolve which column holds the text: 'hasil_preprocessing' wins, otherwise take the first
# match from a short list of common alternative names.
if 'hasil_preprocessing' in df.columns:
    text_col = 'hasil_preprocessing'
else:
    possible_text_cols = ['clean', 'text', 'preprocessed', 'kalimat', 'sentence']
    text_col = next(filter(lambda name: name in df.columns, possible_text_cols), None)

# Stop early when no usable text column exists.
if text_col is None:
    raise RuntimeError("Kolom teks tidak ditemukan. Pastikan 'hasil_preprocessing' atau kolom teks lain tersedia di CSV.")

# Safe parser: accepts a real list, a list literal stored as a string such as
# "['kata', 'kata2']", or a plain whitespace-separated string.
def to_tokens(value):
    """Convert one cell of the text column into a list of lower-cased tokens.

    Args:
        value: a list of tokens, a string holding a Python list literal, a plain
            string, or anything else (e.g. NaN/None).

    Returns:
        List of lower-cased tokens, none of which contains whitespace. Values that
        are neither ``list`` nor ``str`` give ``[]``.
    """
    def _flatten(items):
        # split() trims each item, drops blank ones and breaks up items that contain spaces,
        # so every token survives a later " ".join(...) / .split() round trip unchanged.
        return [piece for item in items for piece in str(item).lower().split()]

    if isinstance(value, list):
        return _flatten(value)
    if isinstance(value, str):
        s = value.strip()
        if s.startswith('[') and s.endswith(']'):
            try:
                parsed = ast.literal_eval(s)
            except Exception:
                # Deliberate best effort: text that merely looks like a list literal is
                # handled as an ordinary string below.
                parsed = None
            if isinstance(parsed, list):
                return _flatten(parsed)
        return s.lower().split()
    return []

# One raw value per row; missing cells become empty strings.
texts = df[text_col].fillna("").tolist()
# Tokenize every row and keep only the rows that produced at least one token.
corpus = list(filter(None, map(to_tokens, texts)))

if not corpus:
    raise RuntimeError("Corpus kosong: kolom teks ada tetapi tidak berisi token valid.")

# Show the tokens of the first document
corpus[:1]

[['aplikasi',
  'nyata',
  'manfaat',
  'teknologi',
  'informasi',
  'komunikasi',
  'bidang',
  'layan',
  'administrasi',
  'akademik',
  'guru',
  'salah',
  'satu',
  'sistem',
  'portal',
  'akademik',
  'universitas',
  'trunojoyo',
  'madura',
  'implementasi',
  'proses',
  'selenggara',
  'temu',
  'kendala',
  'teknis',
  'non',
  'teknis',
  'teliti',
  'tuju',
  'puas',
  'langgan',
  'dasar',
  'analisis',
  'indeks',
  'puas',
  'langgan',
  'tinjau',
  'webqual',
  'fokus',
  'baik',
  'mutu',
  'layan',
  'website',
  'portal',
  'akademik',
  'universitas',
  'trunjoyo',
  'madura',
  'dasar',
  'importance',
  'performance',
  'analysis',
  'tinjau',
  'webqual',
  'teliti',
  'teliti',
  'kuantitaif',
  'deskriptif',
  'gambar',
  'deskripsi',
  'lukis',
  'sistematis',
  'faktual',
  'akurat',
  'kualitas',
  'layan',
  'jasa',
  'online',
  'mahasiswasampelpenelitianiniadalahmahasiswaangkatan',
  'teknik',
  'sampel',
  'teknik',
  'stratified',
  'random',
  'sam

## Tokenisasi dan Pelatihan Word2Vec (CBOW)
- Tokenisasi setiap baris kolom teks (`hasil_preprocessing`) menjadi list kata → `corpus`
- Latih model Word2Vec dengan default CBOW (`sg=0`) dan `vector_size=56`
- Hasil: embedding vektor untuk setiap kata di vocabulary


In [38]:
# Check the (rows, columns) shape of the loaded DataFrame.
df.shape

(1026, 2)

In [39]:
# Train Word2Vec on the token corpus built from the preprocessed text column.
# sg=0 selects CBOW (gensim's default, spelled out here); min_count=1 keeps every word;
# each word vector has 56 dimensions.
# A fixed seed plus workers=1 removes the thread-scheduling jitter that otherwise makes the
# learned vectors differ from run to run.
# NOTE(review): per the gensim docs, reproducibility across interpreter launches also needs
# PYTHONHASHSEED to be set — confirm if exact repeatability matters.
model = Word2Vec(corpus, min_count=1, vector_size=56, sg=0, seed=1, workers=1)

In [40]:
# (Optional) examples of exploring the word embeddings, if needed.
# NOTE(review): the query words below are English placeholders and may not exist in this
# corpus' vocabulary — substitute words from the corpus before uncommenting.
# model.wv.most_similar('eric')
# model.wv.most_similar_cosmul(positive=['phone', 'number'], negative=['call'])
# model.wv.doesnt_match("phone number prison cell".split())

# Save the trained word embeddings in word2vec text format (binary=False)
filename = 'pta_embd.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [41]:
# Mean embedding per document: the average of the word vectors of each document's tokens.
mean_embedding_vectorizer = MeanEmbeddingVectorizer(model)
# Build one string per DataFrame row. `corpus` is not used here because it drops rows without
# tokens, which would shift every later vector onto the wrong row of `df`. Rows without tokens
# become empty strings, which the vectorizer maps to the all-zero vector.
joined_docs = [" ".join(to_tokens(value)) for value in df[text_col].fillna("")]
mean_embedded = mean_embedding_vectorizer.fit_transform(joined_docs)
assert len(mean_embedded) == len(df), "document vectors must align one-to-one with df rows"

In [42]:
# Store the document vectors in the 'array' column, one vector per row
df['array']=list(mean_embedded)

## Rata-rata Embedding per Dokumen
- Gunakan `MeanEmbeddingVectorizer` untuk merata-ratakan vektor kata per dokumen
- Jika dokumen tidak punya kata di vocab, isi vektor nol berdimensi 56


In [43]:
# Preview the first five rows, including the new 'array' column
df.head(5)

Unnamed: 0,abstrak_id,hasil_preprocessing,array
0,Aplikasi nyata pemanfaatan teknologi informasi...,"['aplikasi', 'nyata', 'manfaat', 'teknologi', ...","[-0.14074793, -0.0517708, -0.4923049, 0.479526..."
1,Tujuan penelitian ini adalah untuk mengetahui ...,"['tuju', 'teliti', 'persepsi', 'brand', 'assoc...","[-0.4378809, 0.22917439, -0.6416591, 0.4573508..."
2,"ABSTRAK\n\nSatiyah, Pengaruh Faktor-faktor Pel...","['abstrak', 'satiyah', 'pengaruh', 'faktorfakt...","[-0.03672818, 0.1733005, -0.79017997, 0.667239..."
3,Abstrak\n\nPenelitian ini menggunakan metode k...,"['abstrak', 'teliti', 'metode', 'kuantitatif',...","[-0.02975462, 0.7633891, -0.8057022, 0.9496454..."
4,Hasil dari penelitian ini dari perhitungan Cre...,"['hasil', 'teliti', 'hitung', 'credit', 'risk'...","[-0.6008147, 0.42868954, 0.10390902, 0.4882784..."


In [44]:
# Record the length of each row's vector; .str.len() is applied to every element of 'array'
df['embedding_length'] = df['array'].str.len()

In [45]:
# Print the per-row vector lengths to check that they are all the same
print(df['embedding_length'])

0       56
1       56
2       56
3       56
4       56
        ..
1021    56
1022    56
1023    56
1024    56
1025    56
Name: embedding_length, Length: 1026, dtype: int64


## Bentuk DataFrame Fitur f1..f56 dan Tambah Label
- Ekstrak setiap dimensi embedding ke kolom `f1..f56`
- Tambahkan kolom `abstrak_id` dari dataset asli


In [46]:
# Shape after the 'array' and 'embedding_length' columns were added
df.shape

(1026, 4)

In [47]:
# Expand the per-document vectors stored in df['array'] into one numeric column per dimension.
num_features = len(df['array'].iloc[0])  # dimensionality taken from the first row
columns = [f'f{i+1}' for i in range(num_features)]

# Stack the row vectors into a single (n_docs, num_features) matrix in one step instead of
# appending value by value in nested Python loops. np.vstack raises ValueError when rows have
# different lengths, so a ragged column fails loudly here.
embedding_matrix = np.vstack(df['array'].tolist())
embedding_df = pd.DataFrame(embedding_matrix, columns=columns)

print(embedding_df)

            f1        f2        f3        f4        f5        f6        f7  \
0    -0.140748 -0.051771 -0.492305  0.479526 -0.022363 -0.183089  0.041997   
1    -0.437881  0.229174 -0.641659  0.457351 -0.159472  0.010075  0.198183   
2    -0.036728  0.173301 -0.790180  0.667239  0.171084 -0.218474 -0.241751   
3    -0.029755  0.763389 -0.805702  0.949645  0.183030 -0.264420 -0.026237   
4    -0.600815  0.428690  0.103909  0.488278  0.384397 -0.502497  0.201040   
...        ...       ...       ...       ...       ...       ...       ...   
1021  0.006839  0.626199 -0.550933  1.095289  0.475204 -0.599695 -0.084472   
1022 -0.858705  1.139583  0.052785  0.690584  0.220518 -0.285051  0.573112   
1023 -0.399686  0.106118 -0.723282  0.704096 -0.076959 -0.217982  0.397502   
1024 -0.232048  0.671447 -0.503945  0.686026  0.035947 -0.231157  0.273855   
1025  0.076756  0.225883 -0.695328  0.897530  0.196013 -0.275766 -0.436971   

            f8        f9       f10  ...       f47       f48    

In [48]:
# Attach the 'abstrak_id' column from df; .values assigns by position, ignoring the index.
# NOTE(review): the df preview suggests this column holds raw abstract text rather than an ID — confirm.
embedding_df['abstrak_id'] = df['abstrak_id'].values  

## Simpan Hasil ke CSV (Opsional)
Simpan `embedding_df` ke file CSV untuk digunakan di proses selanjutnya.


In [49]:
# Save the document-feature DataFrame to CSV (optional); index=False leaves out the row index
embedding_df.to_csv('pta_doc_embeddings.csv', index=False, encoding='utf-8')
print('Disimpan ke pta_doc_embeddings.csv')


Disimpan ke pta_doc_embeddings.csv


In [50]:
# Display the final feature table (embedding columns plus 'abstrak_id')
embedding_df

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f48,f49,f50,f51,f52,f53,f54,f55,f56,abstrak_id
0,-0.140748,-0.051771,-0.492305,0.479526,-0.022363,-0.183089,0.041997,-0.609430,-0.481395,-0.350944,...,0.161481,-0.394125,0.276370,0.196544,0.208528,-0.158757,0.138123,0.082887,0.212058,Aplikasi nyata pemanfaatan teknologi informasi...
1,-0.437881,0.229174,-0.641659,0.457351,-0.159472,0.010075,0.198183,-0.805354,-0.385771,-0.650644,...,0.164984,-0.319574,0.122323,0.607221,0.244385,-0.260579,0.291026,-0.282121,0.282154,Tujuan penelitian ini adalah untuk mengetahui ...
2,-0.036728,0.173301,-0.790180,0.667239,0.171084,-0.218474,-0.241751,-0.599657,-0.709189,-0.298709,...,0.593129,-0.413709,0.366503,0.035847,0.449005,-0.331849,0.187013,-0.309448,-0.107205,"ABSTRAK\n\nSatiyah, Pengaruh Faktor-faktor Pel..."
3,-0.029755,0.763389,-0.805702,0.949645,0.183030,-0.264420,-0.026237,-0.591126,-0.626737,-0.379797,...,0.644052,-0.755638,0.689247,0.063602,0.538198,-0.414120,0.216164,-0.449829,-0.189202,Abstrak\n\nPenelitian ini menggunakan metode k...
4,-0.600815,0.428690,0.103909,0.488278,0.384397,-0.502497,0.201040,-0.327617,-0.314530,-0.474362,...,0.346818,-0.452187,-0.128411,0.076575,0.336062,0.271128,0.203809,0.289892,-0.006533,Hasil dari penelitian ini dari perhitungan Cre...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1021,0.006839,0.626199,-0.550933,1.095289,0.475204,-0.599695,-0.084472,-0.297925,-0.696287,-0.105851,...,0.861716,-0.928876,0.550203,-0.393145,0.875928,-0.219696,0.011394,-0.281891,-0.418071,"ABSTRAK\nUswatun Hasanah, 160211100291, Pengar..."
1022,-0.858705,1.139583,0.052785,0.690584,0.220518,-0.285051,0.573112,-0.698916,-0.110954,-1.088116,...,0.335618,-0.743358,0.124659,0.500627,0.446752,0.301350,0.474500,0.009493,0.162454,Tujuan penelitian ini adalah untuk mengetahui ...
1023,-0.399686,0.106118,-0.723282,0.704096,-0.076959,-0.217982,0.397502,-0.854790,-0.711533,-0.571515,...,0.089048,-0.707202,0.422979,0.377978,0.334662,-0.458197,0.171744,0.210653,0.320513,ABSTRAK\nPenelitian ini bertujuan: (1) Untuk m...
1024,-0.232048,0.671447,-0.503945,0.686026,0.035947,-0.231157,0.273855,-0.541193,-0.377811,-0.545189,...,0.395498,-0.713329,0.382763,0.275299,0.345721,-0.116190,0.228401,-0.280924,0.009993,ABSTRAK\nTujuan dari penelitian ini adalah unt...


In [51]:
# Final shape: one row per document; the columns are the embedding features plus 'abstrak_id'
embedding_df.shape

(1026, 57)