# **Tugas UAS PPW**

## Deployment

[Link Streamlit](https://uasppwpram.streamlit.app/)

## Import Library

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.svm import SVC

import pandas as pd
import warnings
import joblib
import nltk
import re

nltk.download('stopwords')
nltk.download('punkt')
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Load Dataset

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/pramdf042/PPW/main/crawlingberitauas.csv')
df

Unnamed: 0,Judul,Isi,Kategori
0,Pendiri Koperasi di Kudus Gelapkan Dana Nasaba...,"SEMARANG, 10/10 (BeritaJateng.net) – Jajaran D...",Kriminal
1,Cak Imin Yakin Duet AMIN Daftar Capres-Cawapre...,"SERANG– Ketua Umum PKB, Muhaimin Iskandar alia...",Politik
2,Polda Banten Apel Gelar Pasukan Operasi Mantap...,SERANG– Situasi politik menjelang Pemilu 2024...,Politik
3,Erick Thohir Beberkan Tiga Alasan Indonesia Te...,JAKARTA– Ketua Umum Persatuan Sepak Bola Selur...,Olahraga
4,Propam Periksa Penggunaan Senjata Personel Pol...,SERANG– Personel Propam Polres Serang melakuka...,Kriminal
...,...,...,...
1053,"Diserang Netizen Indonesia, Akun Instagram Tim...",SERANG– Tim Nasional (Timnas) Thailand harus m...,Olahraga
1054,Ratusan Off-Roader Motor Trail ‘Ulin Bareng’ d...,LEBAK– Masih dalam rangka memeriahkan Hari Ula...,Olahraga
1055,"Terlibat Kasus Korupsi, Hakim Perintahkan 4 St...",SERANG– Hakim Pengadilan Negeri (PN) Serang me...,Kriminal
1056,Hasil Drawing Piala Dunia U-17 2023: Ini Dia L...,JAKARTA– Hasil drawing Piala Dunia U-17 2023 t...,Olahraga


## Cek NULL Data

In [None]:
df.isnull().sum()

Judul       0
Isi         0
Kategori    0
dtype: int64

## Cleaning

In [None]:
def cleaning(text):
  text = re.sub(r'[^a-zA-Z\s]', '', text).strip()
  return text

df['Cleaning'] = df['Isi'].apply(cleaning)
df['Cleaning']

0       SEMARANG  BeritaJatengnet  Jajaran Direktorat ...
1       SERANG Ketua Umum PKB Muhaimin Iskandar alias ...
2       SERANG  Situasi politik menjelang Pemilu  kian...
3       JAKARTA Ketua Umum Persatuan Sepak Bola Seluru...
4       SERANG Personel Propam Polres Serang melakukan...
                              ...                        
1053    SERANG Tim Nasional Timnas Thailand harus mene...
1054    LEBAK Masih dalam rangka memeriahkan Hari Ulan...
1055    SERANG Hakim Pengadilan Negeri PN Serang menga...
1056    JAKARTA Hasil drawing Piala Dunia U  telah dik...
1057    TANGERANG Pemkot Tangerang bersama dengan Komi...
Name: Cleaning, Length: 1058, dtype: object

## Tokenizing

In [None]:
def tokenizer(text):
  text = text.lower()
  return word_tokenize(text)

df['Tokenizing'] = df['Cleaning'].apply(tokenizer)
df['Tokenizing']

0       [semarang, beritajatengnet, jajaran, direktora...
1       [serang, ketua, umum, pkb, muhaimin, iskandar,...
2       [serang, situasi, politik, menjelang, pemilu, ...
3       [jakarta, ketua, umum, persatuan, sepak, bola,...
4       [serang, personel, propam, polres, serang, mel...
                              ...                        
1053    [serang, tim, nasional, timnas, thailand, haru...
1054    [lebak, masih, dalam, rangka, memeriahkan, har...
1055    [serang, hakim, pengadilan, negeri, pn, serang...
1056    [jakarta, hasil, drawing, piala, dunia, u, tel...
1057    [tangerang, pemkot, tangerang, bersama, dengan...
Name: Tokenizing, Length: 1058, dtype: object

## Stopword Removal

In [None]:
corpus = stopwords.words('indonesian')

def stopwordText(words):
 return [word for word in words if word not in corpus]

df['Stopword Removal'] = df['Tokenizing'].apply(stopwordText)

# Gabungkan kembali token menjadi kalimat utuh
df['Full Text'] = df['Stopword Removal'].apply(lambda x: ' '.join(x))
df['Full Text']

0       semarang beritajatengnet jajaran direktorat re...
1       serang ketua pkb muhaimin iskandar alias cak i...
2       serang situasi politik menjelang pemilu kian m...
3       jakarta ketua persatuan sepak bola indonesia p...
4       serang personel propam polres serang kegiatan ...
                              ...                        
1053    serang tim nasional timnas thailand menelan pi...
1054    lebak rangka memeriahkan ulang republik indone...
1055    serang hakim pengadilan negeri pn serang staf ...
1056    jakarta hasil drawing piala dunia u jumat mala...
1057    tangerang pemkot tangerang komisi pemilihan kp...
Name: Full Text, Length: 1058, dtype: object

## TFIDF

In [None]:
def tfidf(dokumen, category):
  vectorizer = TfidfVectorizer()
  x = vectorizer.fit_transform(dokumen).toarray()
  terms = vectorizer.get_feature_names_out()

  final_tfidf = pd.DataFrame(x, columns=terms)
  final_tfidf.insert(0, 'Dokumen', dokumen)
  final_tfidf.insert(len(final_tfidf.columns),'Kategori', category)

  return (vectorizer, final_tfidf)

tfidf_vectorizer, final_tfidf = tfidf(df['Full Text'], df['Kategori'])
final_tfidf

Unnamed: 0,Dokumen,aa,aadinarayana,aafi,aal,aamiin,aan,aang,aansementara,aardianodia,...,zulfan,zulfikar,zulhas,zulhasdan,zulkarnain,zulkfli,zulkifli,zullfan,zurich,Kategori
0,semarang beritajatengnet jajaran direktorat re...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,Kriminal
1,serang ketua pkb muhaimin iskandar alias cak i...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,Politik
2,serang situasi politik menjelang pemilu kian m...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,Politik
3,jakarta ketua persatuan sepak bola indonesia p...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,Olahraga
4,serang personel propam polres serang kegiatan ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,Kriminal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1053,serang tim nasional timnas thailand menelan pi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,Olahraga
1054,lebak rangka memeriahkan ulang republik indone...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,Olahraga
1055,serang hakim pengadilan negeri pn serang staf ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,Kriminal
1056,jakarta hasil drawing piala dunia u jumat mala...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.121653,Olahraga


## Penentuan X dan y dengan drop fitur dokumen dan label

In [None]:
X = final_tfidf.drop(['Dokumen', 'Kategori'], axis=1)
X

Unnamed: 0,aa,aadinarayana,aafi,aal,aamiin,aan,aang,aansementara,aardianodia,aaron,...,zul,zulfan,zulfikar,zulhas,zulhasdan,zulkarnain,zulkfli,zulkifli,zullfan,zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.121653


In [None]:
y = df['Kategori']
y

0       Kriminal
1        Politik
2        Politik
3       Olahraga
4       Kriminal
          ...   
1053    Olahraga
1054    Olahraga
1055    Kriminal
1056    Olahraga
1057     Politik
Name: Kategori, Length: 1058, dtype: object

## Splitting Data

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## **Modeling With LDA**

### Mencari Best Parameter untuk LDA

In [None]:
def find_best_lda(Xtrain, Xtest, n_components, alpha, beta):
  looping = 1
  best = {'k' : 0, 'alpha' : 0, 'beta' : 0, 'accuracy' : 0, 'model': '', 'lda' : '', 'lda_Xtrain' : '', 'lda_Xtest' : ''}
  history = pd.DataFrame(columns=["Pengujian Ke", "K", "Alpha", "Beta", "Accuracy"])

  # Menambahkan tqdm pada loop terluar
  for k in n_components:
    for a in alpha:
      for b in beta:
        lda = LatentDirichletAllocation(n_components=k, doc_topic_prior=a, topic_word_prior=b)
        lda_Xtrain = lda.fit_transform(Xtrain)
        lda_Xtest = lda.transform(Xtest)

        # Membuat model Naive Bayes
        model = GaussianNB()

        # Melatih model pada data pelatihan
        model.fit(lda_Xtrain, y_train)

        # Melakukan prediksi pada data pengujian
        y_pred = model.predict(lda_Xtest)

        # Menghitung akurasi
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Jumlah Topik: {k}, Alpha: {a}, Beta: {b}, Accuracy: {accuracy}")

        if accuracy > best['accuracy']:
          best['accuracy'] = accuracy
          best['k'] = k
          best['alpha'] = a
          best['beta'] = b
          best['model'] = model
          best['lda'] = lda
          best['lda_Xtrain'] = lda_Xtrain
          best['lda_Xtest'] = lda_Xtest

        history.loc[len(history)] = [f"Pengujian Ke- {looping}", k, a, b, accuracy]
        looping += 1

  return (best, history)

k = [3, 4, 5]
alpha = [0.3, 0.4]
beta = [0.1, 0.2]
best_param, history = find_best_lda(X_train, X_test, k, alpha, beta)

Jumlah Topik: 3, Alpha: 0.3, Beta: 0.1, Accuracy: 0.5235849056603774
Jumlah Topik: 3, Alpha: 0.3, Beta: 0.2, Accuracy: 0.7075471698113207
Jumlah Topik: 3, Alpha: 0.4, Beta: 0.1, Accuracy: 0.4811320754716981
Jumlah Topik: 3, Alpha: 0.4, Beta: 0.2, Accuracy: 0.4811320754716981
Jumlah Topik: 4, Alpha: 0.3, Beta: 0.1, Accuracy: 0.6132075471698113
Jumlah Topik: 4, Alpha: 0.3, Beta: 0.2, Accuracy: 0.7216981132075472
Jumlah Topik: 4, Alpha: 0.4, Beta: 0.1, Accuracy: 0.4811320754716981
Jumlah Topik: 4, Alpha: 0.4, Beta: 0.2, Accuracy: 0.5047169811320755
Jumlah Topik: 5, Alpha: 0.3, Beta: 0.1, Accuracy: 0.660377358490566
Jumlah Topik: 5, Alpha: 0.3, Beta: 0.2, Accuracy: 0.6226415094339622
Jumlah Topik: 5, Alpha: 0.4, Beta: 0.1, Accuracy: 0.6509433962264151
Jumlah Topik: 5, Alpha: 0.4, Beta: 0.2, Accuracy: 0.4811320754716981


In [None]:
best_param

{'k': 4,
 'alpha': 0.3,
 'beta': 0.2,
 'accuracy': 0.7216981132075472,
 'model': GaussianNB(),
 'lda': LatentDirichletAllocation(doc_topic_prior=0.3, n_components=4,
                           topic_word_prior=0.2),
 'lda_Xtrain': array([[0.81400576, 0.02875271, 0.02899897, 0.12824256],
        [0.02797507, 0.02773395, 0.02830335, 0.91598763],
        [0.03392992, 0.03408446, 0.03390842, 0.8980772 ],
        ...,
        [0.03042602, 0.58045603, 0.0303813 , 0.35873665],
        [0.03007573, 0.2613562 , 0.03008677, 0.67848129],
        [0.8945596 , 0.03052227, 0.03121202, 0.0437061 ]]),
 'lda_Xtest': array([[0.86909118, 0.02214132, 0.0224981 , 0.0862694 ],
        [0.03767905, 0.04256563, 0.04526212, 0.87449319],
        [0.04070606, 0.03852152, 0.13726128, 0.78351113],
        [0.8957606 , 0.02334168, 0.02306998, 0.05782773],
        [0.03619529, 0.03441645, 0.04180469, 0.88758357],
        [0.03877075, 0.11545327, 0.03900662, 0.80676936],
        [0.84188994, 0.04011718, 0.04017302, 0

### Deklarasi K, Alpha, dan Beta

### LDA

In [None]:
lda = best_param['lda']
lda_x_train = best_param['lda_Xtrain']
lda_x_test = best_param['lda_Xtest']

### Tampilan Hasil Reduksi Dimensi

In [None]:
topik_columns = [f"Topik {i}" for i in range(1, best_param['k']+1)]
dokumen = final_tfidf['Dokumen']
output_proporsi_TD = pd.DataFrame(lda_x_train, columns=topik_columns)
output_proporsi_TD.insert(0,'Dokumen', dokumen)
output_proporsi_TD.insert(len(output_proporsi_TD.columns),'Kategori', final_tfidf['Kategori'])
output_proporsi_TD

Unnamed: 0,Dokumen,Topik 1,Topik 2,Topik 3,Topik 4,Kategori
0,semarang beritajatengnet jajaran direktorat re...,0.814006,0.028753,0.028999,0.128243,Kriminal
1,serang ketua pkb muhaimin iskandar alias cak i...,0.027975,0.027734,0.028303,0.915988,Politik
2,serang situasi politik menjelang pemilu kian m...,0.033930,0.034084,0.033908,0.898077,Politik
3,jakarta ketua persatuan sepak bola indonesia p...,0.036072,0.036201,0.688002,0.239725,Olahraga
4,serang personel propam polres serang kegiatan ...,0.049005,0.037726,0.038387,0.874882,Kriminal
...,...,...,...,...,...,...
841,lebak teriakan gemoy warga lebak mewarnai keda...,0.039352,0.037178,0.615303,0.308167,Politik
842,jakarta mahkamah konstitusi mk membacakan putu...,0.032050,0.646280,0.031561,0.290109,Politik
843,semarang beritajatengnet dewan pengurus daerah...,0.030426,0.580456,0.030381,0.358737,Politik
844,serang walikota serang syafrudin menyambut ked...,0.030076,0.261356,0.030087,0.678481,Olahraga


### Save Data hasil LDA

In [None]:
output_proporsi_TD.to_csv('hasil LDA Uas (NB).csv', index=False)

### Tampilan proporsi kata di tiap topik

In [None]:
# Output distribusi kata pada topik
distribusi_kata_topik = pd.DataFrame(lda.components_)
distribusi_kata_topik

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26039,26040,26041,26042,26043,26044,26045,26046,26047,26048
0,0.200064,0.250569,0.200165,0.200171,0.2,0.200048,0.200171,0.200114,0.2,0.243658,...,0.200103,0.200185,0.2,0.200167,0.200167,0.200054,0.200167,0.200144,0.200157,0.321363
1,0.200072,0.200071,0.200123,0.20019,0.2,0.200359,0.20019,0.200131,0.2,0.200115,...,0.200095,0.200164,0.2,0.200106,0.200106,0.53816,0.200106,0.200164,0.200139,0.20007
2,0.200069,0.200119,0.200156,0.24838,0.2,0.2001,0.24838,0.200124,0.2,0.200109,...,0.296892,0.2194,0.2,0.260176,0.260176,0.200057,0.260176,0.615662,0.21478,0.200067
3,0.596943,0.20068,0.323058,0.229397,0.2,0.955243,0.229397,0.281487,0.2,0.208264,...,0.201746,0.268534,0.2,0.209098,0.209098,0.202473,0.209098,0.286902,0.229066,0.200152


### Model Naive Bayes With LDA

In [None]:
# Membuat model Naive Bayes
model = best_param['model']

# Melakukan prediksi pada data pengujian
y_pred = model.predict(lda_x_test)

# Menghitung akurasi
accuracy = accuracy_score(y_test, y_pred)
print("Akurasi:", accuracy)

# Menampilkan laporan klasifikasi
print("Laporan Klasifikasi:")
print(classification_report(y_test, y_pred))

# Menampilkan matriks kebingungan
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matriks:")
print(confusion)

Akurasi: 0.7216981132075472
Laporan Klasifikasi:
              precision    recall  f1-score   support

    Kriminal       0.54      0.96      0.69        68
    Olahraga       0.97      0.73      0.83        77
     Politik       0.94      0.48      0.63        67

    accuracy                           0.72       212
   macro avg       0.82      0.72      0.72       212
weighted avg       0.82      0.72      0.72       212

Confusion Matriks:
[[65  1  2]
 [21 56  0]
 [34  1 32]]


### Predict

In [None]:
data = ["Sebuah gelombang atlet muda dari berbagai negara telah menghebohkan dunia olahraga dengan penampilan gemilang mereka dalam kompetisi internasional terkini. Para atlet muda ini menunjukkan potensi luar biasa mereka di berbagai cabang olahraga, mengukir prestasi yang memukau dan memberikan harapan akan masa depan yang cerah bagi dunia olahraga. Salah satu momen paling mencolok terjadi dalam kompetisi atletik di Kejuaraan Dunia Junior yang diadakan di Tokyo, Jepang. Sejumlah atlet muda dari berbagai benua menampilkan kemampuan luar biasa mereka, memecahkan rekor dunia junior dan menorehkan prestasi yang memukau. Dalam lomba lari 100 meter, seorang atlet remaja dari Amerika Serikat memperlihatkan kecepatan gemilangnya dengan mencatat waktu yang memecahkan rekor dunia dalam kategori usianya. Selain itu, cabang olahraga renang juga menjadi sorotan utama. Seorang perenang muda dari Australia berhasil mengamankan emas dalam lomba gaya bebas 200 meter, sementara atlet renang muda lainnya dari Tiongkok mengejutkan dunia dengan teknik yang begitu memukau dalam gaya kupu-kupu, memecahkan rekor nasional junior. Tak hanya dalam atletik dan renang, olahraga lainnya seperti bulu tangkis, tenis, dan gulat juga menyaksikan penampilan gemilang dari atlet-atlet muda berbakat. Mereka berhasil menyingkirkan para pesaing dengan kecakapan teknis yang tinggi dan semangat bertanding yang luar biasa. Kepiawaian atlet-atlet muda ini tidak hanya mengundang decak kagum, tetapi juga menandai masa depan yang cerah bagi olahraga dunia. Mereka tidak hanya menjadi inspirasi bagi generasi mendatang, tetapi juga membawa semangat kompetisi yang sehat dan semakin memperkaya variasi dalam dunia olahraga global."]
a = tfidf_vectorizer.transform(data).toarray()
b = lda.transform(a)
model.predict(b)

array(['Olahraga'], dtype='<U8')

### Save Model

In [None]:
joblib.dump(lda, "lda.pkl")
joblib.dump(model, "naive bayes.pkl")

['naive bayes.pkl']

## Modeling Without LDA

### Training Model dengan Dataset Asli

In [None]:
# Membuat model Naive Bayes
nb = GaussianNB()

# Melatih model nbada data pelatihan
nb.fit(X_train, y_train)

# Melakukan prediksi pada data pengujian
y_pred_nb = nb.predict(X_test)

# Menghitung akurasi
accuracy = accuracy_score(y_test, y_pred_nb)
print("Akurasi:", accuracy)

# Menampilkan laporan klasifikasi
print("Laporan Klasifikasi:")
print(classification_report(y_test, y_pred_nb))

# Menampilkan matriks kebingungan
confusion = confusion_matrix(y_test, y_pred_nb)
print("Confusion Matriks:")
print(confusion)

Akurasi: 0.9339622641509434
Laporan Klasifikasi:
              precision    recall  f1-score   support

    Kriminal       0.94      0.87      0.90        68
    Olahraga       0.95      0.99      0.97        77
     Politik       0.91      0.94      0.93        67

    accuracy                           0.93       212
   macro avg       0.93      0.93      0.93       212
weighted avg       0.93      0.93      0.93       212

Confusion Matriks:
[[59  3  6]
 [ 1 76  0]
 [ 3  1 63]]


### Predict

In [None]:
data = ["Sebuah gelombang atlet muda dari berbagai negara telah menghebohkan dunia olahraga dengan penampilan gemilang mereka dalam kompetisi internasional terkini. Para atlet muda ini menunjukkan potensi luar biasa mereka di berbagai cabang olahraga, mengukir prestasi yang memukau dan memberikan harapan akan masa depan yang cerah bagi dunia olahraga. Salah satu momen paling mencolok terjadi dalam kompetisi atletik di Kejuaraan Dunia Junior yang diadakan di Tokyo, Jepang. Sejumlah atlet muda dari berbagai benua menampilkan kemampuan luar biasa mereka, memecahkan rekor dunia junior dan menorehkan prestasi yang memukau. Dalam lomba lari 100 meter, seorang atlet remaja dari Amerika Serikat memperlihatkan kecepatan gemilangnya dengan mencatat waktu yang memecahkan rekor dunia dalam kategori usianya. Selain itu, cabang olahraga renang juga menjadi sorotan utama. Seorang perenang muda dari Australia berhasil mengamankan emas dalam lomba gaya bebas 200 meter, sementara atlet renang muda lainnya dari Tiongkok mengejutkan dunia dengan teknik yang begitu memukau dalam gaya kupu-kupu, memecahkan rekor nasional junior. Tak hanya dalam atletik dan renang, olahraga lainnya seperti bulu tangkis, tenis, dan gulat juga menyaksikan penampilan gemilang dari atlet-atlet muda berbakat. Mereka berhasil menyingkirkan para pesaing dengan kecakapan teknis yang tinggi dan semangat bertanding yang luar biasa. Kepiawaian atlet-atlet muda ini tidak hanya mengundang decak kagum, tetapi juga menandai masa depan yang cerah bagi olahraga dunia. Mereka tidak hanya menjadi inspirasi bagi generasi mendatang, tetapi juga membawa semangat kompetisi yang sehat dan semakin memperkaya variasi dalam dunia olahraga global."]
tfidf_matrix = tfidf_vectorizer.transform(data).toarray()
nb.predict(tfidf_matrix)

array(['Olahraga'], dtype='<U8')

### Save Model

In [None]:
joblib.dump(nb, "Naive Bayes (Asli).pkl")

['Naive Bayes (Asli).pkl']

## Save Vectorizer

In [None]:
joblib.dump(tfidf_vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

## SVM with LDA

In [None]:
def find_best_lda(Xtrain, Xtest, n_components, alpha, beta):
  looping = 1
  best = {'k' : 0, 'alpha' : 0, 'beta' : 0, 'accuracy' : 0, 'model': '', 'lda' : '', 'lda_Xtrain' : '', 'lda_Xtest' : ''}
  history = pd.DataFrame(columns=["Pengujian Ke", "K", "Alpha", "Beta", "Accuracy"])

  # Menambahkan tqdm pada loop terluar
  for k in n_components:
    for a in alpha:
      for b in beta:
        lda = LatentDirichletAllocation(n_components=k, doc_topic_prior=a, topic_word_prior=b)
        lda_Xtrain = lda.fit_transform(Xtrain)
        lda_Xtest = lda.transform(Xtest)

        # Membuat model Naive Bayes
        model = SVC()

        # Melatih model pada data pelatihan
        model.fit(lda_Xtrain, y_train)

        # Melakukan prediksi pada data pengujian
        y_pred = model.predict(lda_Xtest)

        # Menghitung akurasi
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Jumlah Topik: {k}, Alpha: {a}, Beta: {b}, Accuracy: {accuracy}")

        if accuracy > best['accuracy']:
          best['accuracy'] = accuracy
          best['k'] = k
          best['alpha'] = a
          best['beta'] = b
          best['model'] = model
          best['lda'] = lda
          best['lda_Xtrain'] = lda_Xtrain
          best['lda_Xtest'] = lda_Xtest

        history.loc[len(history)] = [f"Pengujian Ke- {looping}", k, a, b, accuracy]
        looping += 1

  return (best, history)

k = [3, 4, 5]
alpha = [0.3, 0.4]
beta = [0.1, 0.2]
best_param, history = find_best_lda(X_train, X_test, k, alpha, beta)

Jumlah Topik: 3, Alpha: 0.3, Beta: 0.1, Accuracy: 0.5754716981132075
Jumlah Topik: 3, Alpha: 0.3, Beta: 0.2, Accuracy: 0.5047169811320755
Jumlah Topik: 3, Alpha: 0.4, Beta: 0.1, Accuracy: 0.5235849056603774
Jumlah Topik: 3, Alpha: 0.4, Beta: 0.2, Accuracy: 0.8254716981132075
Jumlah Topik: 4, Alpha: 0.3, Beta: 0.1, Accuracy: 0.7641509433962265
Jumlah Topik: 4, Alpha: 0.3, Beta: 0.2, Accuracy: 0.46226415094339623
Jumlah Topik: 4, Alpha: 0.4, Beta: 0.1, Accuracy: 0.6792452830188679
Jumlah Topik: 4, Alpha: 0.4, Beta: 0.2, Accuracy: 0.7877358490566038
Jumlah Topik: 5, Alpha: 0.3, Beta: 0.1, Accuracy: 0.5377358490566038
Jumlah Topik: 5, Alpha: 0.3, Beta: 0.2, Accuracy: 0.6792452830188679
Jumlah Topik: 5, Alpha: 0.4, Beta: 0.1, Accuracy: 0.5377358490566038
Jumlah Topik: 5, Alpha: 0.4, Beta: 0.2, Accuracy: 0.5660377358490566


In [None]:
best_param

{'k': 3,
 'alpha': 0.4,
 'beta': 0.2,
 'accuracy': 0.8254716981132075,
 'model': SVC(),
 'lda': LatentDirichletAllocation(doc_topic_prior=0.4, n_components=3,
                           topic_word_prior=0.2),
 'lda_Xtrain': array([[0.69748137, 0.25057432, 0.05194431],
        [0.03870925, 0.281669  , 0.67962175],
        [0.04552225, 0.90878162, 0.04569613],
        ...,
        [0.0406304 , 0.9177391 , 0.0416305 ],
        [0.04022824, 0.91619896, 0.04357279],
        [0.86523469, 0.08907293, 0.04569238]]),
 'lda_Xtest': array([[0.79859943, 0.16539216, 0.03600841],
        [0.05319713, 0.86950234, 0.07730053],
        [0.05722323, 0.58682965, 0.35594713],
        [0.83078618, 0.13228957, 0.03692426],
        [0.05063856, 0.88758529, 0.06177615],
        [0.05244387, 0.8655628 , 0.08199333],
        [0.79912796, 0.13709898, 0.06377306],
        [0.04050213, 0.74602514, 0.21347272],
        [0.20358467, 0.7416122 , 0.05480314],
        [0.05500112, 0.85866946, 0.08632943],
        [0.04

### LDA

In [None]:
lda = best_param['lda']
lda_x_train = best_param['lda_Xtrain']
lda_x_test = best_param['lda_Xtest']

In [None]:
topik_columns = [f"Topik {i}" for i in range(1, best_param['k']+1)]
dokumen = final_tfidf['Dokumen']
output_proporsi_TD = pd.DataFrame(lda_x_train, columns=topik_columns)
output_proporsi_TD.insert(0,'Dokumen', dokumen)
output_proporsi_TD.insert(len(output_proporsi_TD.columns),'Kategori', final_tfidf['Kategori'])
output_proporsi_TD

Unnamed: 0,Dokumen,Topik 1,Topik 2,Topik 3,Kategori
0,semarang beritajatengnet jajaran direktorat re...,0.697481,0.250574,0.051944,Kriminal
1,serang ketua pkb muhaimin iskandar alias cak i...,0.038709,0.281669,0.679622,Politik
2,serang situasi politik menjelang pemilu kian m...,0.045522,0.908782,0.045696,Politik
3,jakarta ketua persatuan sepak bola indonesia p...,0.047994,0.053480,0.898526,Olahraga
4,serang personel propam polres serang kegiatan ...,0.219470,0.631401,0.149129,Kriminal
...,...,...,...,...,...
841,lebak teriakan gemoy warga lebak mewarnai keda...,0.052517,0.071596,0.875888,Politik
842,jakarta mahkamah konstitusi mk membacakan putu...,0.042501,0.913079,0.044420,Politik
843,semarang beritajatengnet dewan pengurus daerah...,0.040630,0.917739,0.041630,Politik
844,serang walikota serang syafrudin menyambut ked...,0.040228,0.916199,0.043573,Olahraga


In [None]:
output_proporsi_TD.to_csv('hasil LDA Uas (SVM).csv', index=False)

In [None]:
# Output distribusi kata pada topik
distribusi_kata_topik = pd.DataFrame(lda.components_)
distribusi_kata_topik

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26039,26040,26041,26042,26043,26044,26045,26046,26047,26048
0,0.200169,0.233097,0.200295,0.20031,0.2,0.200267,0.20031,0.200391,0.2,0.251844,...,0.200263,0.200282,0.2,0.200253,0.200253,0.200159,0.200253,0.200147,0.200276,0.319853
1,0.596715,0.21382,0.322875,0.206644,0.2,0.770541,0.206644,0.212214,0.2,0.200154,...,0.200279,0.20046,0.2,0.200265,0.200265,0.540446,0.200265,0.202844,0.200451,0.201552
2,0.200265,0.204523,0.200332,0.271184,0.2,0.384942,0.271184,0.269251,0.2,0.200148,...,0.298295,0.287542,0.2,0.269029,0.269029,0.200139,0.269029,0.69988,0.243414,0.200248


### Modelling SVM with LDA

In [None]:
# Membuat model Naive Bayes
model = best_param['model']

# Melakukan prediksi pada data pengujian
y_pred = model.predict(lda_x_test)

# Menghitung akurasi
accuracy = accuracy_score(y_test, y_pred)
print("Akurasi:", accuracy)

# Menampilkan laporan klasifikasi
print("Laporan Klasifikasi:")
print(classification_report(y_test, y_pred))

# Menampilkan matriks kebingungan
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matriks:")
print(confusion)

Akurasi: 0.8254716981132075
Laporan Klasifikasi:
              precision    recall  f1-score   support

    Kriminal       0.70      0.91      0.79        68
    Olahraga       0.94      0.64      0.76        77
     Politik       0.90      0.96      0.93        67

    accuracy                           0.83       212
   macro avg       0.85      0.83      0.83       212
weighted avg       0.85      0.83      0.82       212

Confusion Matriks:
[[62  2  4]
 [25 49  3]
 [ 2  1 64]]


In [None]:
data = ["Sebuah gelombang atlet muda dari berbagai negara telah menghebohkan dunia olahraga dengan penampilan gemilang mereka dalam kompetisi internasional terkini. Para atlet muda ini menunjukkan potensi luar biasa mereka di berbagai cabang olahraga, mengukir prestasi yang memukau dan memberikan harapan akan masa depan yang cerah bagi dunia olahraga. Salah satu momen paling mencolok terjadi dalam kompetisi atletik di Kejuaraan Dunia Junior yang diadakan di Tokyo, Jepang. Sejumlah atlet muda dari berbagai benua menampilkan kemampuan luar biasa mereka, memecahkan rekor dunia junior dan menorehkan prestasi yang memukau. Dalam lomba lari 100 meter, seorang atlet remaja dari Amerika Serikat memperlihatkan kecepatan gemilangnya dengan mencatat waktu yang memecahkan rekor dunia dalam kategori usianya. Selain itu, cabang olahraga renang juga menjadi sorotan utama. Seorang perenang muda dari Australia berhasil mengamankan emas dalam lomba gaya bebas 200 meter, sementara atlet renang muda lainnya dari Tiongkok mengejutkan dunia dengan teknik yang begitu memukau dalam gaya kupu-kupu, memecahkan rekor nasional junior. Tak hanya dalam atletik dan renang, olahraga lainnya seperti bulu tangkis, tenis, dan gulat juga menyaksikan penampilan gemilang dari atlet-atlet muda berbakat. Mereka berhasil menyingkirkan para pesaing dengan kecakapan teknis yang tinggi dan semangat bertanding yang luar biasa. Kepiawaian atlet-atlet muda ini tidak hanya mengundang decak kagum, tetapi juga menandai masa depan yang cerah bagi olahraga dunia. Mereka tidak hanya menjadi inspirasi bagi generasi mendatang, tetapi juga membawa semangat kompetisi yang sehat dan semakin memperkaya variasi dalam dunia olahraga global."]
a = tfidf_vectorizer.transform(data).toarray()
b = lda.transform(a)
model.predict(b)

array(['Olahraga'], dtype=object)

In [None]:
joblib.dump(lda, "lda svm.pkl")
joblib.dump(model, "svm.pkl")

['svm.pkl']

### Modelling SVM without LDA

In [None]:
# Membuat model Naive Bayes
svm = SVC()

# Melatih model nbada data pelatihan
svm.fit(X_train, y_train)

# Melakukan prediksi pada data pengujian
y_pred_svm = svm.predict(X_test)

# Menghitung akurasi
accuracy = accuracy_score(y_test, y_pred_svm)
print("Akurasi:", accuracy)

# Menampilkan laporan klasifikasi
print("Laporan Klasifikasi:")
print(classification_report(y_test, y_pred_svm))

# Menampilkan matriks kebingungan
confusion = confusion_matrix(y_test, y_pred_svm)
print("Confusion Matriks:")
print(confusion)

Akurasi: 0.9575471698113207
Laporan Klasifikasi:
              precision    recall  f1-score   support

    Kriminal       0.97      0.94      0.96        68
    Olahraga       0.99      0.96      0.97        77
     Politik       0.92      0.97      0.94        67

    accuracy                           0.96       212
   macro avg       0.96      0.96      0.96       212
weighted avg       0.96      0.96      0.96       212

Confusion Matriks:
[[64  0  4]
 [ 1 74  2]
 [ 1  1 65]]


In [None]:
data = ["Sebuah gelombang atlet muda dari berbagai negara telah menghebohkan dunia olahraga dengan penampilan gemilang mereka dalam kompetisi internasional terkini. Para atlet muda ini menunjukkan potensi luar biasa mereka di berbagai cabang olahraga, mengukir prestasi yang memukau dan memberikan harapan akan masa depan yang cerah bagi dunia olahraga. Salah satu momen paling mencolok terjadi dalam kompetisi atletik di Kejuaraan Dunia Junior yang diadakan di Tokyo, Jepang. Sejumlah atlet muda dari berbagai benua menampilkan kemampuan luar biasa mereka, memecahkan rekor dunia junior dan menorehkan prestasi yang memukau. Dalam lomba lari 100 meter, seorang atlet remaja dari Amerika Serikat memperlihatkan kecepatan gemilangnya dengan mencatat waktu yang memecahkan rekor dunia dalam kategori usianya. Selain itu, cabang olahraga renang juga menjadi sorotan utama. Seorang perenang muda dari Australia berhasil mengamankan emas dalam lomba gaya bebas 200 meter, sementara atlet renang muda lainnya dari Tiongkok mengejutkan dunia dengan teknik yang begitu memukau dalam gaya kupu-kupu, memecahkan rekor nasional junior. Tak hanya dalam atletik dan renang, olahraga lainnya seperti bulu tangkis, tenis, dan gulat juga menyaksikan penampilan gemilang dari atlet-atlet muda berbakat. Mereka berhasil menyingkirkan para pesaing dengan kecakapan teknis yang tinggi dan semangat bertanding yang luar biasa. Kepiawaian atlet-atlet muda ini tidak hanya mengundang decak kagum, tetapi juga menandai masa depan yang cerah bagi olahraga dunia. Mereka tidak hanya menjadi inspirasi bagi generasi mendatang, tetapi juga membawa semangat kompetisi yang sehat dan semakin memperkaya variasi dalam dunia olahraga global."]
tfidf_matrix = tfidf_vectorizer.transform(data).toarray()
nb.predict(tfidf_matrix)

array(['Olahraga'], dtype='<U8')

In [None]:
joblib.dump(nb, "SVM (Asli).pkl")

['SVM (Asli).pkl']