In [2]:
!pip install Sastrawi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 25.9 MB/s 
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [None]:
import nltk
nltk.download('stopwords')

In [3]:
# import library
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import pickle
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
from sklearn.metrics import classification_report

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Acquisition

In [6]:
data = pd.read_excel('/content/drive/MyDrive/MSIB_2/Final Project/PA/dataset/dataset.xlsx')

In [7]:
data = data.reset_index(drop=True)

In [8]:
# Melihat 5 data teratas
data.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,"Kamis, 6 Agustus 2020 13:00-14:30 WIB. Simak d...",hoax
1,1,Perdana Menteri (PM) Selandia Baru Jacinda Ard...,valid
2,2,Hasil Periksa Fakta Indri Pramesti Widyaningru...,hoax
3,3,"Padahal, Plt Dinas Pendidikan Klaten Yunanta t...",valid
4,4,Hasil Periksa Fakta Novita Kusuma Wardhani (In...,hoax


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1200 non-null   int64 
 1   text        1200 non-null   object
 2   label       1200 non-null   object
dtypes: int64(1), object(2)
memory usage: 28.2+ KB


In [10]:
data.groupby('label').count()

Unnamed: 0_level_0,Unnamed: 0,text
label,Unnamed: 1_level_1,Unnamed: 2_level_1
hoax,600,600
valid,600,600


In [None]:
data.iloc[836,:]

Unnamed: 0     236
text             0
label         hoax
Name: 836, dtype: object

In [None]:
data = data.drop(836,axis=0)
data = data.reset_index(drop=True)

In [None]:
# Melakukan labeling
cat = []
for i in range(len(data)):
    if data.label[i] == 'valid':
        cat.append(1)
    else:
        cat.append(0)
        
data['cat'] = cat
data.head()

Unnamed: 0.1,Unnamed: 0,text,label,cat
0,0,"Kamis, 6 Agustus 2020 13:00-14:30 WIB. Simak d...",hoax,0
1,1,Perdana Menteri (PM) Selandia Baru Jacinda Ard...,valid,1
2,2,Hasil Periksa Fakta Indri Pramesti Widyaningru...,hoax,0
3,3,"Padahal, Plt Dinas Pendidikan Klaten Yunanta t...",valid,1
4,4,Hasil Periksa Fakta Novita Kusuma Wardhani (In...,hoax,0


In [None]:
# melihat apakah terdapat missing value
data.isnull().sum()

Unnamed: 0    0
text          0
label         0
cat           0
dtype: int64

# Text Preprocessing

## Case Folding

In [None]:
# membuat fungsi case folding
def casefolding(text):
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+','',text)
    text = re.sub(r'[-+]?[0-9]+','',text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub('\n',' ',text)
    text = text.strip()
    return text

## Word Normalization

In [None]:
# download corpus akronim
!wget https://raw.githubusercontent.com/ksnugroho/klasifikasi-spam-sms/master/data/key_norm.csv

In [None]:
key_norm = pd.read_csv('key_norm.csv')
key_norm.head(10)

Unnamed: 0,_id,singkat,hasil
0,1,abis,habis
1,2,accent,tekanan
2,3,accept,terima
3,4,accident,kecelakaan
4,5,achievement,prestasi
5,6,acra,acara
6,7,acrany,acaranya
7,8,acrnya,acaranya
8,9,action,aksi
9,10,active,aktif


In [None]:
# membuat fungsi untuk menormalisasi teks
def text_normalize(text):
  text = ' '.join([key_norm[key_norm['singkat'] == word]['hasil'].values[0] if (key_norm['singkat'] == word).any() else word for word in text.split()])
  text = str.lower(text)
  return text

## Stopwords Removal

In [None]:
# memanggil stopwords berbahasa indonesia
stopwords_ind = stopwords.words('indonesian')

In [None]:
# membuat fungsi untuk remove stopwords
def remove_stopwords(text):
    clean_words = []
    text = text.split()
    for word in text:
        if word not in stopwords_ind:
            clean_words.append(word)
    return " ".join(clean_words)

## Stemming

In [None]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# membuat fungsi untuk langkah stemming bahasa indonesia
def stemming(text):
  text = stemmer.stem(text)
  return text

## Text Preprocessing Pipeline

In [None]:
# membuat fungsi text preprocessing
def text_preprocessing(text):
  text = casefolding(text)
  text = text_normalize(text)
  text = remove_stopwords(text)
  text = stemming(text)
  return text

In [None]:
%%time
data['clean_teks'] = data['text'].apply(text_preprocessing)

CPU times: total: 6min 54s
Wall time: 6min 59s


In [None]:
# Melihat apakah terapat duplikasi data
data.duplicated(subset=['clean_teks']).any()

True

In [None]:
# Merapikan data yang memiliki duplikasi
data = data.drop_duplicates(subset=['clean_teks'],keep='first')
data = data.reset_index(drop=True)

In [None]:
# Menyimpan dataset hasil preprocessing ke dalam excel 
data.to_excel('/content/drive/MyDrive/MSIB_2/Final Project/PA/dataset/data_scraping_clean.xlsx')

# Feature Engineering

In [None]:
# Memanggil dataset
data = pd.read_excel('/content/drive/MyDrive/MSIB_2/Final Project/PA/dataset/data_scraping_clean.xlsx')

In [None]:
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,label,cat,clean_teks
0,0,0,"Kamis, 6 Agustus 2020 13:00-14:30 WIB. Simak d...",hoax,0,kamis agustus wib simak coronavirusfacts factc...
1,1,1,Perdana Menteri (PM) Selandia Baru Jacinda Ard...,valid,1,perdana menteri pm selandia jacinda ardern nya...
2,2,2,Hasil Periksa Fakta Indri Pramesti Widyaningru...,hoax,0,hasil periksa fakta indri pramesti widyaningru...
3,3,3,"Padahal, Plt Dinas Pendidikan Klaten Yunanta t...",valid,1,plt dinas didik klaten yunanta izin sekolah ad...
4,4,4,Hasil Periksa Fakta Novita Kusuma Wardhani (In...,hoax,0,hasil periksa fakta novita kusuma wardhani ins...


In [None]:
data.groupby('cat').count()

Unnamed: 0_level_0,Unnamed: 0.1,Unnamed: 0,text,label,clean_teks
cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,558,558,558,558,558
1,576,576,576,576,576


## Feature Extraction

TFIDF dan N-Gram

In [None]:
x = data['clean_teks']
y = data['cat']

In [None]:
vec_tf_idf = TfidfVectorizer(ngram_range = (1,1))
vec_tf_idf.fit(x)

x_tf_idf = vec_tf_idf.transform(x)

In [None]:
vec_tf_idf.vocabulary_

{'kamis': 1538,
 'agustus': 42,
 'wib': 3837,
 'simak': 3199,
 'coronavirusfacts': 630,
 'factcheck': 927,
 'perdana': 2583,
 'menteri': 2120,
 'pm': 2662,
 'selandia': 3101,
 'jacinda': 1394,
 'ardern': 187,
 'nyata': 2362,
 'positif': 2696,
 'infeksi': 1312,
 'virus': 3784,
 'corona': 628,
 'hasil': 1168,
 'periksa': 2588,
 'fakta': 936,
 'indri': 1307,
 'pramesti': 2716,
 'widyaningrum': 3843,
 'anggota': 138,
 'komisariat': 1709,
 'mafindo': 1982,
 'universitas': 3714,
 'gunadarma': 1112,
 'puisi': 2775,
 'modern': 2179,
 'tulis': 3645,
 'catherine': 564,
 'omeara': 2400,
 'plt': 2660,
 'dinas': 744,
 'didik': 736,
 'klaten': 1691,
 'yunanta': 3889,
 'izin': 1392,
 'sekolah': 3090,
 'ada': 23,
 'studi': 3309,
 'wisata': 3853,
 'daerah': 649,
 'ppkm': 2707,
 'novita': 2343,
 'kusuma': 1819,
 'wardhani': 3820,
 'institut': 1335,
 'ilmu': 1276,
 'sosial': 3267,
 'politik': 2675,
 'jakarta': 1404,
 'informasi': 1317,
 'salah': 3001,
 'dokter': 790,
 'spesialis': 3278,
 'sakit': 2998,
 

In [None]:
len(vec_tf_idf.get_feature_names())



3908

In [None]:
x1 = x_tf_idf.toarray()
data_tf_idf = pd.DataFrame(x1,columns=vec_tf_idf.get_feature_names())
data_tf_idf.head(10)

Unnamed: 0,aa,abad,abai,abang,abar,abdul,abdullah,abhynewscom,aboe,aborigin,...,zero,zerocovid,zhang,zhejiang,zhuhai,zimbabwe,zon,zoo,zulkiefliemansyah,zulkifli
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.298831,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature Selection

In [None]:
x_train = np.array(data_tf_idf)
y_train = np.array(y)

In [None]:
x_train.shape

(1134, 3908)

In [None]:
chi2_features = SelectKBest(chi2, k=1000)

x_kbest_features = chi2_features.fit_transform(x_train,y_train)

In [None]:
print('original feature number:',x_train.shape[1])
print('reduced feature number:',x_kbest_features.shape[1])

original feature number: 3908
reduced feature number: 1000


In [None]:
data_new = pd.DataFrame(chi2_features.scores_,columns=['nilai'])

In [None]:
data_new.head(10)

Unnamed: 0,nilai
0,0.281451
1,0.279321
2,0.892009
3,0.215996
4,0.286691
5,1.273515
6,0.325291
7,0.351628
8,0.47119
9,0.298664


In [None]:
features = vec_tf_idf.get_feature_names()
data_new['fitur'] = features



In [None]:
data_new.head(10)

Unnamed: 0,nilai,fitur
0,0.281451,aa
1,0.279321,abad
2,0.892009,abai
3,0.215996,abang
4,0.286691,abar
5,1.273515,abdul
6,0.325291,abdullah
7,0.351628,abhynewscom
8,0.47119,aboe
9,0.298664,aborigin


In [None]:
data_new.sort_values(by='nilai',ascending=False)

Unnamed: 0,nilai,fitur
936,3.969856e+01,fakta
2588,2.766364e+01,periksa
1168,2.619844e+01,hasil
1317,2.096498e+01,informasi
3714,1.817437e+01,universitas
...,...,...
1198,1.049329e-05,hibur
738,5.921719e-06,digital
2903,1.150511e-06,reservasi
1887,2.249592e-07,lemak


In [None]:
# menampilkan fitur- fitur yang terpilih berdasarkan nilai k tertinggi
mask = chi2_features.get_support()
new_feature = []
for bool,f in zip(mask,features):
    if bool:
        new_feature.append(f)
    selected_feature = new_feature
selected_feature

['abai',
 'abdul',
 'acara',
 'adenovirus',
 'adi',
 'adil',
 'adu',
 'agama',
 'agustus',
 'ahli',
 'ahmad',
 'aidit',
 'aids',
 'airbus',
 'airlangga',
 'aisyah',
 'ajak',
 'ajang',
 'akibat',
 'aku',
 'akun',
 'akut',
 'al',
 'alami',
 'alas',
 'alat',
 'aliansi',
 'alir',
 'ambil',
 'amerika',
 'ampel',
 'an',
 'anak',
 'analis',
 'ancam',
 'ancol',
 'and',
 'andika',
 'andini',
 'anggap',
 'anggar',
 'anggota',
 'anggur',
 'angkot',
 'angola',
 'ani',
 'anies',
 'anjlok',
 'antibodi',
 'antisipasi',
 'apa',
 'arab',
 'arabia',
 'aramco',
 'ardi',
 'area',
 'ari',
 'arie',
 'arief',
 'ariel',
 'armando',
 'arsul',
 'arti',
 'artikel',
 'aryalasa',
 'as',
 'asal',
 'asasi',
 'aseanas',
 'asli',
 'asri',
 'atap',
 'atas',
 'awas',
 'ayu',
 'azis',
 'aziz',
 'baca',
 'bagi',
 'bahas',
 'bahri',
 'baik',
 'balap',
 'baliho',
 'balkonjazz',
 'ban',
 'bandara',
 'bandung',
 'bangun',
 'bantah',
 'banyuwangi',
 'barel',
 'barubaru',
 'baswedan',
 'bawa',
 'bawang',
 'bawaslu',
 'bayi',
 '

In [None]:
# Membuat vocabulary baru berdasarkan fitur yang terseleksi
# Ini digunakan untuk mengenerate fitur vector tf-idf pada proses deployment

new_selected_features = {}

for (k,v) in vec_tf_idf.vocabulary_.items():
    if k in selected_feature:
        new_selected_features[k] = v

new_selected_features

{'agustus': 42,
 'factcheck': 927,
 'perdana': 2583,
 'nyata': 2362,
 'infeksi': 1312,
 'virus': 3784,
 'hasil': 1168,
 'periksa': 2588,
 'fakta': 936,
 'indri': 1307,
 'pramesti': 2716,
 'widyaningrum': 3843,
 'anggota': 138,
 'komisariat': 1709,
 'mafindo': 1982,
 'universitas': 3714,
 'gunadarma': 1112,
 'puisi': 2775,
 'tulis': 3645,
 'didik': 736,
 'sekolah': 3090,
 'wisata': 3853,
 'novita': 2343,
 'kusuma': 1819,
 'wardhani': 3820,
 'institut': 1335,
 'ilmu': 1276,
 'sosial': 3267,
 'jakarta': 1404,
 'informasi': 1317,
 'salah': 3001,
 'spesialis': 3278,
 'sakit': 2998,
 'the': 3545,
 'claim': 609,
 'flu': 980,
 'pandemic': 2471,
 'of': 2381,
 'foto': 993,
 'riza': 2941,
 'dwi': 838,
 'tim': 3562,
 'kalimasada': 1531,
 'klaim': 1687,
 'covid': 636,
 'ibadah': 1245,
 'islam': 1371,
 'dasar': 669,
 'data': 672,
 'china': 592,
 'antisipasi': 162,
 'wabah': 3803,
 'pmk': 2663,
 'lamongan': 1840,
 'satu': 3055,
 'sapi': 3040,
 'pasar': 2517,
 'suspek': 3372,
 'hepatitis': 1188,
 'mis

In [None]:
len(new_selected_features)

1000

In [None]:
# menyimpan corpus
pickle.dump(new_selected_features,open('/content/drive/MyDrive/MSIB_2/Final Project/PA/model/selected_features.pkl','wb'))

In [None]:
# Menampilkan Fitur-fitur yang sudah diseleksi 
# beserta nilai vektornya pada keseluruhan data untuk dijalankan pada proses Machine learning

# menampilkan fitur yang terpilih sesuai parameter k yang ditentukan sebelumnya

data_selected_feature = pd.DataFrame(x_kbest_features,columns=selected_feature)
data_selected_feature

Unnamed: 0,abai,abdul,acara,adenovirus,adi,adil,adu,agama,agustus,ahli,...,wujud,ya,yerikho,youtube,yuda,yusril,zahra,zat,zero,zon
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.348046,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1129,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
selected_x = x_kbest_features
selected_x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Modelling

Naive Bayes

In [None]:
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,label,cat,clean_teks
0,0,0,"Kamis, 6 Agustus 2020 13:00-14:30 WIB. Simak d...",hoax,0,kamis agustus wib simak coronavirusfacts factc...
1,1,1,Perdana Menteri (PM) Selandia Baru Jacinda Ard...,valid,1,perdana menteri pm selandia jacinda ardern nya...
2,2,2,Hasil Periksa Fakta Indri Pramesti Widyaningru...,hoax,0,hasil periksa fakta indri pramesti widyaningru...
3,3,3,"Padahal, Plt Dinas Pendidikan Klaten Yunanta t...",valid,1,plt dinas didik klaten yunanta izin sekolah ad...
4,4,4,Hasil Periksa Fakta Novita Kusuma Wardhani (In...,hoax,0,hasil periksa fakta novita kusuma wardhani ins...


In [None]:
x = selected_x
y = data['cat']

# membuat data train dan data test
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.3,random_state=42)

In [None]:
print('Banyak data x_train :',len(x_train))
print('Banyak data x_test  :',len(x_test))
print('Banyak data y_train :',len(y_train))
print('Banyak data y_test  :',len(y_test))

Banyak data x_train : 793
Banyak data x_test  : 341
Banyak data y_train : 793
Banyak data y_test  : 341


In [None]:
# Training model
text_algorithm = BernoulliNB()
model_nb = text_algorithm.fit(x_train,y_train)

In [None]:
# menyimpan model ke pickle
pickle.dump(model_nb,open('/content/drive/MyDrive/MSIB_2/Final Project/PA/model/bernoulli_nb.pkl','wb'))

In [None]:
y_pred_nb = text_algorithm.predict(x_test)
y_pred_nb

array([1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,

# Model Evaluation

In [None]:
# Melihat akurasi dari model yang dibuat
print('Akurasi Naive Bayes:',metrics.accuracy_score(y_test,y_pred_nb))

Akurasi Naive Bayes: 0.9325513196480938


In [None]:
print(classification_report(y_test,y_pred_nb))

              precision    recall  f1-score   support

           0       1.00      0.87      0.93       172
           1       0.88      1.00      0.94       169

    accuracy                           0.93       341
   macro avg       0.94      0.93      0.93       341
weighted avg       0.94      0.93      0.93       341

