In [1]:
#library
#General
import pandas as pd
import numpy as np
from statistics import mean
#Preprocessing
import nltk
from nltk.corpus import stopwords #Preprocessing stopword
from nltk.tokenize import TweetTokenizer #preprocessing
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import re
from scipy.stats import entropy
#Test split
from sklearn.model_selection import KFold,cross_val_score
#metrics
from sklearn.metrics import make_scorer, confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
#Untuk Gain Ratio
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer
#Untuk TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
#Untuk Naive Bayes
from sklearn.naive_bayes import MultinomialNB

In [2]:
#mengubah excel menjadi csv
# df = pd.read_excel('data 500 pn fix testing.xlsx')
# df.to_csv('data 500 pn fix tesing.csv', index=False)

In [3]:
#Dataset
df_raw = pd.read_csv ('data 500 pn fix.csv') #data positif negatif 500
df_raw.drop(['No'],axis=1,inplace=True)

In [4]:
df_raw.head()

Unnamed: 0,tweet,Sentimen
0,a @jaejenay ga kerasa udh setahun ak sekolah o...,1
1,"@subtanyarl Wkwkwk biar lebih hemat lah nder, ...",1
2,a @innerchild_ug saking lamanya sekolah online...,1
3,a @subtanyarl Gua juga pen sekolah offline. Tp...,1
4,@indo_Osky Wow diem2 selama sekolah online jad...,1


# Preprocessing

In [5]:
def preprocess(tweet):
    #remove angka
    tweet = re.sub('[0-9]+', '', tweet)
    
    #menghilangkan url
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','',tweet)
    
    #menghilangkan @username
    tweet = re.sub('@[^\s]+','',tweet)  
    
    #menghilangkan #hashtags
    tweet = re.sub(r'#[^\s]+', '', tweet)
    
    #trim
    tweet = tweet.strip('\'"')
    
    # Repeating words like hellloooo
    repeat_char = re.compile(r"(.)\1{1,}", re.IGNORECASE)
    tweet = repeat_char.sub(r"\1\1", tweet)
    tweet = re.sub("(.)\\1{2,}", "\\1", tweet)
    
    #remove tanda baca
    tweet = re.sub(r'[^\w]', ' ', tweet)

    #Mengubah ke huruf kecil
    tweet = tweet.lower()
    
    return tweet

df_raw['Preprocess'] = df_raw['tweet'].apply(lambda x: preprocess(x))

In [6]:
#Stopword unik
stopword_extend = open('stopword.txt','r').read().split()
#exclude stopwords
exc_stopwords = {'tidak', 'kurang'}

In [7]:
#stemming unik
import ast
file = open("stemmer_unik.txt", "r")
contents = file.read()
stemmer_uniq = ast.literal_eval(contents)
file.close()

stemmer_uniq

{'ade': 'adik',
 'adek': 'adik',
 'adekku': 'adik',
 'adekkuu': 'adik',
 'adenya': 'adik',
 'agk': 'agak',
 'ahahahah': 'ahahah',
 'aje': 'aja',
 'ak': 'aku',
 'akny': 'aku',
 'akuu': 'aku',
 'allahh': 'allah',
 'anjg': 'ajg',
 'anjirr': 'anjir',
 'anjr': 'anjir',
 'ank': 'anak',
 'anknya': 'anak',
 'anyg': 'ajg',
 'ap': 'apa',
 'aplgi': 'apalagi',
 'ayoo': 'ayo',
 'banya': 'banyak',
 'bdua': 'berdua',
 'bebass': 'bebas',
 'becanda': 'bercanda',
 'beneran': 'bener',
 'besaf': 'besar',
 'bestnya': 'best',
 'bet': 'banget',
 'bgi': 'bagi',
 'bgitu': 'gitu',
 'bgt': 'banget',
 'bgtt': 'banget',
 'bgtu': 'gitu',
 'bimbelnya': 'bimbel',
 'bjir': 'anjir',
 'bkn': 'bukan',
 'blg': 'bilang',
 'blgnya': 'bilangnya',
 'blh': 'boleh',
 'bljr': 'belajar',
 'blm': 'belum',
 'bln': 'bulan',
 'bngt': 'banget',
 'bnyk': 'banyak',
 'bodo': 'bodoh',
 'bole': 'boleh',
 'boleeh': 'boleh',
 'bosen': 'bosan',
 'br': 'baru',
 'bru': 'baru',
 'bs': 'bisa',
 'bsa': 'bisa',
 'bt': 'bete',
 'bukak': 'buka',
 'by

In [8]:
#Stopword, Stemming, dan Tokenisasi
def clean_tweets(Preprocess):
    stopwords_indonesia = set(stopwords.words('indonesian')).difference(exc_stopwords)
    new_stopwords_indonesia = stopwords_indonesia.union(stopword_extend)
    stopwords_english = stopwords.words('english')
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    #remove coma
    Preprocess = re.sub(r",",'',Preprocess)
 
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(Preprocess)
 
    tweets_clean = []    
    for word in tweet_tokens:
        if (word not in new_stopwords_indonesia): 
            if (word not in stopwords_english):# remove stopwords
                if (word in stemmer_uniq) : 
                    stemed_word = word
                    stemed_word = stemed_word.replace(word, stemmer_uniq[word]) # stemming word unik
                    stemed_word = stemmer.stem(stemed_word) #stemming library
                    tweets_clean.append(stemed_word)
                else :
                    stem_word = stemmer.stem(word)
                    tweets_clean.append(stem_word)

    return tweets_clean
df_raw['Token'] = df_raw['Preprocess'].apply(lambda x: clean_tweets(x))

In [9]:
df_raw

Unnamed: 0,tweet,Sentimen,Preprocess,Token
0,a @jaejenay ga kerasa udh setahun ak sekolah o...,1,a ga kerasa udh setahun ak sekolah online ud...,"[gak, rasa, udah, tahun, aku, sekolah, online,..."
1,"@subtanyarl Wkwkwk biar lebih hemat lah nder, ...",1,wkwkwk biar lebih hemat lah nder apalagi sek...,"[wkwk, biar, hemat, nder, sekolah, ama, kuliah..."
2,a @innerchild_ug saking lamanya sekolah online...,1,a saking lamanya sekolah online sampe lupa ta...,"[saking, sekolah, online, sampe, lupa, tanggal..."
3,a @subtanyarl Gua juga pen sekolah offline. Tp...,1,a gua juga pen sekolah offline tpi enak onli...,"[gue, ken, sekolah, offline, tapi, enak, onlin..."
4,@indo_Osky Wow diem2 selama sekolah online jad...,1,wow diem selama sekolah online jadi artis wkwkwk,"[wow, diam, sekolah, online, artis, wkwk]"
5,klo disuru milih mending sklh online apa offli...,1,klo disuru milih mending sklh online apa offli...,"[kalo, disuru, milih, mending, sekolah, online..."
6,"gw pengen sehari aja di sekolah, kek dulu sblm...",1,gw pengen sehari aja di sekolah kek dulu sblm...,"[gue, ken, hari, aja, sekolah, kek, belum, pan..."
7,lucu juga sih masnya nyalahin sekolah online b...,1,lucu juga sih masnya nyalahin sekolah online b...,"[lucu, sih, mas, nyalahin, sekolah, online, bi..."
8,Sekolah lain US nya offline. Untung aku masih ...,1,sekolah lain us nya offline untung aku masih ...,"[sekolah, us, offline, untung, online]"
9,"prefer sekolah onnline, ujian online sih https...",1,prefer sekolah onnline ujian online sih,"[prefer, sekolah, onnline, uji, online, sih]"


In [10]:
Token = df_raw['Token']
Sentimen = df_raw['Sentimen']
Preprocess = df_raw['Preprocess']

# Feature Extraction

In [11]:
count_vector = CountVectorizer(analyzer=lambda x: x)
count_vector.fit(Token)
doc_array = count_vector.transform(Token).toarray()

In [12]:
frequency_matrix = pd.DataFrame(doc_array,columns=count_vector.get_feature_names())
df_gr = frequency_matrix #mengubah df menjadi bag of words
df_gr.insert(len(df_gr.columns), 'Sentimen', df_raw['Sentimen']) #menambahkan kolom sentimen ke dataframe gain ratio

In [13]:
df_gr

Unnamed: 0,aamiin,abang,abis,absen,adakan,adaptasi,addict,adeku,adik,adil,...,year,yeayy,yes,yoi,yok,youtube,yt,zain,zoom,Sentimen
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Gain Ratio

In [56]:
# Mencari nilai Information gain menggunakan mutual_info_classif
X = df_gr.drop(['Sentimen'], axis=1)
y = df_gr.Sentimen

In [57]:
mutual_info = mutual_info_classif(X, y)
mutual_info

array([0.00140651, 0.        , 0.00568014, ..., 0.        , 0.        ,
       0.01540273])

In [58]:
mutual_info = pd.Series(mutual_info)
mutual_info.index = X.columns
mutual_info.sort_values(ascending=False)

adik          0.071202
goda          0.069083
alah          0.068218
dok           0.066894
aplikasi      0.066364
tentang       0.065591
tolong        0.063492
contek        0.063206
pr            0.061065
jumpa         0.058304
kurang        0.058070
tenaga        0.056261
maneh         0.055982
wkwkwkwk      0.055942
gara          0.055715
lanjur        0.054729
hikds         0.053940
sibuk         0.053457
muluk         0.053204
ekonomi       0.052115
tuju          0.051817
subschfess    0.050825
eksperimen    0.050252
biaya         0.050003
pandang       0.049814
spm           0.049764
ngatur        0.049439
youtube       0.048683
sambung       0.048445
mager         0.047198
                ...   
prinsip       0.000000
intn          0.000000
print         0.000000
private       0.000000
interaksi     0.000000
poll          0.000000
jalan         0.000000
perkara       0.000000
pingin        0.000000
persis        0.000000
pesantren     0.000000
pesona        0.000000
jeda       

In [59]:
#menghitung nilai split info
total = X.sum(axis=0)

prob_dict = {}
for column in X:
    prob_dict[column] = []
    for row in X[column]:
        if total[column] == 0 :
            prob_dict[column].append(0.0)
        else :
            prob = row / total[column]
            prob_dict[column].append(prob)

entropy_dict = {}

for term in prob_dict:
    entropy_dict[term] = entropy(prob_dict[term], base=2)
    
entropy_dict

{'aamiin': 0.0,
 'abang': 0.0,
 'abis': 1.584962500721156,
 'absen': 2.3219280948873626,
 'adakan': 0.0,
 'adaptasi': 1.0,
 'addict': 0.0,
 'adeku': 0.0,
 'adik': 2.584962500721156,
 'adil': 0.0,
 'adkel': 0.0,
 'aduh': 1.0,
 'agak': 0.0,
 'ah': 1.0,
 'ahahah': 1.0,
 'ain': 0.0,
 'aja': 6.398781141111593,
 'ajaib': 0.0,
 'ajak': 2.0,
 'ajang': 0.0,
 'ajar': 5.957142777383321,
 'ajg': 0.0,
 'akal': 0.0,
 'akibat': 1.0,
 'akses': 3.0,
 'aktif': 0.0,
 'aktivis': 0.0,
 'aktivitas': 0.0,
 'aku': 2.8073549220576046,
 'alah': 0.0,
 'alam': 1.0,
 'alami': 0.0,
 'alas': 1.584962500721156,
 'alesan': 1.0,
 'alfa': 0.0,
 'alhasil': 0.0,
 'alias': 2.0,
 'alih': 0.0,
 'allah': 2.584962500721156,
 'allo': 0.0,
 'ama': 1.0,
 'aman': 1.0,
 'ambil': 1.0,
 'ambis': 0.0,
 'amburadul': 0.0,
 'amin': 0.0,
 'ampun': 0.0,
 'anak': 6.309425513491128,
 'ancam': 0.0,
 'ancang': 0.0,
 'ancur': 0.0,
 'andai': 1.0,
 'anggap': 0.9182958340544894,
 'angka': 0.9182958340544894,
 'angkasa': 0.0,
 'angket': 0.0,
 'angk

In [60]:
#Menghitung nilai gain ratio
gain_ratio_dict = {}
for column in X:
    if mutual_info[column] != 0 and entropy_dict[column] == 0:
        gain_ratio_dict[column] = 0
    else:
        gain_ratio_dict[column] = mutual_info[column]/entropy_dict[column]

  import sys


In [62]:
pd.set_option('mode.use_inf_as_na', True)
gain_ratio_dict = pd.Series(gain_ratio_dict).dropna() #menghapus nilai NaN
gr_max = gain_ratio_dict.sort_values(ascending=False)
gr_max #seluruh fitur gr

goda          0.069083
tentang       0.065591
contek        0.063206
tenaga        0.056261
muluk         0.053204
ekonomi       0.052115
vibesnya      0.046382
maaf          0.044230
faham         0.043590
erti          0.042746
mental        0.039181
alam          0.039172
adaptasi      0.038617
toko          0.035691
lanjur        0.034530
pr            0.034067
angka         0.033891
gerak         0.033832
minim         0.033503
pandang       0.033209
youtube       0.032455
boleh         0.030143
ilmu          0.029778
matematika    0.028243
introvert     0.027756
adik          0.027545
penuh         0.026792
semua         0.026735
akibat        0.026554
nanya         0.026433
                ...   
muncul        0.000000
ngatur        0.000000
ngegambar     0.000000
nyambung      0.000000
ngurusin      0.000000
nyalin        0.000000
nuansa        0.000000
ntah          0.000000
notabennya    0.000000
normal        0.000000
nongkrong     0.000000
nolak         0.000000
ning       

In [20]:
gr_500 = gr_max.head(500) #500 fitur teratas

In [21]:
gr_1000 = gr_max.head(1000) #1000 fitur teratas

In [63]:
#membuat dataframe baru berdasarkan top feature

df_gr[gr_max.index] #gr_500 dan gr_1000 disini
df_gr_traintest = df_gr[gr_max.index]
df_gr_traintest.insert(len(df_gr_traintest.columns),column='Sentimen',value=df_raw.Sentimen)
df_gr_traintest.head()

Unnamed: 0,goda,tentang,contek,tenaga,muluk,ekonomi,vibesnya,maaf,faham,erti,...,ngezoom,ngerti,ngerokok,ngerasain,ngerasa,ngembangin,ngeliat,ngejelasin,aamiin,Sentimen
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [73]:
#Confussion Matrix & Akurasi
X = df_gr_traintest.drop(['Sentimen'], axis=1)
y = df_gr_traintest.Sentimen

kf = KFold(n_splits=10, shuffle=True, random_state=0)
conf_matrix=np.zeros([2,2])
akurasi = []
precision = []
recall = []
f1 = []
data_test_gr = []
pred_result_gr = []

model = MultinomialNB()

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    confusion_matrix(y_test, pred)
    accuracy_score(y_test, pred)
    precision_score(y_test, pred, average='micro')
    recall_score(y_test, pred, average='micro')
    f1_score(y_test, pred, average='micro')
    
    akurasi.append(accuracy_score(y_test, pred))
    precision.append(precision_score(y_test, pred, average='micro'))
    recall.append(recall_score(y_test, pred, average='micro'))
    f1.append(f1_score(y_test, pred, average='micro'))
    
    #List test data index
    data_test_gr.extend(test_index)
    
    #list hasil yang di prediksi
    pred_result_gr.extend(pred)
    
    conf_matrix = conf_matrix + confusion_matrix(y_test, pred)
      
print(conf_matrix.astype(int))
print("\nAkurasi : ", mean(akurasi))
print("Precision : ", mean(precision))
print("Recall : ", mean(recall))
print("F1 : ", mean(f1))

[[176  74]
 [ 89 161]]

Akurasi :  0.6739999999999999
Precision :  0.6739999999999999
Recall :  0.6739999999999999
F1 :  0.6739999999999999


In [24]:
df_tes_gr = df_raw.loc[df_raw.index[data_test_gr]]
df_tes_gr['Predicted'] = pred_result_gr

In [25]:
df_tes_gr

Unnamed: 0,tweet,Sentimen,Preprocess,Token,Predicted
1,"@subtanyarl Wkwkwk biar lebih hemat lah nder, ...",1,wkwkwk biar lebih hemat lah nder apalagi sek...,"[wkwk, biar, hemat, nder, sekolah, ama, kuliah...",2
15,pls sekolah gue juga termasuk cepet banget ada...,1,pls sekolah gue juga termasuk cepet banget ada...,"[pls, sekolah, gue, cepat, banget, adaptasi, o...",2
21,"Betul, makanya aku nggak ikut menolak rencana ...",1,betul makanya aku nggak ikut menolak rencana ...,"[enggak, tolak, rencana, sekolah, buka, ya, se...",1
37,Terus pagi2 ditelpon sama ibuk camer wkwkw. Mi...,1,terus pagi ditelpon sama ibuk camer wkwkw min...,"[pagi, telpon, ibu, camer, wkwk, tolong, besok...",2
45,Malah merasa lebih produktif sejak sekolah onl...,1,malah merasa lebih produktif sejak sekolah onl...,"[produktif, sekolah, online, things, sekolah]",1
46,@adit_wr @BTannadi @Felicia_Putri online ajala...,1,online ajalah mana ada lagi yg beli saham k...,"[online, aja, yang, beli, saham, langsung, kal...",2
76,klo gw di suruh milih sekolah offline apa onli...,1,klo gw di suruh milih sekolah offline apa onli...,"[kalo, gue, suruh, milih, sekolah, offline, on...",1
90,"@erwinarnada Aku pilih online, sekarang aja ak...",1,aku pilih online sekarang aja aku dibuat nga...,"[pilih, online, aja, ngapain, karena, paksa, u...",1
96,50-50\n\nkangen sekolah offline tapi udah luma...,1,kangen sekolah offline tapi udah lumayan ny...,"[kangen, sekolah, offline, udah, lumayan, nyam...",1
102,Ya. Memang ada yg kesulitan sekolah online.\nT...,1,ya memang ada yg kesulitan sekolah online ta...,"[ya, yang, sulit, sekolah, online, yang, gak, ...",2


In [26]:
df_tes_gr.to_excel('Hasil tes gr.xlsx')

# TF-IDF

In [27]:
# max_ft = len(gain_ratio_dict.index) #mengambil fitur tertinggi berdasarkan panjang gain ratio

In [69]:
#max_features=500 #max feature 500 teratas
#max_features=1000 #max feature 1000 teratas
#max_features=1071 #max feature 1071 teratas sama dengan gain ratio
#no max_features seluruh data
tfidf_vector = TfidfVectorizer(analyzer=lambda x:x) #taruh max_features di sini
tfidf_vector.fit_transform(Token)
doc_array2 = tfidf_vector.transform(Token).toarray()

In [70]:
frequency_matrix_tfidf = pd.DataFrame(doc_array2,columns=tfidf_vector.get_feature_names())
df_tfidf = frequency_matrix_tfidf #mengubah df menjadi bag of words, dan menghilangkan multi index
df_tfidf.insert(len(df_tfidf.columns), 'Sentimen', Sentimen) #menambahkan kolom sentimen ke dataframe tfidf

In [71]:
df_tfidf

Unnamed: 0,aamiin,abang,abis,absen,adakan,adaptasi,addict,adeku,adik,adil,...,year,yeayy,yes,yoi,yok,youtube,yt,zain,zoom,Sentimen
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,1
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,1
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,1
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,1
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,1
5,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,1
6,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,1
7,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,1
8,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,1
9,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,1


In [72]:
#Confussion Matrix & Akurasi
X = df_tfidf.drop(['Sentimen'], axis=1)
y = df_tfidf.Sentimen

kf = KFold(n_splits=10,shuffle=True, random_state=0)
conf_matrix=np.zeros([2,2])
i = 0
akurasi = []
precision = []
recall = []
f1 = []
data_test_tfidf = []
pred_result_tfidf = []
model = MultinomialNB()

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    confusion_matrix(y_test, pred)
    accuracy_score(y_test, pred)
    precision_score(y_test, pred, average='micro')
    recall_score(y_test, pred, average='micro')
    f1_score(y_test, pred, average='micro')
    
    akurasi.append(accuracy_score(y_test, pred))
    precision.append(precision_score(y_test, pred, average='micro'))
    recall.append(recall_score(y_test, pred, average='micro'))
    f1.append(f1_score(y_test, pred, average='micro'))
    #List test data index
    data_test_tfidf.extend(test_index)
    
    #list hasil yang di prediksi
    pred_result_tfidf.extend(pred)
    
    conf_matrix = conf_matrix + confusion_matrix(y_test, pred)
    
print('\n',conf_matrix.astype(int))
print("\nAkurasi : ", mean(akurasi))
print("Precision : ", mean(precision))
print("Recall : ", mean(recall))
print("F1 : ", mean(f1))


 [[172  78]
 [ 81 169]]

Akurasi :  0.6819999999999999
Precision :  0.6819999999999999
Recall :  0.6819999999999999
F1 :  0.6819999999999999


In [32]:
# print("Data Test", data_test_tfidf)
# print(len(data_test_tfidf))
# print("Prediction Result", pred_result_tfidf)

In [33]:
df_tes_tfidf = df_raw.loc[df_raw.index[data_test_tfidf]]
df_tes_tfidf['Predicted'] = pred_result_tfidf