### **Case Folding**

In [None]:
import re
import string

In [None]:
# Case folding: lowercase every review in the 'Ulasan' column (the column is overwritten in place).
df_ulasan['Ulasan'] = df_ulasan['Ulasan'].str.lower()
print('Hasil Setelah Dilakukan Case Folding: \n')
print(df_ulasan['Ulasan'].head())

Hasil Setelah Dilakukan Case Folding: 

0                  trimksih
1                     keren
2    aplksi sangat membantu
3           sangat membantu
4                   terbaik
Name: Ulasan, dtype: object


In [None]:
#pembersihan data untuk karakter yang tidak digunakan
#hapus simbol-simbol
def remove_review_special(text):
    """Strip escape residue, non-ASCII characters, mentions, hashtags and links.

    Args:
        text: One review as a ``str``.

    Returns:
        The cleaned review. Whitespace is collapsed to single spaces and
        trimmed by the mention/link step; a bare ``http://`` or ``https://``
        left afterwards is replaced by a space.
    """
    # Literal two-character sequences (a backslash followed by t, n or u) and
    # any remaining backslash are turned into spaces, in this order.
    for escaped in ('\\t', '\\n', '\\u', '\\'):
        text = text.replace(escaped, " ")
    # Every non-ASCII character (emoji, CJK text, ...) is replaced by '?'.
    text = text.encode('ascii', 'replace').decode('ascii')
    # Mentions/hashtags and scheme://... links. The pattern is a raw string so
    # that \w and \S reach the regex engine instead of being treated as
    # (invalid) string escapes.
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+://\S+)", " ", text).split())
    # A scheme with nothing after it is not caught by the pattern above.
    return text.replace("http://", " ").replace("https://", " ")

df_ulasan['Ulasan'] = df_ulasan['Ulasan'].apply(remove_review_special)

#hapus angka
def remove_number(text):
    """Return ``text`` with every run of digits deleted (nothing is inserted in their place)."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

df_ulasan['Ulasan'] = df_ulasan['Ulasan'].apply(remove_number)

#hapus tanda baca (punctuation)
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Translation table mapping each punctuation code point to None (= delete).
    drop_table = {ord(symbol): None for symbol in string.punctuation}
    return text.translate(drop_table)

df_ulasan['Ulasan'] = df_ulasan['Ulasan'].apply(remove_punctuation)

#hapus spasi kosong pada awal dan akhir teks (leading & trailing)
def remove_whitescape_LT(text):
    """Return ``text`` without leading and trailing whitespace."""
    without_leading = text.lstrip()
    return without_leading.rstrip()

df_ulasan['Ulasan'] = df_ulasan['Ulasan'].apply(remove_whitescape_LT)

#hapus menghapus whitespace (spasi, tab, dan newline) yang berulang-ulang menjadi spasi tunggal
def remove_whitespace_multiple(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) into one space.

    Args:
        text: One review as a ``str``.

    Returns:
        ``text`` with each whitespace run replaced by a single ``' '``.
        Leading/trailing whitespace is reduced to one space, not removed.
    """
    # Raw string: '\s' inside a plain literal is an invalid string escape.
    return re.sub(r'\s+', ' ', text)

df_ulasan['Ulasan'] = df_ulasan['Ulasan'].apply(remove_whitespace_multiple)

#hapus single char
def remove_single_char(text):
    """Remove stand-alone single ASCII letters from ``text``.

    Args:
        text: One review as a ``str``.

    Returns:
        ``text`` without one-letter words. The whitespace that surrounded a
        removed letter is collapsed to a single space and the result is
        trimmed, so no double, leading or trailing spaces are left behind.
    """
    without_singles = re.sub(r"\b[a-zA-Z]\b", "", text)
    # Deleting a letter leaves its neighbouring spaces adjacent; tidy them up.
    return re.sub(r"\s+", " ", without_singles).strip()

df_ulasan['Ulasan'] = df_ulasan['Ulasan'].apply(remove_single_char)

In [None]:
df_ulasan['Ulasan']

0                                                trimksih
1                                                   keren
2                                  aplksi sangat membantu
3                                         sangat membantu
4                                                 terbaik
                              ...                        
5254    mudah lewat online  usah kekantor cabang bpjs ...
5255                            respon cepat aplikasi nya
5256                                                   ok
5257                 aplikasi sangat membanturespon cepat
5258       aplikasi ini sangat membantu saya terima kasih
Name: Ulasan, Length: 5259, dtype: object

### **Normalization**

In [None]:
df_kamusslang = pd.read_csv(path)
df_kamusslang.head()

Unnamed: 0,slang,formal,In-dictionary,context,category1,category2,category3
0,woww,wow,1.0,wow,elongasi,0,0
1,aminn,amin,1.0,Selamat ulang tahun kakak tulus semoga panjang...,elongasi,0,0
2,met,selamat,1.0,Met hari netaas kak!? Wish you all the best @t...,abreviasi,0,0
3,netaas,menetas,1.0,Met hari netaas kak!? Wish you all the best @t...,afiksasi,elongasi,0
4,keberpa,keberapa,0.0,Birthday yg keberpa kak?,abreviasi,0,0


In [None]:
# membuat kamus slang menjadi dictionary
slang_dict = dict(zip(df_kamusslang['slang'], df_kamusslang['formal']))

In [None]:
# fungsi untuk melakukan normalisasi dengan kamus slang
def normalize_slang(text):
    """Replace each whitespace-separated word of ``text`` by its ``slang_dict`` entry.

    Words without an entry in ``slang_dict`` are kept unchanged. The result is
    the words joined by single spaces.
    """
    return ' '.join(slang_dict.get(token, token) for token in text.split())

df_ulasan["Ulasan"] = df_ulasan["Ulasan"].apply(normalize_slang)

In [None]:
df_ulasan["Ulasan"]

0                                            terima kasih
1                                                   keren
2                                aplikasi sangat membantu
3                                         sangat membantu
4                                                 terbaik
                              ...                        
5254    mudah lewat online usah ke kantor cabang bpjs ...
5255                            respon cepat aplikasi nya
5256                                                  oke
5257                aplikasi sangat membantu respon cepat
5258       aplikasi ini sangat membantu saya terima kasih
Name: Ulasan, Length: 5259, dtype: object

### **Stemming**

In [None]:
pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [None]:
# Membuat stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
# Stem every review; the bound method is passed directly to apply.
df_ulasan["Ulasan"] = df_ulasan["Ulasan"].apply(stemmer.stem)

In [None]:
df_ulasan

Unnamed: 0,Username,Rating,Date,Ulasan,Kelas Label,Sentimen
0,Kazen Installasi,4,2/2/2023 23:53,terima kasih,positif,1
1,NAI BORNEO,5,2/2/2023 23:45,keren,positif,1
2,Nunung Nurhayati,5,2/2/2023 23:43,aplikasi sangat bantu,positif,1
3,Nanangjbi Jbi,5,2/2/2023 23:37,sangat bantu,positif,1
4,Nepri Yonce Luke,5,2/2/2023 23:31,baik,positif,1
...,...,...,...,...,...,...
5254,ARRAFLI channel,5,1/2/2023 3:29,mudah lewat online usah ke kantor cabang bpjs ...,positif,1
5255,Niko Sanivan,5,1/2/2023 1:32,respon cepat aplikasi nya,positif,1
5256,Hadi Sutan Mudo,5,1/2/2023 1:31,oke,positif,1
5257,Siska Amanda,5,1/2/2023 1:17,aplikasi sangat bantu respon cepat,positif,1


### **Tokenization**

In [None]:
import nltk
from nltk.tokenize import word_tokenize

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Split a review into its individual words
def word_tokenize_wrapper(text):
    """Return the result of ``word_tokenize(text)`` (``word_tokenize`` is imported from ``nltk.tokenize``)."""
    return word_tokenize(text)

df_ulasan['Ulasan_Tokenization'] = df_ulasan['Ulasan'].apply(word_tokenize_wrapper)

print('Hasil Tokenization: \n')
print(df_ulasan['Ulasan_Tokenization'])

Hasil Tokenization: 

0                                         [terima, kasih]
1                                                 [keren]
2                               [aplikasi, sangat, bantu]
3                                         [sangat, bantu]
4                                                  [baik]
                              ...                        
5254    [mudah, lewat, online, usah, ke, kantor, caban...
5255                       [respon, cepat, aplikasi, nya]
5256                                                [oke]
5257             [aplikasi, sangat, bantu, respon, cepat]
5258    [aplikasi, ini, sangat, bantu, saya, terima, k...
Name: Ulasan_Tokenization, Length: 5259, dtype: object


### **Stopword Removal**

In [None]:
import Sastrawi
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nltk.corpus import stopwords

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Menginisialisasi daftar stopword dari Sastrawi dan NLTK
factory = StopWordRemoverFactory()
stopwords_sastrawi = factory.get_stop_words()
stopwords_nltk = stopwords.words('indonesian')

In [None]:
stopwords_sastrawi

['yang',
 'untuk',
 'pada',
 'ke',
 'para',
 'namun',
 'menurut',
 'antara',
 'dia',
 'dua',
 'ia',
 'seperti',
 'jika',
 'jika',
 'sehingga',
 'kembali',
 'dan',
 'tidak',
 'ini',
 'karena',
 'kepada',
 'oleh',
 'saat',
 'harus',
 'sementara',
 'setelah',
 'belum',
 'kami',
 'sekitar',
 'bagi',
 'serta',
 'di',
 'dari',
 'telah',
 'sebagai',
 'masih',
 'hal',
 'ketika',
 'adalah',
 'itu',
 'dalam',
 'bisa',
 'bahwa',
 'atau',
 'hanya',
 'kita',
 'dengan',
 'akan',
 'juga',
 'ada',
 'mereka',
 'sudah',
 'saya',
 'terhadap',
 'secara',
 'agar',
 'lain',
 'anda',
 'begitu',
 'mengapa',
 'kenapa',
 'yaitu',
 'yakni',
 'daripada',
 'itulah',
 'lagi',
 'maka',
 'tentang',
 'demi',
 'dimana',
 'kemana',
 'pula',
 'sambil',
 'sebelum',
 'sesudah',
 'supaya',
 'guna',
 'kah',
 'pun',
 'sampai',
 'sedangkan',
 'selagi',
 'sementara',
 'tetapi',
 'apakah',
 'kecuali',
 'sebab',
 'selain',
 'seolah',
 'seraya',
 'seterusnya',
 'tanpa',
 'agak',
 'boleh',
 'dapat',
 'dsb',
 'dst',
 'dll',
 'dahulu

In [None]:
# Kata yang ingin dihapus
hapus_sastrawi = ['baik', 'lebih', 'tidak', 'bisa', 'guna', 'tolong', 'belum', 'dapat', 'ok']

In [None]:
# Sastrawi stopwords that are also listed in hapus_sastrawi, i.e. the words
# that are taken OUT of the stopword list.
filtered_words_sastrawi = [word for word in stopwords_sastrawi if word in hapus_sastrawi]

# Same words as a set
filtered_set_sastrawi = set(filtered_words_sastrawi)

# Final Sastrawi stopword list: the original stopwords minus the words above
stopwords_sastrawi_fix = list(set(stopwords_sastrawi).difference(filtered_set_sastrawi))

In [None]:
stopwords_sastrawi_fix

['pula',
 'kepada',
 'kami',
 'ya',
 'seperti',
 'atau',
 'saja',
 'hanya',
 'tapi',
 'dst',
 'nanti',
 'dengan',
 'dahulu',
 'karena',
 'para',
 'antara',
 'pun',
 'mereka',
 'seterusnya',
 'kecuali',
 'kembali',
 'sebetulnya',
 'ketika',
 'serta',
 'sebab',
 'lain',
 'anu',
 'amat',
 'secara',
 'sekitar',
 'ingin',
 'ini',
 'nggak',
 'sebagai',
 'pada',
 'oleh',
 'mengapa',
 'sebelum',
 'pasti',
 'begitu',
 'seolah',
 'demikian',
 'setiap',
 'itulah',
 'lagi',
 'dari',
 'dalam',
 'dll',
 'masih',
 'mari',
 'anda',
 'dan',
 'saya',
 'selagi',
 'sementara',
 'setidaknya',
 'tanpa',
 'agak',
 'apalagi',
 'seraya',
 'sehingga',
 'dsb',
 'untuk',
 'tentang',
 'seharusnya',
 'yaitu',
 'selain',
 'dimana',
 'toh',
 'itu',
 'bagi',
 'demi',
 'sambil',
 'melainkan',
 'agar',
 'juga',
 'ada',
 'yakni',
 'yang',
 'ia',
 'kemana',
 'setelah',
 'oh',
 'bagaimanapun',
 'bahwa',
 'dua',
 'ke',
 'namun',
 'daripada',
 'boleh',
 'walau',
 'dia',
 'sampai',
 'jika',
 'sesuatu',
 'terhadap',
 'di',
 't

In [None]:
stopwords_nltk

['ada',
 'adalah',
 'adanya',
 'adapun',
 'agak',
 'agaknya',
 'agar',
 'akan',
 'akankah',
 'akhir',
 'akhiri',
 'akhirnya',
 'aku',
 'akulah',
 'amat',
 'amatlah',
 'anda',
 'andalah',
 'antar',
 'antara',
 'antaranya',
 'apa',
 'apaan',
 'apabila',
 'apakah',
 'apalagi',
 'apatah',
 'artinya',
 'asal',
 'asalkan',
 'atas',
 'atau',
 'ataukah',
 'ataupun',
 'awal',
 'awalnya',
 'bagai',
 'bagaikan',
 'bagaimana',
 'bagaimanakah',
 'bagaimanapun',
 'bagi',
 'bagian',
 'bahkan',
 'bahwa',
 'bahwasanya',
 'baik',
 'bakal',
 'bakalan',
 'balik',
 'banyak',
 'bapak',
 'baru',
 'bawah',
 'beberapa',
 'begini',
 'beginian',
 'beginikah',
 'beginilah',
 'begitu',
 'begitukah',
 'begitulah',
 'begitupun',
 'bekerja',
 'belakang',
 'belakangan',
 'belum',
 'belumlah',
 'benar',
 'benarkah',
 'benarlah',
 'berada',
 'berakhir',
 'berakhirlah',
 'berakhirnya',
 'berapa',
 'berapakah',
 'berapalah',
 'berapapun',
 'berarti',
 'berawal',
 'berbagai',
 'berdatangan',
 'beri',
 'berikan',
 'berikut'

In [None]:
# kata yang ingin dihapus
hapus_nltk = ['baik', 'guna', 'kurang', 'lebih', 'mampu', 'siap', 'jelas', 'masih', 'benar', 'semakin', 'cukup', 'makin', 'terus', 'luar', 'boleh', 'tetap', 'belum',
              'diperlukan', 'kecil', 'amat', 'sebaik', 'penting', 'pantas', 'tidak']

In [None]:
# NLTK stopwords that are also listed in hapus_nltk, i.e. the words that are
# taken OUT of the stopword list.
filtered_words_nltk = [word for word in stopwords_nltk if word in hapus_nltk]

# Same words as a set
filtered_set_nltk = set(filtered_words_nltk)

# Final NLTK stopword list: the original stopwords minus the words above
stopwords_nltk_fix = list(set(stopwords_nltk).difference(filtered_set_nltk))

In [None]:
stopwords_nltk_fix

['atau',
 'lainnya',
 'terakhir',
 'saja',
 'bermacam',
 'dengan',
 'adapun',
 'digunakan',
 'melalui',
 'karena',
 'katakan',
 'sesudahnya',
 'ditunjuknya',
 'sebut',
 'ditujukan',
 'dimungkinkan',
 'menanti-nanti',
 'sebetulnya',
 'antaranya',
 'seingat',
 'disini',
 'bisa',
 'disebut',
 'diingatkan',
 'tiga',
 'mempersoalkan',
 'aku',
 'sekalipun',
 'mula',
 'selama-lamanya',
 'belakangan',
 'begitu',
 'rupanya',
 'demikian',
 'naik',
 'memberi',
 'dalam',
 'seolah-olah',
 'ungkap',
 'diantara',
 'beginian',
 'karenanya',
 'lanjutnya',
 'selamanya',
 'ucap',
 'semasih',
 'pihak',
 'meminta',
 'seharusnya',
 'sedemikian',
 'mengerjakan',
 'bagi',
 'itu',
 'toh',
 'andalah',
 'kala',
 'sesegera',
 'bagaimanakah',
 'melihat',
 'persoalan',
 'enggak',
 'bila',
 'selama',
 'biasa',
 'jumlahnya',
 'berjumlah',
 'lima',
 'bakal',
 'kelamaan',
 'sebutlah',
 'setibanya',
 'dimisalkan',
 'berapalah',
 'semaunya',
 'belumlah',
 'dipertanyakan',
 'dimulai',
 'sebabnya',
 'memang',
 'terhadap',


In [None]:
# Additional, hand-picked stopwords (app/domain names, interjections, noise tokens).
# 'men' and 'kakak' each appear twice in this list.
# NOTE(review): 'diilp7l0pp00kkk' contains digits and '&amp' contains '&'; digits and
# punctuation are stripped from the reviews by remove_number / remove_punctuation
# earlier in the notebook, so these two entries presumably never match a token — confirm.
add_stopwords = ['jkn', 'mobile', 'aplikasi', 'bpjs', 'men', 'ke', 'admin', 'gan', 'sih', 'deh', 'men', 'bos', 'bro', 'woi', 'euy', 'nya', 'dg', 'yah', 'huh', 'lur', 'km', 'ayo', 'bang', 'beehhh', 'by', 'ce',
                 'chuaks', 'ckcck', 'coy', 'cuy', 'di', 'dih', 'diilp7l0pp00kkk', 'doang', 'duh', 'eh', 'ha', 'hai', 'halah', 'halo', 'hehe', 'hei', 'kan', 'kakak', 'slebew',
                 'aach', 'agam', 'ah', 'an', 'anjay', 'bas', 'cc', 'dah', 'diilplppkkk', 'dpkedcek', 'edescjam', 'ezp', 'fd', 'ff', 'fiuh', 'hadeh', 'hah', 'hahaha', 'hmm',
                 'huft', 'huwaaaaaa', 'ikppp', 'kakak', 'kawakora', 'kes', 'ku', 'mah', 'mamayo', 'minth', 'nge', 'nih', 'nxwwwwx', 'oalahh', 'odi', 'papayooo', 'pelotin',
                 'pga', 'pp', 'ppp', 'pppp', 'pppppppppp', 'ribu', 'rrp', 'saa', 'selekedep', 'sz', 'tah', 'uyeah', 'waahh', 'wae', 'we', 'wkwkwk', 'wkwkwkw', 'woii', 'woy',
                 'ww', 'wwwww', 'wwwwwwswawwww', 'wwwwwwwwww', 'aco', 'gue', 'lord', '&amp']

In [None]:
# Menggabungkan daftar stopword dari Sastrawi dan NLTK
list_stopwords = list(set(stopwords_nltk_fix + stopwords_sastrawi_fix + add_stopwords))

In [None]:
list_stopwords

['terakhir',
 'lainnya',
 'atau',
 'saja',
 'bermacam',
 'dengan',
 'adapun',
 'digunakan',
 'melalui',
 'karena',
 'katakan',
 'sesudahnya',
 'ku',
 'ditunjuknya',
 'sebut',
 'ditujukan',
 'dimungkinkan',
 'menanti-nanti',
 'sebetulnya',
 'antaranya',
 'seingat',
 'anu',
 'disini',
 'bisa',
 'disebut',
 'diingatkan',
 'tiga',
 'mempersoalkan',
 'aku',
 'sekalipun',
 'mula',
 'selama-lamanya',
 'kakak',
 'belakangan',
 'begitu',
 'rupanya',
 'demikian',
 'naik',
 'memberi',
 'dalam',
 'dll',
 'seolah-olah',
 'ungkap',
 'diantara',
 'beginian',
 'karenanya',
 'lanjutnya',
 'selamanya',
 'ucap',
 'semasih',
 'pihak',
 'meminta',
 'seharusnya',
 'sedemikian',
 'toh',
 'itu',
 'mengerjakan',
 'bagi',
 'andalah',
 'euy',
 'sih',
 'kala',
 'sesegera',
 'bagaimanakah',
 'woi',
 'melihat',
 'lord',
 'persoalan',
 'doang',
 'wkwkwk',
 'enggak',
 'bila',
 'selama',
 'biasa',
 'ww',
 'jumlahnya',
 'berjumlah',
 'lima',
 'bakal',
 'kelamaan',
 'sebutlah',
 'setibanya',
 'dimisalkan',
 'berapalah',

In [None]:
# Remove stopwords from a token list.
# Membership is tested against a frozenset built from list_stopwords, so each
# lookup is a hash probe instead of a scan over the whole list.
_stopword_lookup = frozenset(list_stopwords)

def stopword_removal(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``.

    Args:
        words: Iterable of token strings for one review.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    return [word for word in words if word not in _stopword_lookup]

df_ulasan['Ulasan_Tokenization'] = df_ulasan['Ulasan_Tokenization'].apply(stopword_removal)

print(df_ulasan['Ulasan_Tokenization'].head(10))

0                       [terima, kasih]
1                               [keren]
2                               [bantu]
3                               [bantu]
4                                [baik]
5              [mantap, tidak, praktis]
6    [masuk, jam, akses, tidak, sesuai]
7                               [tidak]
8                               [bagus]
9                          [layan, oke]
Name: Ulasan_Tokenization, dtype: object


### **TF-IDF Normalisasi**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

In [None]:
# Inisialisasi pembobot kata dengan TF-IDF
vectorizer = TfidfVectorizer()

In [None]:
# Turn each token list back into one space-separated document for the vectorizer.
# astype(str) would store the list's repr (e.g. "['terima', 'kasih']") instead of
# the text itself. Values that are already strings are left untouched so the cell
# can be re-run safely.
df_ulasan['Ulasan_Tokenization'] = df_ulasan['Ulasan_Tokenization'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)

In [None]:
# Concatenate every row of the column into one long space-separated string.
# NOTE(review): `text` is not referenced by any later cell visible in this part of
# the notebook — confirm it is still needed.
text = ' '.join(df_ulasan['Ulasan_Tokenization'])

In [None]:
# Fit the TF-IDF vectorizer on the review column and transform it in one step.
# NOTE(review): the fit uses every row of df_ulasan and happens before the
# cross-validation split below, so the weights are learned from rows that are
# later used as test folds.
tfidf = vectorizer.fit_transform(df_ulasan['Ulasan_Tokenization'])

In [None]:
# Normalize the TF-IDF matrix with sklearn.preprocessing.normalize (default arguments).
# NOTE(review): TfidfVectorizer() with default settings presumably already
# normalizes each row, which would make this step a no-op — confirm against the
# scikit-learn documentation.
tfidf_normalized = normalize(tfidf)

### **Pembagian Data**

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
X = tfidf_normalized.toarray()
print(X)
len(X)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


5259

In [None]:
# Mengubah variabel target menjadi categorical
label_encoder = LabelEncoder()
df_ulasan["Sentimen"] = label_encoder.fit_transform(df_ulasan["Sentimen"])

#Mengubah dataframe menjadi array
y = np.array(df_ulasan['Sentimen'])

print(y)

[1 1 1 ... 1 1 1]


In [None]:
# Stratified 10-fold splitter (shuffled, fixed seed); the model cells below call skf.split(X, y) again.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# NOTE(review): this loop only overwrites X_train/y_train/X_test/y_test on each
# iteration, so after it finishes they hold the split of the last fold; `fold` is
# not used inside the loop.
for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

### **Oversampling**

In [None]:
from collections import Counter
from imblearn.over_sampling import SMOTE

### **SVM Linear**

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, roc_auc_score

In [None]:
# initialize variables to store results
accuracies = []
auc_scores  = []
conf_matrices = []
train_conf_matrices = []
train_accuracies = []
train_auc_scores = []

In [None]:
# set random seed
np.random.seed(42)

# Start from empty accumulators so that re-running this cell (or running it
# after another model's training cell) never mixes results from different runs.
train_accuracies = []
train_auc_scores = []
train_conf_matrices = []

# Loop through all folds for training
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    # Split data into train and test sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Oversample only the training part of the fold
    sm = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

    # Train the model on the resampled data
    svm_linear = SVC(kernel = 'linear', C = 10)
    svm_linear.fit(X_train_resampled, y_train_resampled)

    # Predict on the original (not oversampled) training rows
    y_train_pred = svm_linear.predict(X_train)

    # Confusion matrix for this fold
    conf_matrix = confusion_matrix(y_train, y_train_pred)

    # Accuracy from the confusion-matrix cells
    true_positives = conf_matrix[1, 1]
    false_positives = conf_matrix[0, 1]
    false_negatives = conf_matrix[1, 0]
    true_negatives = conf_matrix[0, 0]
    accuracy = (true_positives + true_negatives)/(true_positives + false_positives + false_negatives + true_negatives)
    train_accuracies.append(accuracy)

    # AUC from the continuous decision scores. Passing the hard 0/1 predictions
    # would reduce the ROC curve to a single point (balanced accuracy).
    auc = roc_auc_score(y_train, svm_linear.decision_function(X_train))
    train_auc_scores.append(auc)

    # store confusion matrix for this fold
    train_conf_matrices.append(conf_matrix)

    # print results for this fold
    print(f"Fold {i+1} - Training Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")

    # print confusion matrix for this fold
    print(f"Training Confusion Matrix for Fold {i+1}:")
    print(conf_matrix)

# calculate mean metrics
mean_train_accuracy = np.mean(train_accuracies)
mean_train_auc = np.mean(train_auc_scores)

# print mean results
print(f"Mean - Training Accuracy: {mean_train_accuracy:.4f}, AUC: {mean_train_auc:.4f}")

# element-wise sum and mean of the per-fold confusion matrices
sum_train_conf_matrix = np.sum(train_conf_matrices, axis=0, dtype=float)
mean_train_conf_matrix = sum_train_conf_matrix / len(train_conf_matrices)

# print mean confusion matrix (truncated to integers for display)
print("Mean Confusion Matrix:")
print(mean_train_conf_matrix.astype(int))

Fold 1 - Training Accuracy: 0.9632, AUC: 0.9657
Training Confusion Matrix for Fold 1:
[[1047   32]
 [ 142 3512]]
Fold 2 - Training Accuracy: 0.9632, AUC: 0.9661
Training Confusion Matrix for Fold 2:
[[1047   31]
 [ 143 3512]]
Fold 3 - Training Accuracy: 0.9651, AUC: 0.9673
Training Confusion Matrix for Fold 3:
[[1047   31]
 [ 134 3521]]
Fold 4 - Training Accuracy: 0.9647, AUC: 0.9667
Training Confusion Matrix for Fold 4:
[[1046   32]
 [ 135 3520]]
Fold 5 - Training Accuracy: 0.9641, AUC: 0.9676
Training Confusion Matrix for Fold 5:
[[1050   28]
 [ 142 3513]]
Fold 6 - Training Accuracy: 0.9618, AUC: 0.9658
Training Confusion Matrix for Fold 6:
[[1049   29]
 [ 152 3503]]
Fold 7 - Training Accuracy: 0.9613, AUC: 0.9661
Training Confusion Matrix for Fold 7:
[[1051   27]
 [ 156 3499]]
Fold 8 - Training Accuracy: 0.9643, AUC: 0.9674
Training Confusion Matrix for Fold 8:
[[1049   29]
 [ 140 3515]]
Fold 9 - Training Accuracy: 0.9626, AUC: 0.9653
Training Confusion Matrix for Fold 9:
[[1046   3

In [None]:
# set random seed
np.random.seed(42)

# Start from empty accumulators: these lists are shared with other evaluation
# cells, so without a reset every extra run would append another set of folds.
accuracies = []
auc_scores = []
conf_matrices = []

# Loop through all folds for testing
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    # Split data into train and test sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Oversample only the training part of the fold
    sm = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

    # Train the model on the resampled data
    svm_linear = SVC(kernel = 'linear', C = 10)
    svm_linear.fit(X_train_resampled, y_train_resampled)

    # Predict on the held-out rows of this fold
    y_pred = svm_linear.predict(X_test)

    # Confusion matrix for this fold
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Accuracy from the confusion-matrix cells
    true_positives = conf_matrix[1, 1]
    false_positives = conf_matrix[0, 1]
    false_negatives = conf_matrix[1, 0]
    true_negatives = conf_matrix[0, 0]
    accuracy = (true_positives + true_negatives)/(true_positives + false_positives + false_negatives + true_negatives)
    accuracies.append(accuracy)

    # AUC from the continuous decision scores. Passing the hard 0/1 predictions
    # would reduce the ROC curve to a single point (balanced accuracy).
    auc = roc_auc_score(y_test, svm_linear.decision_function(X_test))
    auc_scores.append(auc)

    # store confusion matrix for this fold
    conf_matrices.append(conf_matrix)

    # print results for this fold
    print(f"Fold {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")

    # print confusion matrix for this fold
    print(f"Confusion Matrix for Fold {i+1}:")
    print(conf_matrix)

# calculate mean metrics
mean_accuracy = np.mean(accuracies)
mean_auc = np.mean(auc_scores)

# element-wise mean of the per-fold confusion matrices
mean_conf_matrix = np.mean(conf_matrices, axis=0)

# print mean results
print(f"Mean - Accuracy: {mean_accuracy:.4f}, AUC: {mean_auc:.4f}")

# print mean confusion matrix (truncated to integers for display)
print("Mean Confusion Matrix:")
print(mean_conf_matrix.astype(int))

Fold 1 - Accuracy: 0.9030, AUC: 0.8987
Confusion Matrix for Fold 1:
[[106  13]
 [ 38 369]]
Fold 2 - Accuracy: 0.9087, AUC: 0.8998
Confusion Matrix for Fold 2:
[[106  14]
 [ 34 372]]
Fold 3 - Accuracy: 0.9087, AUC: 0.9027
Confusion Matrix for Fold 3:
[[107  13]
 [ 35 371]]
Fold 4 - Accuracy: 0.9049, AUC: 0.8944
Confusion Matrix for Fold 4:
[[105  15]
 [ 35 371]]
Fold 5 - Accuracy: 0.9106, AUC: 0.9098
Confusion Matrix for Fold 5:
[[109  11]
 [ 36 370]]
Fold 6 - Accuracy: 0.9240, AUC: 0.9214
Confusion Matrix for Fold 6:
[[110  10]
 [ 30 376]]
Fold 7 - Accuracy: 0.8916, AUC: 0.8858
Confusion Matrix for Fold 7:
[[105  15]
 [ 42 364]]
Fold 8 - Accuracy: 0.9011, AUC: 0.8919
Confusion Matrix for Fold 8:
[[105  15]
 [ 37 369]]
Fold 9 - Accuracy: 0.9106, AUC: 0.9098
Confusion Matrix for Fold 9:
[[109  11]
 [ 36 370]]
Fold 10 - Accuracy: 0.9105, AUC: 0.9124
Confusion Matrix for Fold 10:
[[109  10]
 [ 37 369]]
Mean - Accuracy: 0.9074, AUC: 0.9027
Mean Confusion Matrix:
[[107  12]
 [ 36 370]]


In [None]:
# set random seed
np.random.seed(42)
from collections import Counter

# Start from empty accumulators: these lists are shared with other evaluation
# cells, so without a reset this cell would append a second set of folds to
# whatever an earlier cell already stored.
accuracies = []
auc_scores = []
conf_matrices = []

# Loop through all folds for testing
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    # Split data into train and test sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Oversample only the training part of the fold
    sm = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

    # Train the model on the resampled data
    svm_linear = SVC(kernel = 'linear', C = 10)
    svm_linear.fit(X_train_resampled, y_train_resampled)

    # Predict on the held-out rows of this fold
    y_pred = svm_linear.predict(X_test)

    # Confusion matrix for this fold
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Accuracy from the confusion-matrix cells
    true_positives = conf_matrix[1, 1]
    false_positives = conf_matrix[0, 1]
    false_negatives = conf_matrix[1, 0]
    true_negatives = conf_matrix[0, 0]
    accuracy = (true_positives + true_negatives)/(true_positives + false_positives + false_negatives + true_negatives)
    accuracies.append(accuracy)

    # AUC from the continuous decision scores. Passing the hard 0/1 predictions
    # would reduce the ROC curve to a single point (balanced accuracy).
    auc = roc_auc_score(y_test, svm_linear.decision_function(X_test))
    auc_scores.append(auc)

    # store confusion matrix for this fold
    conf_matrices.append(conf_matrix)

    # print results and class counts before/after SMOTE for this fold
    print(f"Fold {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
    print("Jumlah data latih sebelum SMOTE:", len(X_train), len(y_train))
    print("Jumlah data positif dan negatif sebelum SMOTE:")
    print(Counter(y_train))
    print("Jumlah data latih setelah SMOTE:", len(X_train_resampled), len(y_train_resampled))
    print("Jumlah data positif dan negatif setelah SMOTE:")
    print(Counter(y_train_resampled))
    print("Jumlah data uji:", len(X_test), len(y_test))
    print("Jumlah data positif dan negatif pada data uji:")
    print(Counter(y_test))

    # print confusion matrix for this fold
    print(f"Confusion Matrix for Fold {i+1}:")
    print(conf_matrix)

# calculate mean metrics
mean_accuracy = np.mean(accuracies)
mean_auc = np.mean(auc_scores)

# element-wise mean of the per-fold confusion matrices
mean_conf_matrix = np.mean(conf_matrices, axis=0)

# print mean results
print(f"Mean - Accuracy: {mean_accuracy:.4f}, AUC: {mean_auc:.4f}")

# print mean confusion matrix (truncated to integers for display)
print("Mean Confusion Matrix:")
print(mean_conf_matrix.astype(int))

Fold 1 - Accuracy: 0.9030, AUC: 0.8987
Jumlah data latih sebelum SMOTE: 4733 4733
Jumlah data positif dan negatif sebelum SMOTE:
Counter({1: 3654, 0: 1079})
Jumlah data latih setelah SMOTE: 7308 7308
Jumlah data positif dan negatif setelah SMOTE:
Counter({1: 3654, 0: 3654})
Jumlah data uji: 526 526
Jumlah data positif dan negatif pada data uji:
Counter({1: 407, 0: 119})
Confusion Matrix for Fold 1:
[[106  13]
 [ 38 369]]
Fold 2 - Accuracy: 0.9087, AUC: 0.8998
Jumlah data latih sebelum SMOTE: 4733 4733
Jumlah data positif dan negatif sebelum SMOTE:
Counter({1: 3655, 0: 1078})
Jumlah data latih setelah SMOTE: 7310 7310
Jumlah data positif dan negatif setelah SMOTE:
Counter({1: 3655, 0: 3655})
Jumlah data uji: 526 526
Jumlah data positif dan negatif pada data uji:
Counter({1: 406, 0: 120})
Confusion Matrix for Fold 2:
[[106  14]
 [ 34 372]]
Fold 3 - Accuracy: 0.9087, AUC: 0.9027
Jumlah data latih sebelum SMOTE: 4733 4733
Jumlah data positif dan negatif sebelum SMOTE:
Counter({1: 3655, 0: 

In [None]:
# Per-fold number of support vectors and per-fold intercept of the linear SVM.
sv_list = []
intercept_list = []

# One SVC instance, refitted on every fold
svm_linear = SVC(kernel='linear', C = 10)

# Refit the model on each cross-validation fold
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    # Split data into train and test sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Resample the training data
    sm = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

    # Fit on the oversampled training data of this fold
    svm_linear.fit(X_train_resampled, y_train_resampled)

    # Store the number of support vectors (row count of support_vectors_) and the intercept
    sv_list.append(svm_linear.support_vectors_.shape[0])
    intercept_list.append(svm_linear.intercept_)

# Average support-vector count and average intercept over the folds
avg_sv = np.mean(sv_list, axis=0)
avg_intercept = np.mean(intercept_list, axis=0)

# Print both averages; the support-vector average is truncated to an integer
print("Rata-rata Support Vector: ", avg_sv.astype(int))
print("Rata-rata Intersep: ", avg_intercept)

Rata-rata Support Vector:  1278
Rata-rata Intersep:  [-0.99946753]


### **SVM RBF Gamma 1**

In [None]:
# Training-set evaluation of the RBF-kernel SVM (C=10, gamma=1).
# For each fold produced by `skf`, the model is fitted on the SMOTE-resampled
# training split and then scored on that fold's original (non-resampled)
# training split.

# set random seed
np.random.seed(42)

# Start from empty accumulators so this cell is self-contained and re-runnable.
# Without this, the lists had to be created by a separate cell and every re-run
# appended to whatever was already stored, so the means below depended on the
# order in which cells had been executed.
train_accuracies = []
train_precisions = []
train_recalls = []
train_f1_scores = []
train_auc_scores = []
train_conf_matrices = []

# Loop through all folds for training
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    # Split data into train and test sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Resample the training data
    sm = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

    # Train the model on the resampled data
    svm_rbf3 = SVC(kernel='rbf', C=10, gamma=1)
    svm_rbf3.fit(X_train_resampled, y_train_resampled)

    # Predict on the original (non-resampled) training data
    y_train_pred = svm_rbf3.predict(X_train)

    # Confusion matrix for this fold (rows: true label, columns: predicted label)
    conf_matrix = confusion_matrix(y_train, y_train_pred)

    # Derive the metrics from the matrix, treating label 1 as the positive class
    true_positives = conf_matrix[1, 1]
    false_positives = conf_matrix[0, 1]
    false_negatives = conf_matrix[1, 0]
    true_negatives = conf_matrix[0, 0]
    accuracy = (true_positives + true_negatives)/(true_positives + false_positives + false_negatives + true_negatives)
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    # NOTE(review): `f1_score` and `auc` are also names of sklearn.metrics
    # functions; if the notebook imports those, these assignments shadow them - confirm.
    f1_score = 2 * (precision * recall) / (precision + recall)
    train_accuracies.append(accuracy)
    train_precisions.append(precision)
    train_recalls.append(recall)
    train_f1_scores.append(f1_score)

    # AUC for this fold.
    # NOTE(review): this is computed from hard 0/1 predictions, not from
    # continuous scores such as svm_rbf3.decision_function(X_train); confirm
    # that a label-based AUC is what should be reported.
    auc = roc_auc_score(y_train, y_train_pred)
    train_auc_scores.append(auc)

    # store confusion matrix for this fold
    train_conf_matrices.append(conf_matrix)

    # print results for this fold
    print(f"Fold {i+1} - Training Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, AUC: {auc:.4f}, F1 Score: {f1_score:.4f}")

    # print confusion matrix for this fold
    print(f"Training Confusion Matrix for Fold {i+1}:")
    print(conf_matrix)

# calculate mean metrics over the folds of this run
mean_train_accuracy = np.mean(train_accuracies)
mean_train_precision = np.mean(train_precisions)
mean_train_recall = np.mean(train_recalls)
mean_train_auc = np.mean(train_auc_scores)
mean_train_f1 = np.mean(train_f1_scores)

# print mean results
print(f"Mean - Training Accuracy: {mean_train_accuracy:.4f}, Precision: {mean_train_precision:.4f}, Recall: {mean_train_recall:.4f}, AUC: {mean_train_auc:.4f}, F1 Score: {mean_train_f1:.4f}")

# element-wise sum of the per-fold confusion matrices
sum_train_conf_matrix = np.zeros((2, 2))
for matrix in train_conf_matrices:
    sum_train_conf_matrix += matrix

# calculate mean confusion matrix
mean_train_conf_matrix = sum_train_conf_matrix / len(train_conf_matrices)

# print mean confusion matrix (astype(int) truncates the fractional part)
print("Mean Confusion Matrix:")
print(mean_train_conf_matrix.astype(int))

Fold 1 - Training Accuracy: 0.9772, Precision: 0.9939, Recall: 0.9765, AUC: 0.9780, F1 Score: 0.9851
Training Confusion Matrix for Fold 1:
[[1057   22]
 [  86 3568]]
Fold 2 - Training Accuracy: 0.9780, Precision: 0.9950, Recall: 0.9765, AUC: 0.9799, F1 Score: 0.9856
Training Confusion Matrix for Fold 2:
[[1060   18]
 [  86 3569]]
Fold 3 - Training Accuracy: 0.9776, Precision: 0.9944, Recall: 0.9765, AUC: 0.9790, F1 Score: 0.9854
Training Confusion Matrix for Fold 3:
[[1058   20]
 [  86 3569]]
Fold 4 - Training Accuracy: 0.9784, Precision: 0.9939, Recall: 0.9781, AUC: 0.9789, F1 Score: 0.9859
Training Confusion Matrix for Fold 4:
[[1056   22]
 [  80 3575]]
Fold 5 - Training Accuracy: 0.9774, Precision: 0.9947, Recall: 0.9759, AUC: 0.9791, F1 Score: 0.9852
Training Confusion Matrix for Fold 5:
[[1059   19]
 [  88 3567]]
Fold 6 - Training Accuracy: 0.9787, Precision: 0.9942, Recall: 0.9781, AUC: 0.9793, F1 Score: 0.9861
Training Confusion Matrix for Fold 6:
[[1057   21]
 [  80 3575]]
Fold

In [None]:
# Test-set evaluation (accuracy and AUC only) of the RBF-kernel SVM (C=10, gamma=1).
# For each fold produced by `skf`, the model is fitted on the SMOTE-resampled
# training split and scored on the fold's held-out test split; class counts
# before and after resampling are printed alongside the metrics.

# set random seed
np.random.seed(42)

# Start from empty accumulators so this cell is self-contained and re-runnable.
# Other evaluation cells append to lists with these same names, so without a
# reset the means below would mix in results left over from earlier cell runs.
accuracies = []
auc_scores = []
conf_matrices = []

# Loop through all folds for testing
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    # Split data into train and test sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Resample the training data
    sm = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

    # Train the model on the resampled data
    svm_rbf = SVC(kernel='rbf', C=10, gamma=1)
    svm_rbf.fit(X_train_resampled, y_train_resampled)

    # Predict on the testing data
    y_pred = svm_rbf.predict(X_test)

    # Confusion matrix for this fold (rows: true label, columns: predicted label)
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Accuracy from the matrix cells, treating label 1 as the positive class
    true_positives = conf_matrix[1, 1]
    false_positives = conf_matrix[0, 1]
    false_negatives = conf_matrix[1, 0]
    true_negatives = conf_matrix[0, 0]
    accuracy = (true_positives + true_negatives)/(true_positives + false_positives + false_negatives + true_negatives)
    accuracies.append(accuracy)

    # AUC for this fold.
    # NOTE(review): this is computed from hard 0/1 predictions, not from
    # continuous scores such as svm_rbf.decision_function(X_test); confirm
    # that a label-based AUC is what should be reported.
    auc = roc_auc_score(y_test, y_pred)
    auc_scores.append(auc)

    # store confusion matrix for this fold
    conf_matrices.append(conf_matrix)

    # print results for this fold, plus the split sizes and class distributions
    print(f"Fold {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
    print("Jumlah data latih sebelum SMOTE:", len(X_train), len(y_train))
    print("Jumlah data positif dan negatif sebelum SMOTE:")
    print(Counter(y_train))
    print("Jumlah data latih setelah SMOTE:", len(X_train_resampled), len(y_train_resampled))
    print("Jumlah data positif dan negatif setelah SMOTE:")
    print(Counter(y_train_resampled))
    print("Jumlah data uji:", len(X_test), len(y_test))
    print("Jumlah data positif dan negatif pada data uji:")
    print(Counter(y_test))

    # print confusion matrix for this fold
    print(f"Confusion Matrix for Fold {i+1}:")
    print(conf_matrix)

# calculate mean metrics over the folds of this run
mean_accuracy = np.mean(accuracies)
mean_auc = np.mean(auc_scores)

# calculate mean confusion matrix (element-wise over folds)
mean_conf_matrix = np.mean(conf_matrices, axis=0)

# print mean results
print(f"Mean - Accuracy: {mean_accuracy:.4f}, AUC: {mean_auc:.4f}")

# print mean confusion matrix (astype(int) truncates the fractional part)
print("Mean Confusion Matrix:")
print(mean_conf_matrix.astype(int))

Fold 1 - Accuracy: 0.9335, AUC: 0.9154
Jumlah data latih sebelum SMOTE: 4733 4733
Jumlah data positif dan negatif sebelum SMOTE:
Counter({1: 3654, 0: 1079})
Jumlah data latih setelah SMOTE: 7308 7308
Jumlah data positif dan negatif setelah SMOTE:
Counter({1: 3654, 0: 3654})
Jumlah data uji: 526 526
Jumlah data positif dan negatif pada data uji:
Counter({1: 407, 0: 119})
Confusion Matrix for Fold 1:
[[105  14]
 [ 21 386]]
Fold 2 - Accuracy: 0.9278, AUC: 0.9150
Jumlah data latih sebelum SMOTE: 4733 4733
Jumlah data positif dan negatif sebelum SMOTE:
Counter({1: 3655, 0: 1078})
Jumlah data latih setelah SMOTE: 7310 7310
Jumlah data positif dan negatif setelah SMOTE:
Counter({1: 3655, 0: 3655})
Jumlah data uji: 526 526
Jumlah data positif dan negatif pada data uji:
Counter({1: 406, 0: 120})
Confusion Matrix for Fold 2:
[[107  13]
 [ 25 381]]
Fold 3 - Accuracy: 0.9144, AUC: 0.8888
Jumlah data latih sebelum SMOTE: 4733 4733
Jumlah data positif dan negatif sebelum SMOTE:
Counter({1: 3655, 0: 

In [None]:
# Test-set evaluation (accuracy, precision, recall, F1, specificity, AUC) of the
# RBF-kernel SVM (C=10, gamma=1). For each fold produced by `skf`, the model is
# fitted on the SMOTE-resampled training split and scored on the fold's
# held-out test split.

# set random seed
np.random.seed(42)

# Start from empty accumulators so this cell is self-contained and re-runnable.
# Other evaluation cells append to lists with these same names, so without a
# reset the means below would mix in results left over from earlier cell runs.
# `specificities` is created here as well because this cell appends to it.
accuracies = []
precisions = []
recalls = []
f1_scores = []
specificities = []
auc_scores = []
conf_matrices = []

# Loop through all folds for testing
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    # Split data into train and test sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Resample the training data
    sm = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

    # Train the model on the resampled data
    svm_rbf3 = SVC(kernel='rbf', C=10, gamma=1)
    svm_rbf3.fit(X_train_resampled, y_train_resampled)

    # Predict on the testing data
    y_pred = svm_rbf3.predict(X_test)

    # Confusion matrix for this fold (rows: true label, columns: predicted label)
    conf_matrix = confusion_matrix(y_test, y_pred)

    # store confusion matrix for this fold
    conf_matrices.append(conf_matrix)

    # Derive the metrics from the matrix, treating label 1 as the positive class
    true_positives = conf_matrix[1, 1]
    false_positives = conf_matrix[0, 1]
    false_negatives = conf_matrix[1, 0]
    true_negatives = conf_matrix[0, 0]
    accuracy = (true_positives + true_negatives)/(true_positives + false_positives + false_negatives + true_negatives)
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    # NOTE(review): `f1_score` and `auc` are also names of sklearn.metrics
    # functions; if the notebook imports those, these assignments shadow them - confirm.
    f1_score = 2 * (precision * recall) / (precision + recall)
    specificity = true_negatives / (true_negatives + false_positives)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1_score)
    specificities.append(specificity)

    # AUC for this fold.
    # NOTE(review): this is computed from hard 0/1 predictions, not from
    # continuous scores such as svm_rbf3.decision_function(X_test); confirm
    # that a label-based AUC is what should be reported.
    auc = roc_auc_score(y_test, y_pred)
    auc_scores.append(auc)

    # print results for this fold
    print(f"Fold {i+1} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, AUC: {auc:.4f}, F1 Score: {f1_score:.4f}, Spesicity: {specificity:.4f}")

    # print confusion matrix for this fold
    print(f"Confusion Matrix for Fold {i+1}:")
    print(conf_matrix)

# calculate mean metrics over the folds of this run
mean_accuracy = np.mean(accuracies)
mean_precision = np.mean(precisions)
mean_recall = np.mean(recalls)
mean_auc = np.mean(auc_scores)
mean_f1 = np.mean(f1_scores)
mean_specificity = np.mean(specificities)

# print mean results
print(f"Mean - Accuracy: {mean_accuracy:.4f}, Precision: {mean_precision:.4f}, Recall: {mean_recall:.4f}, AUC: {mean_auc:.4f}, F1 Score: {mean_f1:.4f}")
# specificity is collected per fold above, so report its mean as well
print(f"Mean - Specificity: {mean_specificity:.4f}")

# number of samples counted in the LAST fold's confusion matrix
# (kept in case a later cell reads `total`; it is not used in this cell)
total=sum(sum(conf_matrix))

# Re-print every stored per-fold matrix. The matrices are read from
# `conf_matrices`, the list filled by the loop above (this previously read a
# differently named list that this cell never populates). Dedicated loop
# names keep `i` and `conf_matrix` from being overwritten here.
for fold_no, fold_matrix in enumerate(conf_matrices, start=1):
    print('Confusion Matrix', fold_no,':\n', fold_matrix)

# element-wise sum of the per-fold confusion matrices
sum_conf_matrix = np.zeros((2, 2))
for matrix in conf_matrices:
    sum_conf_matrix += matrix

# calculate mean confusion matrix
mean_conf_matrix = sum_conf_matrix / len(conf_matrices)

# print mean confusion matrix (astype(int) truncates the fractional part)
print("Mean Confusion Matrix:")
print(mean_conf_matrix.astype(int))

Fold 1 - Accuracy: 0.9335, Precision: 0.9650, Recall: 0.9484, AUC: 0.9154, F1 Score: 0.9566
Confusion Matrix for Fold 1:
[[105  14]
 [ 21 386]]
Fold 2 - Accuracy: 0.9278, Precision: 0.9670, Recall: 0.9384, AUC: 0.9150, F1 Score: 0.9525
Confusion Matrix for Fold 2:
[[107  13]
 [ 25 381]]
Fold 3 - Accuracy: 0.9144, Precision: 0.9524, Recall: 0.9360, AUC: 0.8888, F1 Score: 0.9441
Confusion Matrix for Fold 3:
[[101  19]
 [ 26 380]]
Fold 4 - Accuracy: 0.9144, Precision: 0.9479, Recall: 0.9409, AUC: 0.8829, F1 Score: 0.9444
Confusion Matrix for Fold 4:
[[ 99  21]
 [ 24 382]]
Fold 5 - Accuracy: 0.9259, Precision: 0.9646, Recall: 0.9384, AUC: 0.9109, F1 Score: 0.9513
Confusion Matrix for Fold 5:
[[106  14]
 [ 25 381]]
Fold 6 - Accuracy: 0.9278, Precision: 0.9646, Recall: 0.9409, AUC: 0.9121, F1 Score: 0.9526
Confusion Matrix for Fold 6:
[[106  14]
 [ 24 382]]
Fold 7 - Accuracy: 0.9297, Precision: 0.9671, Recall: 0.9409, AUC: 0.9163, F1 Score: 0.9538
Confusion Matrix for Fold 7:
[[107  13]
 [ 2

In [None]:
# Reset the result accumulators: each name is bound to its own fresh empty list.
# Test-split metrics
accuracies, precisions, recalls = [], [], []
f1_scores, auc_scores, conf_matrices = [], [], []
# Training-split metrics
train_accuracies, train_precisions, train_recalls = [], [], []
train_f1_scores, train_auc_scores, train_conf_matrices = [], [], []