In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RIZKI\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

#Load Dataset

In [4]:
data = pd.read_csv('dataset_sms_spam_v1.csv')
data.head()

Unnamed: 0,teks,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2


## Text Preprocessing


1. CASE FOLDING

In [7]:
import re

#membuat fungsi untuk case folding
def casefolding(text):
    text = text.lower() #merubah text jadi huruf kecil
    text = re.sub(r'https?://\s+|www\.\s+', '', text) #menghapus url dari dataset text
    text = re.sub(r'[-+]?[0-9]+', '', text) #menghapus angka dari dataset
    text = re.sub(r'[^\w\s]', '', text) #menghapus simbol pada dataset
    text = text.strip()
    return text

In [8]:
# membandingkan before dan after case folding
raw_sample = data['teks'].iloc[2]
case_folding = casefolding(raw_sample)

print('Raw data\t : ',raw_sample)
print('Case Folding\t :',case_folding)

Raw data	 :  2016-07-08 11:47:11.Plg Yth, sisa kuota Flash Anda 478KB. Download MyTelkomsel apps di http://tsel.me/tsel utk cek kuota&beli paket Flash atau hub *363#
Case Folding	 : plg yth sisa kuota flash anda kb download mytelkomsel apps di httptselmetsel utk cek kuotabeli paket flash atau hub


# WORD NORMALIZATION

In [9]:
key_norm = pd.read_csv('key_norm.csv')

#merubah kata di kolom singkat menjadi ke kolom hasil
def text_normalize(text):
    text = ' '.join([key_norm[key_norm['singkat'] == word]['hasil'].values[0]
    if (key_norm['singkat'] == word).any()
    else word for word in text.split()
    ])

    text = str.lower(text)
    return text

In [10]:
#membandingkah before dan after word normalization

raw_data = data['teks'].iloc[696]
word_normal = text_normalize(case_folding)

print('Raw Data\t :',raw_data)
print('Word Normalize\t :', word_normal)

Raw Data	 : Btw magicomnya yg sedang Gais, gaada yg gede
Word Normalize	 : pulang yang terhormat sisa kuota flash anda kb download mytelkomsel apps di httptselmetsel untuk cek kuotabeli paket flash atau hubungi


## Filtering(Stopword Removal)

In [11]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

stopwords_ind = stopwords.words('indonesian')

In [12]:
len(stopwords_ind) #mengambil kata

758

In [13]:
# melihat daftar kata stopword dari nltk
stopwords_ind

['ada',
 'adalah',
 'adanya',
 'adapun',
 'agak',
 'agaknya',
 'agar',
 'akan',
 'akankah',
 'akhir',
 'akhiri',
 'akhirnya',
 'aku',
 'akulah',
 'amat',
 'amatlah',
 'anda',
 'andalah',
 'antar',
 'antara',
 'antaranya',
 'apa',
 'apaan',
 'apabila',
 'apakah',
 'apalagi',
 'apatah',
 'artinya',
 'asal',
 'asalkan',
 'atas',
 'atau',
 'ataukah',
 'ataupun',
 'awal',
 'awalnya',
 'bagai',
 'bagaikan',
 'bagaimana',
 'bagaimanakah',
 'bagaimanapun',
 'bagi',
 'bagian',
 'bahkan',
 'bahwa',
 'bahwasanya',
 'baik',
 'bakal',
 'bakalan',
 'balik',
 'banyak',
 'bapak',
 'baru',
 'bawah',
 'beberapa',
 'begini',
 'beginian',
 'beginikah',
 'beginilah',
 'begitu',
 'begitukah',
 'begitulah',
 'begitupun',
 'bekerja',
 'belakang',
 'belakangan',
 'belum',
 'belumlah',
 'benar',
 'benarkah',
 'benarlah',
 'berada',
 'berakhir',
 'berakhirlah',
 'berakhirnya',
 'berapa',
 'berapakah',
 'berapalah',
 'berapapun',
 'berarti',
 'berawal',
 'berbagai',
 'berdatangan',
 'beri',
 'berikan',
 'berikut'

In [14]:
#menambahkan kata ke dalam stopword
more_stopword = ['tsel', 'gb', 'rb', 'btw']
stopwords_ind = stopwords_ind + more_stopword

#membuat fungsi stopword removal
def remove_stop_word(text):
    clean_words = []
    text = text.split()
    for word in text:
        if word not in stopwords_ind:
            clean_words.append(word)
    return " ".join(clean_words)


In [15]:
#before after
raw_sample = data['teks'].iloc[696]
case_folding = casefolding(raw_sample)
stopwords_removal = remove_stop_word(case_folding)

print('Raw Data \t\t :',raw_data)
print('Case Folding \t\t :',case_folding)
print('Stopword Removal \t\t',stopwords_removal)

Raw Data 		 : Btw magicomnya yg sedang Gais, gaada yg gede
Case Folding 		 : btw magicomnya yg sedang gais gaada yg gede
Stopword Removal 		 magicomnya yg gais gaada yg gede


##STEMMING

In [16]:
!pip -q install sastrawi

In [18]:
#merubah kata menjadi kata dasar
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

#membuat fungsi untuk stemming bahasa indonesia
def stemming(text):
    text = stemmer.stem(text)
    return text

In [19]:
#contoh before dan after
raw_sample = data['teks'].iloc[696]
case_folding = casefolding(raw_sample)
stopwords_removal = remove_stop_word(case_folding)
text_stemming = stemming(stopwords_removal)

print('Raw Data \t\t :',raw_sample)
print('Case Folding \t\t :',case_folding)
print('Stop Word Removal \t\t :', stopwords_removal)
print('Stemming \t\t :',text_stemming)

Raw Data 		 : Btw magicomnya yg sedang Gais, gaada yg gede
Case Folding 		 : btw magicomnya yg sedang gais gaada yg gede
Stop Word Removal 		 : magicomnya yg gais gaada yg gede
Stemming 		 : magicomnya yg gais gaada yg gede


## TEXT PREPROCESSING PIPELINE

In [20]:
#membuat fungsi untuk menggabungkan seluruh langkah text preprocessing
def text_preprocessing_process(text):
    text = casefolding(text)
    text = text_normalize(text)
    text = remove_stop_word(text)
    text = stemming(text)
    return text

In [21]:
%%time
data['clean_teks']= data['teks'].apply(text_preprocessing_process)

Wall time: 3min 58s


In [22]:
#melihat data yang sudah di clean data
data

Unnamed: 0,teks,label,clean_teks
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2,promo beli paket flash my telkomsel app extra ...
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2,rupiah ribu spesial pilih aktif promo sd novem...
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2,pulang hormat sisa kuota flash kb download myt...
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2,pulang hormat sisa kuota flash kb download myt...
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2,rupiah ribu spesial pilih aktif buru skb
...,...,...,...
1138,"Yooo sama2, oke nanti aku umumin di grup kelas",0,yooo oke umumin grup kelas
1139,😁 sebelumnya ga ad nulis kerudung. Kirain warn...,0,nulis kerudung kirain warna jins
1140,Mba mau kirim 300 ya,0,mbak kirim ya
1141,nama1 beaok bwrangkat pagi...mau cas atay tra...,0,nama beaok bwrangkat pagimau cas atay tranfer


In [23]:
# menyimpan data yang sudah diproprocessing ke dalam file csv
data.to_csv('clean_data.csv')

## FEATURE ENGINEERING

In [24]:
#pisahkan kolom feature dan target
x = data['clean_teks']
y = data['label']

In [25]:
x

0       promo beli paket flash my telkomsel app extra ...
1       rupiah ribu spesial pilih aktif promo sd novem...
2       pulang hormat sisa kuota flash kb download myt...
3       pulang hormat sisa kuota flash kb download myt...
4                rupiah ribu spesial pilih aktif buru skb
                              ...                        
1138                           yooo oke umumin grup kelas
1139                     nulis kerudung kirain warna jins
1140                                        mbak kirim ya
1141        nama beaok bwrangkat pagimau cas atay tranfer
1142                                       nomor bri nama
Name: clean_teks, Length: 1143, dtype: object

In [26]:
y

0       2
1       2
2       2
3       2
4       2
       ..
1138    0
1139    0
1140    0
1141    0
1142    0
Name: label, Length: 1143, dtype: int64

## FEATURE EXTRACTION (TF-IDF DAN N-GRAM)

In [27]:
# save model
import pickle

#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

#Unigram
vec_TF_IDF = TfidfVectorizer(ngram_range=(1,1))
vec_TF_IDF.fit(x)

x_tf_idf = vec_TF_IDF.transform(x)

pickle.dump(vec_TF_IDF.vocabulary_,open("feature_tf-idf.sav", "wb"))

In [28]:
# menampilkan vocabulary(kata-kata) dari tf-idf
vec_TF_IDF.vocabulary_

{'promo': 2343,
 'beli': 323,
 'paket': 2136,
 'flash': 873,
 'my': 1928,
 'telkomsel': 2923,
 'app': 162,
 'extra': 843,
 'kuota': 1597,
 'lte': 1700,
 'telpon': 2926,
 'mnthr': 1879,
 'buru': 480,
 'cek': 521,
 'tselmemytsel': 3061,
 'sk': 2739,
 'rupiah': 2551,
 'ribu': 2500,
 'spesial': 2798,
 'pilih': 2223,
 'aktif': 66,
 'sd': 2605,
 'november': 2037,
 'pulang': 2378,
 'hormat': 1119,
 'sisa': 2732,
 'kb': 1445,
 'download': 754,
 'mytelkomsel': 1930,
 'apps': 167,
 'httptselmetsel': 1158,
 'kuotabeli': 1598,
 'hubung': 1171,
 'skb': 2740,
 'ekstra': 806,
 'pulsa': 2380,
 'internet': 1253,
 'bulan': 466,
 'sjk': 2738,
 'augsept': 217,
 'detail': 665,
 'iring': 1275,
 'tarif': 2889,
 'panjang': 2149,
 'hits': 1108,
 'armada': 180,
 'curi': 600,
 'hati': 1072,
 'tekan': 2917,
 'okcall': 2089,
 'informasi': 1223,
 'eks': 803,
 'loh': 1687,
 'internetan': 1254,
 'pakai': 2134,
 'volume': 3177,
 'ultima': 3114,
 'mbhr': 1786,
 'harga': 1060,
 'tariflokasi': 2891,
 'tselmefl': 3059,
 '

In [29]:
# melihat jumlah feature
print(len(vec_TF_IDF.get_feature_names()))

3445




In [30]:
# melihat feature apasaja yang ada di dalam corpus
print(vec_TF_IDF.get_feature_names())

['aa', 'aamiiiin', 'aamiin', 'ab', 'abadi', 'abai', 'abbee', 'abdul', 'acara', 'acaratks', 'account', 'ada', 'adapromo', 'adi', 'adik', 'adison', 'admin', 'administrasi', 'adminlte', 'ado', 'adrian', 'adu', 'aduh', 'advertising', 'aea', 'aesthetic', 'afbe', 'affc', 'afr', 'afrika', 'agam', 'agen', 'agendain', 'agenpulsa', 'ags', 'agst', 'agsts', 'agt', 'agtskinfodlj', 'agua', 'agun', 'agus', 'agust', 'agustus', 'agustuskunjungi', 'ah', 'ahaha', 'ahub', 'aidzin', 'aigoo', 'air', 'aja', 'ajaa', 'ajaaa', 'ajabri', 'ajak', 'ajakin', 'ajar', 'ajeng', 'akademik', 'akang', 'akangteteh', 'akbar', 'akreditasi', 'akses', 'aksi', 'aktif', 'aktifasi', 'aktivasi', 'aktivitas', 'akucintaislam', 'akumulasi', 'akun', 'akurasi', 'akurat', 'ala', 'alaikum', 'alaikumsaya', 'alaiqum', 'alam', 'alamat', 'alami', 'alamsyah', 'alat', 'alesannya', 'algoritma', 'alhamdulillah', 'alhamdullilah', 'alhuda', 'ali', 'aliando', 'all', 'allah', 'allahaamiin', 'alphard', 'alquran', 'alur', 'aman', 'amanda', 'ambil', '



In [31]:
x1 = vec_TF_IDF.transform(x).toarray()
data_tabular_tf_idf = pd.DataFrame(x1,columns=vec_TF_IDF.get_feature_names())
data_tabular_tf_idf



Unnamed: 0,aa,aamiiiin,aamiin,ab,abadi,abai,abbee,abdul,acara,acaratks,...,yudisium,yuk,yuks,yuni,yunit,zalora,zarkasi,zjt,zona,ztkm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
data_tabular_tf_idf.iloc[10:20,60:70]

Unnamed: 0,akang,akangteteh,akbar,akreditasi,akses,aksi,aktif,aktifasi,aktivasi,aktivitas
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.149201,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.262305,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.244053,0.0,0.382416,0.0
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## FEATURE SELECTION

In [33]:
#mengubah data menjadi array
x_train = np.array(data_tabular_tf_idf)
y_train = np.array(y)

In [34]:
#library chi-square
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

chi2_features = SelectKBest(chi2, k=3000)
x_kbest_features = chi2_features.fit_transform(x_train, y_train)

#untuk reduced features
print('Original Features Number', x_train.shape[1])
print('Reduced Feature Number', x_kbest_features.shape[1])
#semakin tinggi nilai semakin baik

Original Features Number 3445
Reduced Feature Number 3000


In [35]:
Data = pd.DataFrame(chi2_features.scores_,columns=['Nilai'])
Data

Unnamed: 0,Nilai
0,0.843018
1,0.419698
2,1.558607
3,0.686416
4,0.759870
...,...
3440,1.126664
3441,0.503012
3442,0.686416
3443,2.917329


In [36]:
#menampilkan data fitur beserta nilainya
feature = vec_TF_IDF.get_feature_names()
feature

Data['Fitur'] = feature
Data



Unnamed: 0,Nilai,Fitur
0,0.843018,aa
1,0.419698,aamiiiin
2,1.558607,aamiin
3,0.686416,ab
4,0.759870,abadi
...,...,...
3440,1.126664,zalora
3441,0.503012,zarkasi
3442,0.686416,zjt
3443,2.917329,zona


In [37]:
#mengurutkan nilai fitur terbaik
Data.sort_values(by='Nilai', ascending=False)

Unnamed: 0,Nilai,Fitur
2136,48.850264,paket
1597,44.800152,kuota
1033,43.549195,hadiah
2226,36.905090,pin
417,33.668785,bonus
...,...,...
1566,0.044910,kopi
307,0.044468,bca
1742,0.031575,maksimal
3169,0.012716,via


In [38]:
mask = chi2_features.get_support()
mask

array([ True,  True,  True, ...,  True,  True,  True])

In [39]:
#menampilkan fitur yang terpilih berdasarkan nilai mask atau nilai tertinggi yang sudah ditetapkan pada chi-square
new_feature=[]
for bool, f in zip(mask, feature):
    if bool :
        new_feature.append(f)
    selected_feature=new_feature
selected_feature

['aa',
 'aamiiiin',
 'aamiin',
 'ab',
 'abadi',
 'abai',
 'abbee',
 'abdul',
 'acaratks',
 'account',
 'ada',
 'adapromo',
 'adi',
 'adik',
 'adison',
 'admin',
 'administrasi',
 'adminlte',
 'ado',
 'adrian',
 'adu',
 'aduh',
 'advertising',
 'aea',
 'aesthetic',
 'afbe',
 'affc',
 'afr',
 'afrika',
 'agam',
 'agen',
 'agendain',
 'agenpulsa',
 'ags',
 'agst',
 'agsts',
 'agt',
 'agtskinfodlj',
 'agua',
 'agun',
 'agus',
 'agust',
 'agustuskunjungi',
 'ahaha',
 'ahub',
 'aidzin',
 'aigoo',
 'air',
 'aja',
 'ajaa',
 'ajaaa',
 'ajabri',
 'ajak',
 'ajeng',
 'akang',
 'akbar',
 'akreditasi',
 'akses',
 'aksi',
 'aktif',
 'aktifasi',
 'aktivasi',
 'aktivitas',
 'akucintaislam',
 'akumulasi',
 'akun',
 'akurasi',
 'akurat',
 'alaikum',
 'alaikumsaya',
 'alaiqum',
 'alam',
 'alamat',
 'alamsyah',
 'alesannya',
 'algoritma',
 'alhamdulillah',
 'alhuda',
 'ali',
 'aliando',
 'all',
 'allah',
 'allahaamiin',
 'alphard',
 'alquran',
 'alur',
 'aman',
 'amanda',
 'ambil',
 'amin',
 'ampuun',
 'an

In [40]:
#membuat vocabulary baru berdasarkan fitur yang terseleksi
new_selected_feature = {}

for (k,v) in vec_TF_IDF.vocabulary_.items():
    if k in selected_feature:
        new_selected_feature[k]=v

new_selected_feature

{'promo': 2343,
 'beli': 323,
 'paket': 2136,
 'flash': 873,
 'my': 1928,
 'telkomsel': 2923,
 'app': 162,
 'extra': 843,
 'kuota': 1597,
 'lte': 1700,
 'telpon': 2926,
 'mnthr': 1879,
 'buru': 480,
 'cek': 521,
 'tselmemytsel': 3061,
 'sk': 2739,
 'rupiah': 2551,
 'ribu': 2500,
 'spesial': 2798,
 'pilih': 2223,
 'aktif': 66,
 'sd': 2605,
 'november': 2037,
 'pulang': 2378,
 'hormat': 1119,
 'sisa': 2732,
 'kb': 1445,
 'download': 754,
 'mytelkomsel': 1930,
 'apps': 167,
 'httptselmetsel': 1158,
 'kuotabeli': 1598,
 'hubung': 1171,
 'skb': 2740,
 'ekstra': 806,
 'pulsa': 2380,
 'internet': 1253,
 'bulan': 466,
 'sjk': 2738,
 'augsept': 217,
 'detail': 665,
 'iring': 1275,
 'tarif': 2889,
 'panjang': 2149,
 'hits': 1108,
 'armada': 180,
 'curi': 600,
 'hati': 1072,
 'tekan': 2917,
 'okcall': 2089,
 'informasi': 1223,
 'eks': 803,
 'loh': 1687,
 'internetan': 1254,
 'pakai': 2134,
 'volume': 3177,
 'ultima': 3114,
 'mbhr': 1786,
 'harga': 1060,
 'tariflokasi': 2891,
 'tselmefl': 3059,
 '

In [41]:
#hasil jumlah fitur setelah diseleksi
len(new_selected_feature)

3000

In [42]:
pickle.dump(new_selected_feature,open("new_selected_feature_tf_idf.sav","wb"))

In [43]:
#menampilkan fitur-fitur yang sudah diseleksi
data_selected_feature = pd.DataFrame(x_kbest_features, columns=selected_feature)
data_selected_feature

Unnamed: 0,aa,aamiiiin,aamiin,ab,abadi,abai,abbee,abdul,acaratks,account,...,yudisium,yuk,yuks,yuni,yunit,zalora,zarkasi,zjt,zona,ztkm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## PEMBUATAN MODELING CLASSIFICATION

In [44]:
selected_x = x_kbest_features
selected_x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [45]:
#import library
import random
from sklearn.model_selection import train_test_split

#import algoritma naive bayes
from sklearn.naive_bayes import MultinomialNB

In [46]:
x = selected_x
y = data.label

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [47]:
#menampilkan jumlah data traning dan data testing 80% traning 20%testing
print('Banyaknya x_train : ', len(x_train))
print('Banyaknya x_test : ', len(x_test))
print('Banyaknya y_train : ', len(y_train))
print('Banyaknya y_test : ', len(y_test))

Banyaknya x_train :  914
Banyaknya x_test :  229
Banyaknya y_train :  914
Banyaknya y_test :  229


In [49]:
#proses traning menggunakan naive bayes
text_algorithm = MultinomialNB()

In [50]:
model = text_algorithm.fit(x_train, y_train)

In [51]:
#membuat model prediksi

data_input = ("promo beli paket flash my telkomsel app extra kuota lte extra telpon mnthr buru cek tselmemytsel sk")
data_input = text_preprocessing_process(data_input)

#load
tfidf = TfidfVectorizer

loaded_vec = TfidfVectorizer(decode_error="replace", vocabulary=set(pickle.load(open("new_selected_feature_tf_idf.sav","rb"))))

hasil = model.predict(loaded_vec.fit_transform([data_input]))

if(hasil==0):
    s = "SMS Normal"
elif(hasil==1):
    s = "SMS Penipuan"
else:
    s = "SMS Promo"

print("Hasil Prediksi : \n", s)

Hasil Prediksi : 
 SMS Promo


## VALIDATION DAN EVALUASI MODEL

In [52]:
#masukan library yang dibutuhkan untuk proses testing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

predicted = model.predict(x_test)

CM = confusion_matrix(y_test, predicted)

print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95       126
           1       0.92      0.89      0.91        66
           2       0.84      0.84      0.84        37

    accuracy                           0.92       229
   macro avg       0.90      0.90      0.90       229
weighted avg       0.92      0.92      0.92       229



In [54]:
#menyimpan model
pickle.dump(model,open("model_fraud.sav","wb"))