In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

In [2]:
# Tokens to drop during vectorisation: NLTK's Indonesian stop words followed by
# every ASCII punctuation character as its own single-character entry.
sw_indo = [*stopwords.words('indonesian'), *punctuation]

In [4]:
# Read the SMS corpus from the CSV that sits next to the notebook and preview
# the first rows to confirm the `Teks` (message text) and `label` columns.
DATASET_CSV = "dataset_sms_spam_v1.csv"
df = pd.read_csv(DATASET_CSV)
df.head()

Unnamed: 0,Teks,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2


In [5]:
# Inputs are the raw message texts; targets are the values of the label column.
X = df["Teks"]
y = df["label"]

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Text-classification pipeline: TF-IDF features (NLTK tokenizer, Indonesian
# stop words + punctuation removed) feeding a random forest.
pipeline = Pipeline([
    # NOTE(review): the recorded output ends with a scikit-learn stop_words
    # warning, presumably because word_tokenize splits some stop words into
    # tokens that are not in sw_indo -- confirm and extend sw_indo if needed.
    ('tfidf',TfidfVectorizer(tokenizer=word_tokenize,stop_words=sw_indo)),
    # Seeded so the fitted forest (bootstrap samples, feature sub-sampling) is
    # the same on every Restart & Run All.
    ('algo',RandomForestClassifier(random_state=42))
])
# Search space for the forest; the "algo__" prefix routes each entry to the
# pipeline step named 'algo'. 3 * 3 * 3 * 3 = 81 possible combinations.
parameters = {
    "algo__n_estimators" : [100,150,200],
    "algo__max_depth" : [20,50,80],
    "algo__min_samples_leaf" : [1,5,10],
    "algo__max_features" : [0.3, 0.5, 0.8]
}
# Randomly sample 50 of the 81 combinations with 3-fold cross-validation on all
# cores. random_state pins which candidates are drawn; without it the selected
# model and the scores reported below differ between runs.
model = RandomizedSearchCV(pipeline,parameters,n_iter=50,cv=3,n_jobs=-1,verbose=1,random_state=42)
model.fit(X_train,y_train)
# Displayed tuple: (score on the training split, best mean cross-validation
# score, score on the held-out test split).
model.score(X_train,y_train),model.best_score_,model.score(X_test,y_test)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   54.1s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.9min finished
  'stop_words.' % sorted(inconsistent))


(1.0, 0.8829199022145527, 0.8864628820960698)

In [8]:
# Two hand-written messages used to try the fitted model.
emergency_quota_sms = "Tetap terhubung dengan kuota Darurat ketik 'Ya' untuk dapatkan dengan seharga Rp7700 sekarang dan bayar nanti pada saat isi ulang. ketik *505# untuklihat pilihan lainnya Yellow 2GB/3hr"
subsidy_pin_sms = "Anda mendapatkan subsidi pemerintah Rp 189.000.000 PIN PEMENANG 717747R buka bit.ly/my-pertamina2527 wa 081233211753"

# Each message is wrapped in its own one-element list so that X_predict[i] can
# be handed to model.predict() directly as a batch containing a single text.
X_predict = [
    [emergency_quota_sms],
    [subsidy_pin_sms],
]

In [17]:
# Classify the second sample message. X_predict[1] is already a one-element
# list, so `label` holds exactly one prediction.
sample_batch = X_predict[1]
label = model.predict(sample_batch)

In [18]:
# Message printed for each class code handled by this notebook
# (0 = normal SMS, 1 = fraud SMS, 2 = promotional SMS).
label_messages = {
    0: "SMS Normal",
    1: "SMS Penipuan",
    2: "SMS promo",
}

# `label` is the array returned by model.predict(). Looping over it, rather
# than evaluating `label == 0` as a condition, avoids depending on the
# truthiness of a one-element array and keeps working when several messages
# are classified at once. Any other code gets the "not in any category" text.
for predicted in label:
    print(label_messages.get(predicted, "Tidak Termasuk Kategori"))

SMS Penipuan
