In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

In [2]:
sw_indo = stopwords.words('indonesian') + list(punctuation)

# Import Data

In [3]:
df = pd.read_csv("data/spam.csv")
df.head()

Unnamed: 0,Teks,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,1
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,1
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",1
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",1
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,1


# Dataset Splitting

In [4]:
X = df.Teks
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [5]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=word_tokenize, stop_words=sw_indo)),
    ('algo', LogisticRegression(solver='lbfgs', n_jobs=-1, random_state=42))
])

# Hyperparameter tuning

In [6]:
parameters = {
    "algo__fit_intercept" : [True,False],
    "algo__C" : range(1,3,1)
}

# Training

In [7]:
model = GridSearchCV(pipeline,parameters,cv=3,n_jobs=-1,verbose=1)
model.fit(X_train,y_train)
model.score(X_train,y_train),model.best_score_,model.score(X_test,y_test)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    8.0s finished
  'stop_words.' % sorted(inconsistent))


(0.9976662777129521, 0.9626548889706784, 0.9545454545454546)

# Predict

In [8]:
X_predict = [
    ["Permisi,Bpk/Ibu Butuh Dana Ingin KEMBANGKAN USAHA & Kebutuhan Lain Mulai 5jt S?D 500Jt Bunga 2% Proses Mudah Minat Whatsapp: 085348331010"],
    ["Selamat 2000 IMpoin dri sis ulang pulsa BERHASIL msk ke akunmu. You've earned 2000 IMPoin. Valid until 31-12-2021: Redeem m/attractive rewards: bit.l/imnm3y"]
]

In [9]:
model.predict_proba(X_predict[1])

array([[0.26821394, 0.73178606]])