<a href="https://colab.research.google.com/github/rizki-putra-saimona-armen/XGBoost_Classifier_Heart_Attack_Prediction_Indonesia/blob/main/SMS_spam_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [149]:
from string import punctuation

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy.stats import loguniform
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,PolynomialFeatures,StandardScaler




In [150]:
# Fetch the NLTK data this notebook needs so that it runs on a fresh kernel:
# the stopword corpus (read just below) and the Punkt tokenizer models that
# word_tokenize loads. Newer NLTK releases look for "punkt_tab" instead of
# "punkt"; an unknown resource name is reported by nltk.download, not raised.
for resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

# Tokens to discard during vectorization: Indonesian stopwords plus every
# ASCII punctuation character.
sw_indo = stopwords.words("indonesian") + list(punctuation)

# Import Data

In [151]:
# Location of the SMS dataset; the path lives under /content/drive, so the
# notebook presumably expects Google Drive to be mounted there (TODO confirm).
DATA_PATH = "/content/drive/MyDrive/spam.csv"

# Read the CSV into a DataFrame and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Teks,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,1
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,1
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",1
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",1
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,1


# Dataset Splitting

In [152]:
# Features are kept as a one-column DataFrame so the 'Teks' column can still be
# selected by name downstream; the target is the 'label' column.
X = df.loc[:, ['Teks']]
y = df['label']

# Hold out 20% of the rows for testing, stratified on the label, with a fixed
# seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Display the shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((914, 1), (229, 1), (914,), (229,))

# Preprocessing

In [153]:

# Convert raw SMS text into TF-IDF features, tokenizing with NLTK's
# word_tokenize and dropping the tokens listed in sw_indo.
text_pipeline = Pipeline([
    # token_pattern=None: the default regex is never used once a custom
    # tokenizer is supplied, and leaving it set makes scikit-learn emit a
    # "token_pattern will not be used" UserWarning whenever the vectorizer is fit.
    ('tfidf', TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None, stop_words=sw_indo))
])

# Send the 'Teks' column through the text pipeline. The column is given as a
# plain string (not a list) so the vectorizer receives a 1-D sequence of texts.
preprocessor = ColumnTransformer([
    ('text', text_pipeline, 'Teks')
])



# Training

In [154]:
# Full model: text preprocessing followed by a logistic-regression classifier.
pipeline = Pipeline(steps=[
    ('prep', preprocessor),
    ('algo', LogisticRegression(solver='lbfgs', n_jobs=-1, random_state=42)),
])

# Search space: intercept on/off, and the regularization parameter C sampled
# log-uniformly between 1e-3 and 1e3.
parameter = dict(
    algo__fit_intercept=[True, False],
    algo__C=loguniform(1e-3, 1e3),
)

# Randomized search over 50 sampled candidates with 3-fold cross-validation;
# the fixed random_state makes the sampled candidates reproducible.
model = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=parameter,
    n_iter=50,
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
model.fit(X_train, y_train)

# Report the winning hyperparameters, then the training score, the best
# cross-validated score and the held-out test score.
print(model.best_params_)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(train_score, model.best_score_, test_score)

Fitting 3 folds for each of 50 candidates, totalling 150 fits




{'algo__C': 4.676478725076045, 'algo__fit_intercept': True}
0.9989059080962801 0.9638912855910267 0.982532751091703


# Sanity Check

In [166]:
# Sanity check on an unseen, informal message asking the recipient to buy
# phone credit for a given number.
new_text = ["dek belikn abg pulsa di nmr ini, 02937373292,nt aq tf ke kmu"]
text_df = pd.DataFrame(new_text, columns=['Teks'])
print(model.predict(text_df))
print(model.predict_proba(text_df))

# In the recorded run this gives roughly 30 percent regular SMS and 70 percent spam.

[1]
[[0.29952939 0.70047061]]


In [168]:
# Sanity check on an unseen, everyday message about playing guitar at a
# friend's house after campus.
new_text = ["hari ini habis pulang kampus kita main gitar di rumah udin"]
text_df = pd.DataFrame(new_text, columns=['Teks'])
print(model.predict(text_df))
print(model.predict_proba(text_df))

# In the recorded run this gives roughly 77 percent regular SMS and 23 percent spam.

[0]
[[0.77014731 0.22985269]]
