### Belajar Klasifikasi Spam Pada SMS
Menggunakan Dataset [SMS Spam Collection Data Set](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection)

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('SMSSpamCollection', sep='\t', names=['label', 'text'])

In [8]:
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Text Cleaning and Preprocessing

In [11]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [13]:
ps = PorterStemmer()

In [14]:
corpus = []

In [16]:
for i in range(0, len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df['text'][i]) # menghilangkan karakter selain huruf
    review = review.lower() # mengubah teks menjadi lowercase
    review = review.split() # split setiap kata
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')] # stopword removal
    review = ' '.join(review) # merge kembali setiap kata
    corpus.append(review) # input ke dalam array corpus

proses cleaning telah selesai

#### Preprocessing

In [18]:
# Creating the TF-IDF model
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
X = cv.fit_transform(corpus).toarray()

In [21]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

##### Label Encoding

In [19]:
y=pd.get_dummies(df['label'])
y=y.iloc[:,1].values

### Train Test Split

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

### Classifying using SVM

In [22]:
from sklearn.svm import SVC

In [23]:
clf = SVC(gamma='auto')
clf.fit(X_train, y_train)

SVC(gamma='auto')

In [24]:
y_pred = clf.predict(X_test)

### Evaluation

In [56]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      1.00      0.92       955
           1       0.00      0.00      0.00       160

    accuracy                           0.86      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.73      0.86      0.79      1115

