In [1]:
#Import libraries
import numpy as np 
import pandas as pd 
import nltk
import string

### Preprocessing

In [2]:
# Load data
#df_train, df_val, df_test
df_train = pd.read_csv('spam.zip/training_data.csv')
df_val = pd.read_csv('spam.zip/val_data.csv')
df_test = pd.read_csv('spam.zip/testing_data.csv')
df_train.head()

Unnamed: 0,type,text
0,ham,"Babe, I'm back ... Come back to me ..."
1,ham,S:)no competition for him.
2,ham,Yup having my lunch buffet now.. U eat already?
3,ham,"Storming msg: Wen u lift d phne, u say HELLO D..."
4,ham,Mark works tomorrow. He gets out at 5. His wor...


In [3]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
#Preproses data yang digunakan yaitu terdiri dari 4 tahap yaitu tokenization, remove punctuation, lemmatization, dan stopword elimination
def preprocessing(text):
    # 1. Tokenization using nltk
    tokens =  nltk.word_tokenize(text)
    # 2. Remove punctuation
    remove_punc = [token for token in tokens if token.isalpha()]
    # 3. Normalization (lemmatization) using nltk
    lemmatizer = WordNetLemmatizer()
    
    after_lemma = [lemmatizer.lemmatize(w) for w in remove_punc]
    # 4. Stopword elimination using nltk
    elim_stop_word = ' '.join([word for word in after_lemma if not word.lower() in stopwords.words('english')])
    
    return elim_stop_word

In [4]:
preprocess_data = df_train.text.apply(preprocessing)

In [36]:
print(preprocess_data)

0                                     Babe back Come back
1                                             competition
2                          Yup lunch buffet U eat already
3       Storming msg Wen u lift phne u say HELLO u knw...
4       Mark work tomorrow get work house meet u after...
                              ...                        
4497                   Hi sorry missed call pls call back
4498    Hello Sort town already dont rush home eating ...
4499                           Free Msg Ringtone http wml
4500                   Yes tv always available work place
4501    dude fake frnds got money thts reffering u u m...
Name: text, Length: 4502, dtype: object


### Feature Extraction

In [5]:
# feature extraction menggunakan tf-idf yang sudah didefinisikan sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
# fit transform akan melakukan transformasi terhadap hasil data yang sudah dipreproses sebelumnya
X = vectorizer.fit_transform(preprocess_data)

print(X.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### Classification

In [6]:
# 1.Using KNN Classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# parameter yang dipilih yaitu berdasarkan jumlah neighbor yang digunakan pada model
neighbor = [1,3,5,7]
knn_model = []
for n in neighbor:
    knn = KNeighborsClassifier(n_neighbors=n)
    # train model using training set
    knn.fit(X, df_train.type)
    
    # predict val data
    val_type_predict =  knn.predict(vectorizer.transform(df_val.text))
    # calculate model accuracy using metrics.accuracy_score
    print("Accuracy with n_neighbors =" , n, " :",metrics.accuracy_score(df_val.type, val_type_predict))
    knn_model.append(knn)

Accuracy with n_neighbors = 1  : 0.9540918163672655
Accuracy with n_neighbors = 3  : 0.9301397205588823
Accuracy with n_neighbors = 5  : 0.9101796407185628
Accuracy with n_neighbors = 7  : 0.9001996007984032


In [7]:
# Clasification use model with n_neighbors = 1
test_type_predict =  knn_model[0].predict(vectorizer.transform(df_test.text))
print("Accuracy with n_neighbors = 1 :",metrics.accuracy_score(df_test.type, test_type_predict))

Accuracy with n_neighbors = 1 : 0.947841726618705


In [38]:
print(test_type_predict[:50])

['ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham'
 'ham' 'spam']


In [14]:
# 2. Using SVM yang telah disediakan sklearn
from sklearn.svm import SVC

# parameter yang dipilih yaitu berdasarkan tipe kernel yang digunakan pada model
kernel_type = ['linear', 'poly', 'rbf', 'sigmoid']
svm_model = []
for k in kernel_type:
    clf = SVC(kernel=k)
    
    #train model
    clf.fit(X, df_train.type)
    
    # predict val data
    val_type_predict =  clf.predict(vectorizer.transform(df_val.text))
    # calculate model accuracy using metrics.accuracy_score
    print("Accuracy SVM with kernel '", k, "' :",metrics.accuracy_score(df_val.type, val_type_predict))
    svm_model.append(clf)

Accuracy SVM with kernel ' linear ' : 0.9820359281437125
Accuracy SVM with kernel ' poly ' : 0.9421157684630739
Accuracy SVM with kernel ' rbf ' : 0.9820359281437125
Accuracy SVM with kernel ' sigmoid ' : 0.9780439121756487


In [15]:
# Choose clasification model SVM with kernel = linear
test_type_predict =  svm_model[0].predict(vectorizer.transform(df_test.text))
print("Accuracy SVM with kernel 'linear' : ",metrics.accuracy_score(df_test.type, test_type_predict))

Accuracy SVM with kernel 'linear' :  0.9820143884892086


In [39]:
print(test_type_predict[:50])

['ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham'
 'ham' 'spam']


In [34]:
# 3. Using RandomForest yang telah disediakan sklearn
from sklearn.ensemble import RandomForestClassifier

# parameter yang dipilih yaitu berdasarkan tipe criterion yang digunakan pada model
criterion_type = ['gini', 'entropy']
rf_model = []
for c in criterion_type:
    spam_model_rf = RandomForestClassifier(criterion=c, random_state=0, n_estimators=50)

    #train model
    spam_model_rf.fit(X,df_train.type)
    
    # predict val data
    val_type_predict =  spam_model_rf.predict(vectorizer.transform(df_val.text))
    # calculate model accuracy using metrics.accuracy_score
    print("Accuracy Random Forest with criterion '", c, "' :",metrics.accuracy_score(df_val.type, val_type_predict))
    rf_model.append(spam_model_rf)

Accuracy Random Forest with criterion ' gini ' : 0.9820359281437125
Accuracy Random Forest with criterion ' entropy ' : 0.9780439121756487


In [35]:
# Choose clasification model RF with criterion = gini
test_type_predict =  rf_model[0].predict(vectorizer.transform(df_test.text))
print("Accuracy Random Forest with criterion 'gini' : ",metrics.accuracy_score(df_test.type, test_type_predict))

Accuracy Random Forest with criterion 'gini' :  0.9820143884892086


In [40]:
print(test_type_predict[:50])

['ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham'
 'ham' 'spam']
