# Spam Filter

## Get Datasets

In [1]:
import pandas as pd
import numpy as np

# Load the labelled corpus; the 'email' column holds the message text and
# the 'label' column is cast to int for use as the classification target.
data = pd.read_csv("./datasets/spam_or_not_spam.csv")
# Blank out missing bodies before the string cast: astype(str) on its own can
# turn a missing value into the literal text "nan", which would then be
# stemmed and enter the vocabulary as if it were a real word.
X_data = data['email'].fillna('').astype(str)
y_data = data['label'].astype(int)

## Data Preprocessing Pipeline

In [3]:
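# Objective: convert each email into a feature vector

# 1. Remove email headers
# 2. Convert to lowercase
# 3. Remove punctuation
# 4. Remove whitespace and special characters
# 5. Replace all URLs and numbers with placeholders
# 6. Stem words
# 7. Build the word map (dictionary)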

In [30]:
# word stemming in python
# after stemming words, put data in dictionary
from nltk.stem import PorterStemmer

# X should be pandas Series type
def preprocess(X, stemmer=None):
    """Tokenise and stem every email and build the vocabulary index.

    Each email is split on single spaces, every token is stemmed, and each
    distinct stem receives the next free integer id in order of first
    appearance.

    Parameters
    ----------
    X : pandas.Series
        One raw email string per element.
    stemmer : object, optional
        Any object exposing ``stem(word) -> str``. When omitted, a fresh
        ``PorterStemmer`` is created.

    Returns
    -------
    tokens : pandas.Series
        A new Series with the same index and name as ``X`` whose elements
        are NumPy arrays of stems. ``X`` itself is not modified.
    word_map : dict
        Maps each distinct stem to its integer id (0, 1, 2, ...).
    """
    if stemmer is None:
        stemmer = PorterStemmer()

    word_map = {}
    emails = X.to_numpy()

    # Results go into a separate object array rather than being written back
    # through X.values: that write mutates the caller's Series and fails when
    # pandas hands out a read-only buffer (Copy-on-Write).
    stemmed = np.empty(len(emails), dtype=object)

    for row, email in enumerate(emails):
        # Splitting on a single space (not arbitrary whitespace) is kept on
        # purpose so the resulting vocabulary matches the existing one;
        # consecutive spaces therefore yield empty-string tokens.
        stems = [stemmer.stem(token) for token in email.split(' ')]

        # len(word_map) is the next unused id; setdefault leaves known stems alone.
        for stem in stems:
            word_map.setdefault(stem, len(word_map))

        stemmed[row] = np.array(stems)

    return pd.Series(stemmed, index=X.index, name=X.name), word_map

# Rebinds X_data from raw email strings to arrays of stemmed tokens and
# creates the notebook-level word_map used by the feature-vector step.
# Because X_data is replaced by its own processed form, re-running this cell
# without reloading X_data fails (token arrays have no .split method).
X_data, word_map = preprocess(X_data)
print("X_data", X_data.shape)

X_data (3000,)


In [31]:
# Number of emails after preprocessing.
len(X_data)

3000

## Convert Emails to Feature Vectors

In [37]:
def convertToFeatureVector(X, vocab=None):
    """Encode tokenised emails as binary bag-of-words rows.

    Parameters
    ----------
    X : pandas.Series
        Each element is a sequence of tokens (e.g. the arrays produced by
        ``preprocess``).
    vocab : dict, optional
        Token -> column index mapping. When omitted, the notebook-level
        ``word_map`` is used.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(X), len(vocab))``. Entry ``[i, j]`` is
        1.0 when email ``i`` contains the token mapped to column ``j`` and
        0.0 otherwise. Tokens missing from the vocabulary are ignored.
    """
    if vocab is None:
        vocab = word_map

    docs = X.to_numpy()
    # Allocating the full matrix up front keeps the result 2-D even when X is empty.
    features = np.zeros((len(docs), len(vocab)))

    for row, tokens in enumerate(docs):
        for token in tokens:
            col = vocab.get(token)
            # Skip out-of-vocabulary tokens. Indexing a NumPy array with None
            # means np.newaxis, so assigning through it would silently set
            # every feature of this email to 1.
            if col is not None:
                features[row, col] = 1

    return features

# Encode every tokenised email as one row, then show a single row as a spot check.
X_feature_vec = convertToFeatureVector(X=X_data)
print(X_feature_vec[10, :])

[1. 0. 0. ... 0. 0. 0.]


In [38]:
# Split Train & Test data
from sklearn.model_selection import train_test_split

# Hold out 30% of the emails for testing; stratifying on the labels keeps the
# class proportions the same in both splits, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_feature_vec,
    y_data,
    test_size=0.3,
    stratify=y_data,
    random_state=42,
)

In [40]:
# Sample Email Feature Vector
# First row of the training split, shown as the cell's output.
X_train[0]

array([1., 1., 0., ..., 0., 0., 0.])

## Simple Machine Learning Models



In [43]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so repeated runs build the same trees and report the same
# scores; 42 matches the seed already used for the train/test split.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
# This is accuracy on the training data itself, so it is an optimistic figure;
# the cross-validated and held-out test scores are the meaningful ones.
print("Score : ", round(clf.score(X_train, y_train)*100, 2))

Score :  100.0


In [44]:
# with cross validation
from sklearn.model_selection import cross_val_score
# Accuracy of the classifier on each of 10 cross-validation folds of the training split.
scores = cross_val_score(
    estimator=clf,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=10,
)
print(scores)

[0.94761905 0.97142857 0.98095238 0.96666667 0.96190476 0.98571429
 0.97619048 0.98095238 0.98571429 0.9952381 ]


## Test Set Scores

In [45]:
# Accuracy on the held-out test split, as a percentage rounded to two decimals.
print("Test Score : ", round(100 * clf.score(X_test, y_test), 2))

Test Score :  97.56
