In [None]:
import pandas as pd
import numpy as np

#importing NLP libraries
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

#Preprocessing libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

#Modeling
from sklearn.svm import SVC
#The dataset is skewed (class-imbalanced), so plain accuracy should not be used as the metric.
#Use f1_score instead, which combines precision and recall.
from sklearn.metrics import f1_score

In [None]:
# Load the raw SMS spam CSV.
# NOTE(review): this dataset is commonly distributed in a non-UTF-8 encoding, in which
# case pandas' default UTF-8 decoding raises UnicodeDecodeError. latin-1 maps every
# byte to a character, so decoding cannot fail -- confirm against the actual file.
data = pd.read_csv('../input/sms-spam-collection-dataset/spam.csv', encoding='latin-1')
data.head()

In [None]:
# Drop the columns at positions 2-4, keeping the ones before them.
# A slice plus rebinding (instead of indexing [2, 3, 4] with inplace=True) keeps this
# cell re-runnable: once those columns are gone the slice is simply empty, whereas
# data.columns[2] would raise IndexError on a second run.
data = data.drop(columns=data.columns[2:5])

In [None]:
# Separate the label column ('v1') from the remaining feature column(s).
X = data.drop(columns='v1')
y = data['v1']

In [None]:
# Display the feature frame (every column of data except the 'v1' label).
X

In [None]:
# Encode the string labels in 'v1' as integers.
# Fitting on data['v1'] rather than on the previous value of y keeps this cell
# idempotent: re-running it would otherwise re-encode the already-encoded integers
# and overwrite encoder.classes_ with the integer codes.
encoder = LabelEncoder()
y = encoder.fit_transform(data['v1'])
pd.DataFrame(y)

> **Text 형태로 되어 있는 X를 어떻게 바꿀지가 관건**

In [None]:
# Map each encoded integer label back to its original class name.
class_mapping = dict(enumerate(encoder.classes_))
class_mapping

In [None]:
#Take an email string and convert it into a list of stemmed words
#Stemming strips suffixes from a word so that only its stem (base form) remains
#ex) "words", "word" => both become "word"

In [None]:
def processEmail(contents):
    """Normalize a raw message and return its list of stemmed word tokens.

    Steps, in order: lowercase the text; replace HTML-like tags with a space;
    replace digit runs with 'number'; replace URLs with 'httpaddr'; replace
    e-mail addresses with 'emailaddr'; replace runs of '$' with 'dollar';
    tokenize with word_tokenize; strip non-alphanumeric characters from each
    token; stem each token with PorterStemmer; drop empty tokens.

    Args:
        contents: the message text as a str.

    Returns:
        A list of non-empty, stemmed tokens (str).
    """
    ps = PorterStemmer()

    # Lowercase everything
    contents = contents.lower()
    # Remove html tags: '<', then one or more characters that are neither '<' nor '>'
    # (the caret inside the brackets negates the character class), then '>'
    contents = re.sub(r'<[^<>]+>', ' ', contents)
    # Replace every run of digits with the literal word 'number'
    contents = re.sub(r'[0-9]+', 'number', contents)

    # URLs and e-mail addresses extend up to the next whitespace character.
    # The class must be [^\s] (non-whitespace); the previous [^/s] meant
    # "anything except '/' or 's'", which let the match run across spaces.
    contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr', contents)
    # The result has to be assigned back to `contents`; it used to be assigned to a
    # misspelled name, which silently discarded the e-mail substitution.
    contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', contents)

    contents = re.sub(r'[$]+', 'dollar', contents)

    words = []
    for token in word_tokenize(contents):
        # Strip punctuation/symbols first, then reduce the token to its stem
        token = re.sub(r'[^a-zA-Z0-9]', '', token)
        token = ps.stem(token)
        # Tokens that were pure punctuation are empty by now; skip them
        if len(token) >= 1:
            words.append(token)
    return words

In [None]:
#Take a list of emails and get a dictionary of the most common words
#Words are ranked by how often they occur (most frequent first) and the ranking
#is truncated to vocab_length entries.
def getVocabulary(emails, vocab_length):
    """Build an index -> word vocabulary of the most frequent processed words.

    Every email is passed through processEmail and the resulting tokens are
    counted across all emails. The input sequence is not modified.

    Args:
        emails: iterable of raw message strings.
        vocab_length: maximum number of words to keep.

    Returns:
        dict mapping rank (0 = most frequent) to word. Words with equal counts
        keep the order in which they were first encountered.
    """
    counts = dict()

    for email in emails:
        # Tokenize into a temporary; writing the tokens back into `emails` would
        # overwrite the caller's raw strings.
        for word in processEmail(email):
            counts[word] = counts.get(word, 0) + 1

    # sorted() is stable, so ties keep first-seen (dict insertion) order
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    # chop with vocab length, keeping only the words
    top_words = [word for word, _ in ranked[:vocab_length]]
    return dict(enumerate(top_words))

In [None]:
# Preview a few raw messages instead of dumping the entire column into the cell output.
data['v2'].head(10).tolist()

In [None]:
# Quick check: the 10 most frequent processed words across all messages.
getVocabulary(emails=data['v2'].tolist(), vocab_length=10)

In [None]:
#Get a dictionary key given a value
def getkey(dictionay, val):
    """Return the first key in `dictionay` whose value equals `val`.

    Keys are examined in the dictionary's iteration order. Returns None when
    no value matches.
    """
    matching_keys = (k for k, v in dictionay.items() if v == val)
    return next(matching_keys, None)

In [None]:
#Get the indices of vocab words used in a given email
def getIndices(email, vocabulary):
    """Return the set of vocabulary indices whose words occur in `email`.

    Args:
        email: iterable of word tokens (hashable, e.g. str).
        vocabulary: dict mapping index -> word.

    Returns:
        set of the vocabulary keys whose word appears in `email`. If a word is
        stored under several keys, the first key in iteration order is used.
    """
    # Build the word -> index lookup once, instead of scanning vocabulary.values()
    # and then scanning again for the key on every single word.
    # setdefault keeps the first key seen for a repeated word.
    word_to_index = {}
    for index, vocab_word in vocabulary.items():
        word_to_index.setdefault(vocab_word, index)

    return {word_to_index[word] for word in email if word in word_to_index}

In [None]:
def getFeatureVector(word_indices, vocab_length):
    """Build a binary presence vector of length `vocab_length`.

    Args:
        word_indices: iterable of integer positions to switch on.
        vocab_length: length of the returned vector.

    Returns:
        1-D numpy float array holding 1 at every position listed in
        `word_indices` and 0 everywhere else.
    """
    vector = np.zeros(vocab_length)

    positions = list(word_indices)
    if positions:
        # Set all listed positions in a single indexed assignment
        vector[positions] = 1

    return vector

In [None]:
# Number of most-frequent words kept in the vocabulary; also the length of every feature vector.
vocab_length = 2000

In [None]:
# Build the vocabulary from all messages, then tokenize every message for feature extraction.
vocabulary = getVocabulary(data['v2'].to_list(), vocab_length)
emails = [processEmail(message) for message in data['v2'].to_list()]

In [None]:
# Preview the first few tokenized messages instead of dumping the whole list.
emails[:5]

In [None]:
# One binary feature vector per tokenized message.
X = [getFeatureVector(getIndices(email, vocabulary), vocab_length) for email in emails]

In [None]:
# Stack the feature vectors into a 2-D array, downcast to int16, and wrap in a DataFrame.
X = pd.DataFrame(np.array(X).astype("int16"))

In [None]:
# Display the int16 feature matrix (one row per message, one column per vocabulary index).
X

In [None]:
# Hold out 20% of the data for testing.
# random_state makes the split (and every score computed from it) reproducible on re-run;
# stratify=y keeps the class ratio the same in both parts, which matters because the
# classes are skewed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42, stratify=y
)

In [None]:
# Train a support vector classifier with default hyper-parameters on the training split.
model = SVC()
model.fit(X=X_train, y=y_train)

In [None]:
# Fraction of samples carrying label 1 -- shows how skewed the classes are.
np.mean(y)

In [None]:
# Score the model on the held-out split. NOTE(review): SVC.score presumably reports plain
# accuracy, which is misleading on skewed classes -- read it together with the F1 score.
model.score(X_test, y_test)

In [None]:
# F1 (combining precision and recall) on the held-out split.
y_pred = model.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)