In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [6]:
# Data preparation: load the SMS corpus into a two-column frame.
# The file is tab-separated and has no header row, so the column names
# ("target" for the ham/spam tag, "message" for the text) are supplied here.
# NOTE(review): the raw file may contain unbalanced double quotes; with pandas'
# default quoting that can merge adjacent records. Confirm the row count and
# consider quoting=csv.QUOTE_NONE if rows are missing.
messages_df = pd.read_csv(
    "data-files/SMSSpamCollection",
    sep="\t",
    header=None,
    names=["target", "message"],
)
messages_df

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [11]:
from sklearn.preprocessing import LabelEncoder

# Convert the string target into a numeric label, shown two ways.

# 1) Hand-written rule: 'ham' becomes 0, any other value becomes 1.
messages_df['label'] = messages_df['target'].map(lambda tag: int(tag != 'ham'))

# 2) scikit-learn's LabelEncoder, fitted on the same column and stored
#    side by side in 'label2' so the two encodings can be compared.
le = LabelEncoder()
messages_df['label2'] = le.fit_transform(messages_df['target'])
messages_df

Unnamed: 0,target,message,label,label2
0,ham,"Go until jurong point, crazy.. Available only ...",0,0
1,ham,Ok lar... Joking wif u oni...,0,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,1
3,ham,U dun say so early hor... U c already then say...,0,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,0
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1,1
5568,ham,Will ü b going to esplanade fr home?,0,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0,0
5570,ham,The guy did some bitching but I acted like i'd...,0,0


In [22]:
# Message preprocessing, step 1: use a regular expression to remove special
# characters, keeping only word characters and whitespace.

import re  # regular-expression module

# [^\w\s] matches any character that is neither a word character (\w: letters,
# digits and underscore) nor whitespace (\s); each match is replaced by ''.
# The pattern is written as a raw string so that \w and \s reach the regex
# engine unchanged. In a plain string literal they are invalid escape
# sequences, which Python reports with a DeprecationWarning (a SyntaxWarning
# from Python 3.12 onwards).
# For string values, Series.str.replace(pattern, '', regex=True) is the
# vectorized alternative to this element-wise map.
messages_df['message2'] = messages_df["message"].map(lambda m: re.sub(r'[^\w\s]', '', m))

In [23]:
messages_df.head(n=5)  # preview the first five rows, including the new 'message2' column

Unnamed: 0,target,message,label,label2,message2
0,ham,"Go until jurong point, crazy.. Available only ...",0,0,Go until jurong point crazy Available only in ...
1,ham,Ok lar... Joking wif u oni...,0,0,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,0,0,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,0,Nah I dont think he goes to usf he lives aroun...


In [24]:
# Message preprocessing, step 2: convert every cleaned message to lower case.
# The vectorized .str accessor is used, and 'message2' is overwritten in place.
messages_df['message2'] = (
    messages_df['message2']
    .str.lower()
)
messages_df.head(n=5)

Unnamed: 0,target,message,label,label2,message2
0,ham,"Go until jurong point, crazy.. Available only ...",0,0,go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,0,0,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,1,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,0,0,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,0,nah i dont think he goes to usf he lives aroun...


In [15]:
import nltk

# Download the 'punkt' tokenizer data into the local nltk_data directory.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — verify
# against the installed NLTK version if word_tokenize raises a LookupError.
nltk.download('punkt') # required resource download: needed to run the word_tokenize function

[nltk_data] Downloading package punkt to C:\Users\Administrator.User
[nltk_data]     -2023YNCQT\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [25]:
# Message preprocessing, step 3: turn each message string into a list of word tokens.
# Tokenisation is delegated to nltk.word_tokenize and applied row by row.
# 'message2' changes from str to list here, so this cell is meant to run once per load.
messages_df['message2'] = messages_df['message2'].apply(nltk.word_tokenize)
messages_df.head(n=5)

Unnamed: 0,target,message,label,label2,message2
0,ham,"Go until jurong point, crazy.. Available only ...",0,0,"[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,0,0,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,1,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,0,0,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,0,"[nah, i, dont, think, he, goes, to, usf, he, l..."


In [28]:
# Message preprocessing, step 4: stemming (normalise inflected word forms to a common stem).
from nltk.stem import PorterStemmer  # stemmer implementation

stemmer = PorterStemmer()

# Quick demonstration: print the stem of the plural and of the singular form.
print(*(stemmer.stem(form) for form in ("apples", "apple")))

# Stem every token of every message; each row remains a list of strings.
messages_df['message2'] = messages_df['message2'].map(
    lambda tokens: list(map(stemmer.stem, tokens))
)

messages_df.head(n=5)

appl appl


Unnamed: 0,target,message,label,label2,message2
0,ham,"Go until jurong point, crazy.. Available only ...",0,0,"[go, until, jurong, point, crazi, avail, onli,..."
1,ham,Ok lar... Joking wif u oni...,0,0,"[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,1,"[free, entri, in, 2, a, wkli, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,0,0,"[u, dun, say, so, earli, hor, u, c, alreadi, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,0,"[nah, i, dont, think, he, goe, to, usf, he, li..."
