In [30]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
# Read the raw SMS dataset from disk, decoding the file as latin1.
df = pd.read_csv(filepath_or_buffer='data/spam.csv', encoding='latin1')

In [32]:
# Restrict the frame to the label column (v1) and the message text column (v2),
# then preview the first rows.
df = df[['v1', 'v2']]
df.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## 데이터 정제

In [33]:
# Number of missing values in each column.
df.isnull().sum(axis=0)

v1    0
v2    0
dtype: int64

In [34]:
# Number of distinct message texts.
df['v2'].nunique()

5169

In [35]:
# Keep only the first occurrence of each distinct message text (v2).
# The result is rebound rather than mutated with inplace=True, so the cell
# reads as an explicit transformation and the column is named via `subset=`.
df = df.drop_duplicates(subset='v2', keep='first')

## 텍스트 전처리

In [36]:
# Build a cleaned text column: drop every character that is not an ASCII
# letter or a space. regex=True is passed explicitly because newer pandas
# releases treat the pattern as a literal string by default, which would
# leave the text unchanged (older releases emit a warning instead).
df['cv3'] = df['v2'].str.replace(r'[^a-zA-Z ]', '', regex=True)
df.head()

  df['cv3'] = df.v2.str.replace('[^a-zA-Z ]','')


Unnamed: 0,v1,v2,cv3
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in a wkly comp to win FA Cup final...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...


In [37]:
# Re-check for missing values now that the cleaned column exists.
df.isnull().sum(axis=0)

v1     0
v2     0
cv3    0
dtype: int64

In [38]:
# Class balance: how many messages carry each label.
df['v1'].value_counts()

ham     4516
spam     653
Name: v1, dtype: int64

어떤 문장을 넣었을 때, 햄 메시지들과의 유사도와 스팸 메시지들과의 유사도를 비교해서
스팸 여부를 확인한다.

## 햄 스팸 분류

In [39]:
# Rows labelled 'ham', indexed by label, with the remaining columns kept as-is.
# Label-based selection replaces the earlier double transpose, which built a
# frame with one column per message and collapsed to a Series when only a
# single row matched. The list form of .loc always yields a DataFrame.
ham_df = df.set_index('v1').sort_index().loc[['ham']]

In [40]:
# Rows labelled 'spam', indexed by label, with the remaining columns kept as-is.
# Label-based selection replaces the earlier double transpose, which built a
# frame with one column per message and collapsed to a Series when only a
# single row matched. The list form of .loc always yields a DataFrame.
spam_df = df.set_index('v1').sort_index().loc[['spam']]

## 변환

In [41]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for testing. Stratifying on the label keeps the
# ham/spam proportions the same in both splits, and the fixed seed makes the
# split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['cv3'],
    df['v1'],
    test_size=0.2,
    stratify=df['v1'],
    random_state=2021,
)
tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))

((4135,), (1034,), (4135,), (1034,))

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer (English stop words removed) on the training
# texts only, then display the fitted estimator.
tvect = TfidfVectorizer(stop_words='english').fit(X_train)
tvect

TfidfVectorizer(stop_words='english')

In [43]:
# Project both splits into the TF-IDF space fitted on the training texts.
X_train_tv, X_test_tv = (tvect.transform(texts) for texts in (X_train, X_test))
X_train_tv.shape, X_test_tv.shape

((4135, 7147), (1034, 7147))

## 훈련 및 예측

In [44]:
from sklearn.linear_model import LogisticRegression

In [45]:
# Logistic regression classifier created with scikit-learn's default settings.
lr = LogisticRegression()

In [46]:
# Train the classifier on the TF-IDF features of the training split.
lr.fit(X=X_train_tv, y=Y_train)

LogisticRegression()

In [47]:
# Predicted labels for the held-out test messages.
pred = lr.predict(X=X_test_tv)

## 결과

In [48]:
from sklearn.metrics import accuracy_score

In [49]:
# Fraction of test messages whose predicted label matches the true label.
accuracy_score(y_true=Y_test, y_pred=pred)

0.9593810444874274