In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# Toy corpus: three short English sentences, one document per list entry.
corpus = ['you know I want your love', 'I like you', 'what should I do ']

In [3]:
# One raw-count vectorizer and one TF-IDF vectorizer, both with default settings.
cv, tfidf = CountVectorizer(), TfidfVectorizer()

In [4]:
# Fit the count vectorizer on the corpus and print the dense count matrix
# (DFM = document-feature matrix): one row per sentence, one column per vocabulary term.
print(cv.fit_transform(corpus).toarray()) # DFM

[[0 1 0 1 0 1 0 1 1]
 [0 0 1 0 0 0 0 1 0]
 [1 0 0 0 1 0 1 0 0]]


In [5]:
print(cv.vocabulary_) # shows each vocabulary term together with its column index

{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [6]:
# Same corpus through the TF-IDF vectorizer: the matrix holds TF-IDF weights
# instead of raw counts.
print(tfidf.fit_transform(corpus).toarray())

[[0.         0.46735098 0.         0.46735098 0.         0.46735098
  0.         0.35543247 0.46735098]
 [0.         0.         0.79596054 0.         0.         0.
  0.         0.60534851 0.        ]
 [0.57735027 0.         0.         0.         0.57735027 0.
  0.57735027 0.         0.        ]]


In [7]:
# Term -> column index mapping learned by the TF-IDF vectorizer.
print(tfidf.vocabulary_)

{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


## transform 결과를 역으로 추출

In [9]:
# Map each row of the count matrix back to the vocabulary terms it contains.
# Note: fit_transform refits cv on the same corpus before transforming.
cv.inverse_transform(cv.fit_transform(corpus).toarray())

[array(['know', 'love', 'want', 'you', 'your'], dtype='<U6'),
 array(['like', 'you'], dtype='<U6'),
 array(['do', 'should', 'what'], dtype='<U6')]

In [10]:
# Vocabulary terms ordered by column index.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# .tolist() turns the returned ndarray into a plain list of str for display.
cv.get_feature_names_out().tolist()

['do', 'know', 'like', 'love', 'should', 'want', 'what', 'you', 'your']

In [11]:
# Print every vocabulary term next to its column index.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2 (requires scikit-learn >= 1.0).
for i, value in enumerate(cv.get_feature_names_out()):
    print(i, ':', value)

0 : do
1 : know
2 : like
3 : love
4 : should
5 : want
6 : what
7 : you
8 : your


## 새로운 문장을 넣어서 어떤 단어들이 들어갔는지 판단해보기

In [12]:
# A single new document, wrapped in a list because it is passed to
# cv.transform as a collection of documents.
sentence = ['i like like smile want']

In [14]:
# Count the new sentence against the already-fitted vocabulary.
# Words that are not in the vocabulary (e.g. 'smile') contribute nothing.
cv.transform(sentence).toarray()

array([[0, 0, 2, 0, 0, 1, 0, 0, 0]], dtype=int64)

--------------------------------------------------------------------------------------------------------

In [21]:
# Hand-picked keyword lists; each word is used as a one-word document when
# the vectorizers are fitted.
spam_dict = ['advertise','promotion','sales','hu','special','sale','member','news','buy','big']
# NOTE(review): 'genetal' looks like a typo for 'general' — confirm before
# changing it, since the recorded outputs include this spelling.
ham_dict = ['order','confirm','agree','check','customer','payment','send','genetal','company','tour']

In [22]:
# Separate vectorizer pairs so spam and ham keep independent vocabularies:
# suffix 1 -> spam, suffix 2 -> ham.
cv1, tfidf1 = CountVectorizer(), TfidfVectorizer()
cv2, tfidf2 = CountVectorizer(), TfidfVectorizer()

In [23]:
## Build the spam vocabulary: fit both spam vectorizers on the spam keyword list.
## The fitted TfidfVectorizer is displayed because it is the cell's last expression.
cv1.fit(spam_dict)
tfidf1.fit(spam_dict)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [24]:
## Build the ham vocabulary: fit both ham vectorizers on the ham keyword list.
## The fitted TfidfVectorizer is displayed because it is the cell's last expression.
cv2.fit(ham_dict)
tfidf2.fit(ham_dict)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [25]:
cv1.vocabulary_ # spam vocabulary: term -> column index

{'advertise': 0,
 'promotion': 6,
 'sales': 8,
 'hu': 3,
 'special': 9,
 'sale': 7,
 'member': 4,
 'news': 5,
 'buy': 2,
 'big': 1}

In [26]:
cv2.vocabulary_ # ham vocabulary: term -> column index

{'order': 6,
 'confirm': 3,
 'agree': 0,
 'check': 1,
 'customer': 4,
 'payment': 7,
 'send': 8,
 'genetal': 5,
 'company': 2,
 'tour': 9}

In [27]:
# Sample message to classify; it mixes spam keywords, ham keywords and words
# that are in neither list.
# NOTE(review): 'promation' does not match the spam keyword 'promotion' —
# possibly an intentional misspelling; confirm.
email = ['promation!! hu good sales sale check payment']

In [28]:
result1 = cv1.transform(email).toarray() # counts of spam keywords found in the email
result1 # 2-D array: one row per document

array([[0, 0, 0, 1, 0, 0, 0, 1, 1, 0]], dtype=int64)

In [29]:
result2 = cv2.transform(email).toarray() # counts of ham keywords found in the email
result2

array([[0, 1, 0, 0, 0, 0, 0, 1, 0, 0]], dtype=int64)

In [30]:
# Vocabulary terms of each vectorizer, ordered by column index.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# .tolist() keeps these as plain Python lists of str, as before.
spam_feature_list = cv1.get_feature_names_out().tolist()
ham_feature_list = cv2.get_feature_names_out().tolist()

In [31]:
# Show both vocabularies, one list per line.
print(spam_feature_list, ham_feature_list, sep='\n')

['advertise', 'big', 'buy', 'hu', 'member', 'news', 'promotion', 'sale', 'sales', 'special']
['agree', 'check', 'company', 'confirm', 'customer', 'genetal', 'order', 'payment', 'send', 'tour']


In [32]:
# First row of the 2-D result: the spam-keyword counts for the email.
result1[0]

array([0, 0, 0, 1, 0, 0, 0, 1, 1, 0], dtype=int64)

In [33]:
# Total number of spam-keyword occurrences found in the email.
spam_words_count = result1[0].sum()
spam_words_count

3

In [35]:
# Total number of ham-keyword occurrences found in the email.
ham_words_count = result2[0].sum()
ham_words_count

2

In [36]:
# Strictly more spam-keyword hits than ham-keyword hits => spam; a tie counts as ham.
print('spam mail!!' if spam_words_count > ham_words_count else 'ham mail!!')

spam mail!!


--------------------------------------------------------------------------

## 한글로 사전 만들어보고, 스팸인지 아닌지 판단해보기

In [39]:
# Korean sample message to classify against the Korean keyword lists.
email2 = ['광고!! 허경영 good 세일 확인 지불']

In [40]:
# Korean keyword lists. These rebind spam_dict / ham_dict, replacing the
# English lists defined earlier in the notebook.
# NOTE(review): '허영경' here differs from '허경영' in email2, so that word is
# never counted — looks like a typo; confirm which spelling is intended.
spam_dict = ['광고','허영경','세일','지불','고수익','특가','홍보','회사','회원','판매']
# NOTE(review): '지불' appears in both lists, so it adds one hit to each side
# and cannot influence the verdict — confirm this is intended.
ham_dict = ['주문','확인','체크','고객','지불','배송','일반','동호회']

In [41]:
# Vectorizer pairs for the Korean lists: suffix 1 -> spam, suffix 2 -> ham.
k_cv1, k_tfidf1 = CountVectorizer(), TfidfVectorizer()
k_cv2, k_tfidf2 = CountVectorizer(), TfidfVectorizer()

In [43]:
# Build the spam vocabulary: fit both Korean spam vectorizers on the spam keyword list.
# The fitted TfidfVectorizer is displayed because it is the cell's last expression.
k_cv1.fit(spam_dict)
k_tfidf1.fit(spam_dict)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [44]:
## Build the ham vocabulary: fit both Korean ham vectorizers on the ham keyword list.
## The fitted TfidfVectorizer is displayed because it is the cell's last expression.
k_cv2.fit(ham_dict)
k_tfidf2.fit(ham_dict)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [45]:
k_cv1.vocabulary_ # spam vocabulary: term -> column index

{'광고': 1,
 '허영경': 6,
 '세일': 2,
 '지불': 3,
 '고수익': 0,
 '특가': 4,
 '홍보': 7,
 '회사': 8,
 '회원': 9,
 '판매': 5}

In [46]:
k_cv2.vocabulary_ # ham vocabulary: term -> column index

{'주문': 4, '확인': 7, '체크': 6, '고객': 0, '지불': 5, '배송': 2, '일반': 3, '동호회': 1}

In [47]:
result3 = k_cv1.transform(email2).toarray() # counts of spam keywords found in email2
result3 # 2-D array: one row per document

array([[0, 1, 1, 1, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [49]:
result4 = k_cv2.transform(email2).toarray() # counts of ham keywords found in email2
result4

array([[0, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [51]:
# Vocabulary terms of each Korean vectorizer, ordered by column index.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# .tolist() keeps these as plain Python lists of str, as before.
spam_feature_list = k_cv1.get_feature_names_out().tolist()
ham_feature_list = k_cv2.get_feature_names_out().tolist()

In [52]:
# Show both vocabularies, one list per line.
print(spam_feature_list, ham_feature_list, sep='\n')

['고수익', '광고', '세일', '지불', '특가', '판매', '허영경', '홍보', '회사', '회원']
['고객', '동호회', '배송', '일반', '주문', '지불', '체크', '확인']


In [55]:
# First row of the 2-D result: the spam-keyword counts for email2.
result3[0]

array([0, 1, 1, 1, 0, 0, 0, 0, 0, 0], dtype=int64)

In [56]:
# Total number of spam-keyword occurrences found in email2.
spam_words_count = result3[0].sum()
spam_words_count

3

In [57]:
# Total number of ham-keyword occurrences found in email2.
ham_words_count = result4[0].sum()
ham_words_count

2

In [58]:
# Strictly more spam-keyword hits than ham-keyword hits => spam; a tie counts as not spam.
print('스팸입니다.!!' if spam_words_count > ham_words_count else '아닙니다.!!')

스팸입니다.!!


## 감정분류기를 만들어 보기

## 긍정/부정 사전을 만들어보기

In [None]:
# Seed word lists for the sentiment-classifier exercise.
# Entries are bare words with no surrounding whitespace, so they behave the
# same whether fed to a vectorizer or used in a direct membership test
# (e.g. `word in positive_dict`).
positive_dict = ['happy', 'conviction', 'courage', 'justice', 'positive',
                 'romance', 'beatitude', 'cohesion', 'diligence', 'brilliant']
negative_dict = ['sad', 'negative', 'lonely', 'bored', 'annoyed',
                 'angry', 'nervous', 'scared', 'mess', 'pissed']

In [None]:
## TODO: read a sentence with input() and decide whether it is positive or negative

In [None]:
## TODO: use the TF-IDF vectorizer