In [28]:
import numpy as np
import pandas as pd
from gensim.models import Doc2Vec
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
# 경고메시지 무시
import warnings
warnings.filterwarnings("ignore")

In [24]:
# Fit a 2-cluster K-Means on the word embeddings stored in the saved Doc2Vec model.
EMBEDDING_MODEL_PATH = "../Embedding/embedding_data/Doc2Vec_embedding.model"
# Explicit integer seed: the original passed `random_state=True`, and True is the
# integer 1 in Python, so this keeps the same seed without relying on a boolean.
KMEANS_SEED = 1

word_vectors = Doc2Vec.load(EMBEDDING_MODEL_PATH).wv
model = KMeans(
    n_clusters=2,
    max_iter=1000,
    random_state=KMEANS_SEED,
    n_init=50,
).fit(X=word_vectors.vectors.astype('double'))

# NOTE(review): K-Means cluster indices carry no meaning of their own; treating
# centroid 0 as "positive" and centroid 1 as "negative" is an assumption that
# should be confirmed by inspecting the nearest words of each centroid.
positive = model.cluster_centers_[0]
negative = model.cluster_centers_[1]

In [25]:
# Display the fitted centroids (one row per cluster; the model was built with n_clusters=2).
model.cluster_centers_

array([[ 5.59585193e-06, -4.83617273e-05,  1.42859361e-05,
         8.04694068e-05,  7.41719859e-05, -5.54235311e-05,
        -3.23251240e-05, -1.76736503e-05, -5.97012542e-05,
         1.83107852e-05, -4.79523221e-05,  4.30927782e-05,
         2.30908526e-05,  4.03212376e-05,  4.27417509e-05,
        -1.62916245e-05,  2.83138939e-05,  2.91525457e-05,
         8.70474658e-05,  1.17958493e-04,  5.01608006e-05,
         2.20497261e-05,  2.78054483e-05, -8.15023123e-06,
         3.91708487e-05,  5.34977657e-05,  7.17143886e-05,
         2.07332125e-05,  2.05795121e-05, -6.06230291e-05,
         3.77624447e-05, -7.13859924e-05, -1.47551518e-05,
         4.52761975e-05, -1.93509807e-05,  3.53931678e-05,
        -3.18694289e-05,  4.49783544e-05, -2.04206902e-05,
         4.49237536e-05,  5.36450336e-06,  1.30944280e-05,
         5.83623013e-05, -4.52851402e-05, -9.24091003e-05,
         2.67124724e-05, -2.57584568e-05, -3.65080286e-05,
        -4.31771336e-06,  1.51902429e-05,  2.48213673e-0

In [26]:
# List the ten vocabulary entries most similar to the first K-Means centroid.
# NOTE(review): the recorded similarities peak around 0.28, so the centroid is not
# close to any individual word — worth checking the embedding quality.
word_vectors.similar_by_vector(
    vector=model.cluster_centers_[0],
    topn=10,
    restrict_vocab=None,
)

[('텅텅', 0.2780112624168396),
 ('무고', 0.21778829395771027),
 ('어이없이', 0.21014109253883362),
 ('마뉘', 0.21013349294662476),
 ('충분', 0.20824849605560303),
 ('비치', 0.20570695400238037),
 ('공금', 0.20408347249031067),
 ('계륜미', 0.2012607604265213),
 ('요란', 0.2003171741962433),
 ('남하', 0.1982894390821457)]

In [27]:
# Build a per-word table: embedding, K-Means cluster and a signed sentiment score.
# NOTE(review): `.vocab` is the gensim 3.x attribute; gensim 4 exposes the same
# keys through `key_to_index` — adjust if the environment is upgraded.
words = pd.DataFrame({'words': list(word_vectors.vocab.keys())})

# `word_vectors` already is the keyed-vectors object, so index it directly instead
# of going through the deprecated `.wv` accessor.
words['vectors'] = words.words.apply(lambda word: word_vectors[word])

# Stack all embeddings once and score them in a single vectorised call instead of
# invoking the model row by row; cast to double to match the dtype used for fitting.
word_matrix = np.vstack(words.vectors.values).astype('double')
words['cluster'] = model.predict(word_matrix)

# Cluster 0 is treated as the positive class (+1), any other cluster as negative (-1).
words['cluster_value'] = np.where(words.cluster == 0, 1, -1)

# Closeness is the inverse of the distance to the nearest centroid.
words['closeness_score'] = 1 / model.transform(word_matrix).min(axis=1)
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

In [32]:
DATA_IN_PATH = '../data/preprocess/'
# Load the three Okt-preprocessed corpora.
KorQuAd = pd.read_csv(DATA_IN_PATH + 'KorQuAd_preprocess_Okt.csv', encoding='UTF8')
naverReview = pd.read_csv(DATA_IN_PATH + 'naverReview_preprocess_Okt.csv', encoding='UTF8')
ultraDiary = pd.read_csv(DATA_IN_PATH + 'ultraDiary_preprocess_Okt.csv', encoding='UTF8')
# Stack the corpora into one frame. `DataFrame.append` no longer exists in pandas 2.x,
# and it kept each source's own index, so row labels repeated across corpora.
# `ignore_index=True` gives every row a unique positional label, which the later
# tf-idf lookup by `x.name` relies on.
data = pd.concat([KorQuAd, naverReview, ultraDiary], ignore_index=True)
print(data.shape)  # the recorded run printed (234941, 1), i.e. close to 235,000 rows
data.head()

(234941, 1)


Unnamed: 0,context
0,년 바그너 괴테 파우스트 을 처음 읽다 그 내용 에 마음 끌리다 이르다 소재 로 하...
1,한편 년 부터 바그너 와 알 고 지내다 리스트 잊혀지다 있다 악장 을 부활 시키다 ...
2,작품 라단조 아주 신중하다 박자 부드럽다 서주 로 서주 로 시작 되다 여기 에는 주...
3,커닐링구스커닐 링거 스 쿤닐링구스 영어 늘다 입술 혀 입 모든 구강 기관 으로 여성...
4,일반 적 인 통계 에 따르다 여성 가다 오르가슴 을 얻다 위해 직접 적 인 음핵 자...


In [35]:
# Fit TF-IDF on the sentences themselves. Iterating a DataFrame yields its column
# labels, so passing `data` directly would fit on the single string 'context'.
# Missing texts are replaced with empty strings so the vectorizer only sees str.
corpus = data['context'].fillna('')

# Texts are already tokenised and space-separated, so plain whitespace splitting is
# the tokenizer; norm=None keeps the raw (un-normalised) tf-idf weights.
tfidf = TfidfVectorizer(tokenizer=str.split, norm=None)
transformed = tfidf.fit_transform(corpus)

# `get_feature_names` was removed in newer scikit-learn releases in favour of
# `get_feature_names_out`; use whichever the installed version provides.
if hasattr(tfidf, 'get_feature_names_out'):
    features = pd.Series(tfidf.get_feature_names_out())
else:
    features = pd.Series(tfidf.get_feature_names())

In [37]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    Build a {word: tf-idf score} dictionary for one input sentence.

    inspired by a function from this article:
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

    x - row of the dataframe; x.name is used as the row position in transformed_file
    transformed_file - all sentences transformed with TfidfVectorizer (sparse, one row per sentence)
    features - pandas Series with the names of all words in the corpus, positioned
               by their tf-idf column index

    returns a dict mapping every word that has a stored entry in that row to its score
    '''
    vector_coo = transformed_file[x.name].tocoo()
    # Translate column indices to words without writing them back into the sparse
    # matrix: `col` is an integer index array, and recent SciPy releases coerce
    # assignments to it to the index dtype, so storing strings there fails.
    row_words = features.iloc[vector_coo.col].values
    return dict(zip(row_words, vector_coo.data))

def replace_tfidf_words(x, transformed_file, features, column='title'):
    '''
    Replace each word of a sentence with its tf-idf score.

    x - row of the dataframe, containing the sentence and its index (x.name)
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer
    column - label of the field in x that holds the space-separated sentence
             (defaults to 'title', the field the original implementation read)

    returns a list with one score per whitespace-separated token, in order.
    A token missing from the row's tf-idf dictionary is retried in lowercase
    (TfidfVectorizer lowercases text by default) and otherwise scored 0, so the
    list always lines up with the tokens. A non-string field yields an empty list.
    '''
    dictionary = create_tfidf_dictionary(x, transformed_file, features)
    text = x[column]
    if not isinstance(text, str):
        # e.g. a missing value read from CSV: there is nothing to score
        return []
    scores = []
    for word in text.split():
        if word in dictionary:
            scores.append(dictionary[word])
        else:
            scores.append(dictionary.get(word.lower(), 0))
    return scores

In [38]:

def replace_sentiment_words(word, sentiment_dict):
    '''
    Look up the sentiment score of `word` in `sentiment_dict`.

    Returns the stored score, or 0 when the lookup raises KeyError.
    '''
    try:
        return sentiment_dict[word]
    except KeyError:
        return 0
  