In [1]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec

In [2]:
DATA_IN_PATH = '../Embedding/embedding_data/'
model = Word2Vec.load(DATA_IN_PATH + 'Word2Vec_embedding.model').wv
# Word2Vec 모델 불러오기

In [3]:
label_data = pd.read_csv('../data/naverReview_label.csv')
train_data = pd.read_csv('../data/preprocess/naverReview_preprocess_Okt.csv', encoding = 'UTF8')
#필요한 라벨, 트레인 데이터 불러오기

In [4]:
new_data = pd.concat([train_data, label_data], axis=1)
#nan제거를 위해 합치기

In [5]:
new_data2 = new_data.dropna()
#nan 제거

In [6]:
labels = list(new_data2['label'])
contexts = list(new_data2['context'])
#라벨 리뷰 분리
sentences = []
for context in contexts :
    sentences.append(str(context).split())

In [7]:
def get_feature(words, model, num_features) :
    feature_vector = np.zeros((num_features), dtype = np.float32)
    
    num_words = 0
    index2word_set = set(model.wv.index2word)
    
    for w in words :
        if w in index2word_set :
            num_words += 1
            feature_vector = np.add(feature_vector, model[w])
    feature_vector = np.divide(feature_vector, num_words)
    return feature_vector

In [8]:
def get_dataset(reviews, model, num_features) :
    dataset = list()
    
    for s in reviews :
        dataset.append(get_feature(s, model, num_features))
        
    reviewFeatureVecs = np.stack(dataset)
    
    return reviewFeatureVecs

In [9]:
num_features = 100
test_data_vecs = get_dataset(sentences, model, num_features)

  index2word_set = set(model.wv.index2word)
  feature_vector = np.divide(feature_vector, num_words)


In [10]:
test_data = test_data_vecs.tolist()
test_data_df = pd.DataFrame({'context' : test_data})
label_df = pd.DataFrame({'label': labels})
#nan 제거를 위해 각각 df 화

In [11]:
new_df = pd.concat([test_data_df, label_df], axis=1)
#nan 제거를 위해 병합

In [12]:
nan_index = []
for index in range(198315) :
    npArray = np.asarray(new_df['context'].iloc[index], dtype=np.float32)
    if np.isnan(npArray).any() :
        nan_index.append(index)
        
print(nan_index)
#nan 찾는 함수

[28, 79, 169, 414, 1150, 1349, 1575, 1715, 2328, 2352, 2440, 3451, 4307, 4830, 5043, 5748, 7187, 7839, 8395, 9407, 9726, 10678, 11061, 11764, 12829, 12875, 13011, 13646, 13788, 15579, 15838, 16952, 18795, 18824, 19493, 20631, 21531, 23298, 24865, 25154, 25315, 26152, 26610, 27432, 27830, 28819, 28946, 29290, 29991, 30575, 31020, 31330, 32639, 33140, 33310, 33322, 34482, 34686, 35400, 36802, 37622, 38009, 40223, 40593, 40957, 41267, 43869, 44074, 44510, 44768, 45315, 45495, 46047, 46089, 46840, 47074, 47514, 47636, 48189, 48605, 49583, 49615, 49736, 50449, 50785, 51902, 52871, 53624, 53829, 54425, 54956, 55527, 55768, 56424, 57196, 57906, 57975, 58342, 59817, 61460, 61744, 62836, 63006, 63169, 63668, 64173, 64581, 64733, 65351, 65453, 66323, 66634, 67217, 67848, 68139, 68902, 69137, 69261, 70715, 71386, 71767, 72591, 73434, 74589, 75081, 75166, 75362, 75542, 76910, 77324, 78809, 78856, 79189, 79482, 79881, 80651, 81253, 81485, 82068, 82181, 83389, 84432, 86131, 86614, 88291, 88347, 8897

In [13]:
new_df2 = new_df.drop(nan_index)
new_df2
#nan 제거

Unnamed: 0,context,label
0,"[-0.3545059561729431, -0.04077335447072983, 0....",0
1,"[-0.26670029759407043, 0.008713889867067337, -...",1
2,"[0.04544483870267868, -0.005405960604548454, 0...",0
3,"[-0.24363183975219727, -0.09286148101091385, 0...",0
4,"[-0.07732874900102615, -0.03550983592867851, -...",1
...,...,...
198310,"[-0.0050431666895747185, -0.032580893486738205...",1
198311,"[-0.16151383519172668, -0.004260162357240915, ...",0
198312,"[-0.20260755717754364, -0.05997707322239876, 0...",0
198313,"[-0.10150811821222305, -0.03976898640394211, 0...",0


In [14]:
labels2 = list(new_df2['label'])
contexts2 = list(new_df2['context'])
#랜덤포레스트에 필요한 데이터 분리

In [15]:
from sklearn.model_selection import train_test_split

X = np.array(contexts2)
y = np.array(labels2)

RANDOM_SEED = 42
TEST_SPLIT = 0.2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
forest = RandomForestClassifier(n_estimators=100)
forest.fit(X_train, y_train)
#랜덤 포레스트 모델

RandomForestClassifier()

In [19]:
print("훈련 데이터 정확도 : %f" % forest.score(X_train, y_train))
print("테스트 데이터 정확도 : %f" % forest.score(X_test, y_test))
#정확도

훈련 데이터 정확도 : 0.995997
테스트 데이터 정확도 : 0.821475


In [20]:
import os
import re
from konlpy.tag import Okt
okt=Okt()

In [21]:
def preprocessing(review, okt, remove_stopwords = False):
    review_text = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", "", review)
    word_review = okt.morphs(review_text, stem = True)
    
    word_review = ' '.join(word_review)
    
    return word_review
#예측을 위한 함수

In [22]:
def get_feature_predict(words, model, num_features) :
    feature_vector = np.zeros((num_features), dtype=np.float32)
    num_words = 0
    index2word_set = set(model.wv.index2word)
    for w in words.split() :
        if w in index2word_set :
            num_words += 1
            feature_vector = np.add(feature_vector, model.wv[w])
    feature_vector = np.divide(feature_vector, num_words)
    return feature_vector

In [25]:
def predict(review) :
    global model, num_features
    clean_review = preprocessing(review, okt, remove_stopwords=False)
    result = get_feature_predict(clean_review, model, num_features)
    List = []
    List.append(result)
    return forest.predict(List)

In [26]:
predict("와 진짜 또 보고싶다")

  index2word_set = set(model.wv.index2word)
  feature_vector = np.add(feature_vector, model.wv[w])


array([1])