In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings(action='ignore')

In [9]:
# Movie metadata and the review table share the same reader settings.
movie, review = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('movie_sample.csv', "movie_review.csv")
)

# Korean stopword file: tab-separated; only the '형태' column is kept, as a plain list.
stopwords = pd.read_csv(
    "한국어불용어.txt",
    sep='\t',
    encoding='utf-8',
)['형태'].tolist()

In [8]:
import re
from konlpy.tag import Kkma
kkma = Kkma()

# Compiled once instead of once per review: matches every run of characters
# that is neither a space nor Hangul (jamo ㄱ-ㅣ or syllables 가-힣).
hangul = re.compile('[^ ㄱ-ㅣ가-힣]+')

# Set lookup instead of scanning the whole stopword list for every token.
stopword_set = set(stopwords)

texts = []

for doc in review['1']:
    # Exactly one entry is appended per review so that `texts` stays
    # index-aligned with `review`; later cells slice it in fixed-size chunks.
    cleaned = hangul.sub('', str(doc))

    try:
        # Nothing to analyse once non-Hangul characters are stripped.
        word_token = kkma.morphs(cleaned) if cleaned.strip() else []
    except Exception:
        # Best effort: a review the analyser cannot handle contributes no tokens.
        # The placeholder is an empty *list* (not ''), so every element of
        # `texts` has the same type and list concatenation keeps working.
        word_token = []

    texts.append([word for word in word_token if word not in stopword_set])

In [9]:
from gensim.models import word2vec

# Training options gathered in one mapping so they can be read (and tuned) at a glance.
# size=100 is the same number as the default `num_features` of makeFeatureVec.
w2v_options = {
    'workers': 4,
    'size': 100,
    'min_count': 10,
    'window': 20,
    'sample': 1e-3,
}

model = word2vec.Word2Vec(texts, **w2v_options)

# NOTE(review): per the gensim 3.x docs, init_sims(replace=True) keeps only the
# normalised vectors, after which training cannot be continued — confirm this
# is intended given the "additional training" idea in the notes at the end.
model.init_sims(replace=True)

In [13]:
#model.save('word2vec.model')

In [3]:
#import gensim
#model = gensim.models.Word2Vec.load("word2vec.model")

In [10]:
def makeFeatureVec(words, model, num_features=100):
    """Return the mean word vector of the in-vocabulary tokens in ``words``.

    Parameters
    ----------
    words : iterable of str
        Tokens to average. Tokens unknown to the model are skipped.
    model : object
        Trained embedding model exposing its keyed vectors as ``model.wv``
        (supports ``word in model.wv`` and ``model.wv[word]``).
    num_features : int, default 100
        Length of the returned vector; expected to equal the dimensionality
        of the model's word vectors.

    Returns
    -------
    numpy.ndarray
        Vector of shape ``(num_features,)``. When none of the tokens is in the
        vocabulary (including empty ``words``) a zero vector is returned
        instead of dividing by zero and producing NaNs.
    """
    keyed_vectors = model.wv
    featureVec = np.zeros((num_features,), dtype='float32')
    nwords = 0

    for word in words:
        # Membership is tested on the keyed vectors directly, so no vocabulary
        # set has to be rebuilt on every call.
        if word in keyed_vectors:
            nwords += 1
            featureVec = np.add(featureVec, keyed_vectors[word])

    if nwords == 0:
        # Nothing to average: keep the zero vector rather than returning NaN.
        return featureVec

    return np.divide(featureVec, nwords)

In [14]:
# Flatten the token lists of the first 1200 reviews in a single pass.
# (`sum(lists, [])` copies the accumulated list at every step and raises
# TypeError if any entry of `texts` is the '' placeholder rather than a list.)
first_chunk_tokens = [data for inner_list in texts[:1200] for data in inner_list]
makeFeatureVec(first_chunk_tokens, model)

array([ 0.00903185,  0.02466408, -0.00457539,  0.04066523,  0.00277325,
       -0.03693898,  0.01675449, -0.01942066, -0.01943677, -0.01452281,
        0.00028518, -0.00390063, -0.01601533,  0.00164893, -0.00951091,
       -0.03272052, -0.02459099,  0.02908111,  0.05383693,  0.04554746,
        0.00159666,  0.01823414, -0.00813711, -0.02457864, -0.00462838,
        0.05315609, -0.02489214, -0.00998518, -0.04191583, -0.00838858,
        0.01646701,  0.0207481 ,  0.04274411,  0.04135553,  0.04033058,
       -0.01253566,  0.00359625,  0.01110219, -0.01204065, -0.00871179,
       -0.00081611,  0.01310383,  0.02267058,  0.00602473, -0.03714225,
        0.02469238, -0.01773507,  0.03065806,  0.04647436,  0.03134697,
       -0.03779117, -0.00041189,  0.03156191,  0.00564605,  0.03295846,
        0.00887572,  0.00886812, -0.02275216, -0.00315293,  0.01806024,
        0.01685125, -0.04536863,  0.03419803, -0.00593382, -0.01245815,
        0.00576158,  0.00626251,  0.00259588, -0.05351615,  0.03

In [15]:
# Seed words for each MBTI letter. Every list is compared against a movie's
# averaged review vector, one similarity column per letter.
E_word = ['활발', '외향', '유쾌', '씩씩', '자유']
I_word = ['조용', '내향', '잔잔', '신중', '깊이']

S_word = ['오감', '감각', '경험', '정확', '철저']
N_word = ['육감', '영감', '신속', '일', '추구']

F_word = ['사람', '관계', '설명']
T_word = ['진실', '사실', '논리', '분석']

P_word = ['변화', '자율', '융통성']
J_word = ['분명', '체계', '철저']

# Order matters: E, I, S, N, F, T, P, J.
mbti_words = [
    E_word, I_word,
    S_word, N_word,
    F_word, T_word,
    P_word, J_word,
]

In [241]:
from sklearn.metrics.pairwise import cosine_similarity

# One output column per MBTI letter, in the same order as the lists in `mbti_words`.
MBTI_LETTERS = ['E', 'I', 'S', 'N', 'F', 'T', 'P', 'J']
# `texts` is consumed in consecutive chunks of this many reviews, one chunk per movie.
# NOTE(review): assumes every movie has exactly 1200 reviews stored contiguously,
# and that the number of unique values in review['0'] equals the 61 rows kept
# below — confirm against the data.
REVIEWS_PER_MOVIE = 1200
# Score recorded when a similarity cannot be computed (e.g. word not in the vocabulary).
FALLBACK_SIMILARITY = 1e-3

# .copy() gives an independent frame; assigning new columns to a bare slice of
# `movie` is chained assignment (SettingWithCopyWarning).
temp = movie.iloc[:61].copy()

# The averaged review vector of a movie does not depend on the MBTI letter,
# so it is computed once per movie instead of once per (letter, movie) pair.
movie_vectors = []
for chunk_no in range(len(review['0'].unique())):
    start = chunk_no * REVIEWS_PER_MOVIE
    chunk_tokens = [
        data
        for inner_list in texts[start:start + REVIEWS_PER_MOVIE]
        for data in inner_list
    ]
    movie_vectors.append(makeFeatureVec(chunk_tokens, model).reshape(1, -1))

for repeat, axis_words in zip(MBTI_LETTERS, mbti_words):
    total_similar = []

    for vector in movie_vectors:
        # A vector containing NaN/inf cannot be scored; fall back for all its
        # words, which is what the former bare `except` did implicitly.
        vector_usable = bool(np.isfinite(vector).all())
        similarity_list = []

        for word in axis_words:
            try:
                word_vector = model.wv[word]
            except KeyError:
                # Seed word missing from the trained vocabulary.
                similarity_list.append(FALLBACK_SIMILARITY)
                continue

            if not vector_usable:
                similarity_list.append(FALLBACK_SIMILARITY)
                continue

            similarity_list.append(
                cosine_similarity(word_vector.reshape(1, -1), vector)[0][0]
            )

        # Mean similarity between the movie and this letter's seed words.
        total_similar.append(np.sum(similarity_list) / len(axis_words))

    temp[repeat] = total_similar

In [244]:
# Spot check: similarity between one seed word and the first movie's averaged review vector.
# Tokens are flattened in one pass (no quadratic `sum(lists, [])`), and the word
# vector is read through `model.wv` rather than the deprecated `model[...]`.
first_movie_vector = makeFeatureVec(
    [data for inner_list in texts[:1200] for data in inner_list], model
)
cosine_similarity(model.wv['유쾌'].reshape(1, -1), first_movie_vector.reshape(1, -1))[0][0]

0.43292674

In [245]:
# Preview the first three rows, including the eight per-letter similarity columns.
temp.head(3)

Unnamed: 0,movie_id,title,main_link,img_link,img_path,pub_year,user_rating,summary,nation,genre,E,I,S,N,F,T,P,J
0,171539,그린 북,https://movie.naver.com/movie/bi/mi/basic.nhn?...,https://movie.naver.com/movie/bi/mi/photoViewP...,./img/171539.jpg,2019.0,9.59,1962년 미국 입담과 주먹만 믿고 살아가던 토니 발레롱가비고 모텐슨는 교양과 우아...,미국,드라마,0.143855,0.311728,0.176904,0.154328,0.074807,0.16582,0.10864,0.059955
1,174830,가버나움,https://movie.naver.com/movie/bi/mi/basic.nhn?...,https://movie.naver.com/movie/bi/mi/photoViewP...,./img/174830.jpg,2019.0,9.59,나를 세상에 태어나게 한 부모님을 고소하고 싶어요...출생기록조차 없이 살아온 어쩌...,"레바논,프랑스",드라마,0.2373,0.316148,0.192922,0.175665,0.202317,0.224705,0.111336,0.13464
2,192613,디지몬 어드벤처 라스트 에볼루션 인연,https://movie.naver.com/movie/bi/mi/basic.nhn?...,https://movie.naver.com/movie/bi/mi/photoViewP...,./img/192613.jpg,2021.0,9.54,컴퓨터 모니터를 통해 세상 밖으로 나온 의문의 알.타이치와 히카리 남매 앞에 디지몬...,일본,"애니메이션,모험",0.128232,0.194875,0.073097,0.080915,0.132798,0.041093,-0.102754,0.048952


In [246]:
#temp.to_csv('movie_mbti_similarity.csv', encoding='utf-8', index=False)

### 여기서부터 실행 (맨 위 import 셀 실행 후, 저장된 `movie_mbti_similarity.csv`를 불러와 추천 단계만 수행)

In [2]:
# Load the saved per-movie MBTI similarity table so the cells below can run
# without re-tokenising or re-training.
# NOTE(review): presumably the file written by the commented-out
# `temp.to_csv('movie_mbti_similarity.csv', ...)` cell — verify it is up to date.
temp = pd.read_csv("movie_mbti_similarity.csv")

In [3]:
def Label_Normalize(labels, bounds=None):
    """Linearly rescale ``labels`` into the range [-1, 1].

    Parameters
    ----------
    labels : array-like of numbers
        Values to rescale; flattened into a single column.
    bounds : (low, high) or None, default None
        Input values that map to -1 and +1. With ``None`` the minimum and
        maximum of ``labels`` themselves are used (min-max scaling, same
        result as ``MinMaxScaler(feature_range=(-1, 1))`` fitted on the
        labels). Pass e.g. ``(1, 7)`` to scale against a fixed answer scale;
        values outside the bounds are not clipped.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(labels), 1)``.

    Raises
    ------
    ValueError
        If ``labels`` is empty, or ``bounds`` is given with ``high < low``.
    """
    label = np.asarray(labels, dtype=float).reshape(-1, 1)

    if bounds is None:
        # Data-driven range; reductions on an empty array raise ValueError.
        low, high = float(np.nanmin(label)), float(np.nanmax(label))
    else:
        low, high = (float(bound) for bound in bounds)
        if high < low:
            raise ValueError("bounds must be given as (low, high) with low <= high")

    span = high - low
    if span < 10 * np.finfo(float).eps:
        # Constant input: avoid dividing by zero; every value maps to -1.
        span = 1.0

    # Affine map written as x * scale + offset.
    scale = 2.0 / span
    offset = -1.0 - low * scale
    return label * scale + offset

In [5]:
# Sanity check: eight sample answers (minimum 1, maximum 7) rescaled into [-1, 1].
Label_Normalize([1, 5, 2, 4, 5, 1, 7, 3])

array([[-1.        ],
       [ 0.33333333],
       [-0.66666667],
       [ 0.        ],
       [ 0.33333333],
       [-1.        ],
       [ 1.        ],
       [-0.33333333]])

In [6]:
# The last eight columns hold the per-movie similarity scores for E, I, S, N, F, T, P, J.
temp.iloc[:, -8:]

Unnamed: 0,E,I,S,N,F,T,P,J
0,0.143855,0.311728,0.176904,0.154328,0.074807,0.165820,0.108640,0.059955
1,0.237300,0.316148,0.192922,0.175665,0.202317,0.224705,0.111336,0.134640
2,0.128232,0.194875,0.073097,0.080915,0.132798,0.041093,-0.102754,0.048952
3,0.075171,0.253160,0.055551,0.046675,0.124074,0.049829,-0.160054,0.042024
4,0.198201,0.295274,0.177920,0.137660,0.167343,0.205257,0.111980,0.100524
...,...,...,...,...,...,...,...,...
56,0.134950,0.320024,0.158222,0.093443,0.175022,0.166182,0.015627,0.107301
57,0.108622,0.246237,0.200643,0.176537,0.030135,0.106614,0.209547,0.067846
58,0.159602,0.198301,0.197712,0.203157,0.099553,0.135238,0.210894,0.152618
59,0.179183,0.250317,0.247508,0.221815,0.106952,0.179671,0.258503,0.141519


In [16]:
def recommendation(labels, top_n=5):
    """Print the titles of the movies whose MBTI profile is closest to ``labels``.

    Reads the module-level DataFrame ``temp``, which must contain a ``title``
    column and one similarity column per letter E, I, S, N, F, T, P, J.

    Parameters
    ----------
    labels : sequence of 8 numbers
        Strength of each axis in the order E, I, S, N, F, T, P, J. The values
        are rescaled with ``Label_Normalize`` before comparison.
    top_n : int, default 5
        Number of closest movies to print.

    Returns
    -------
    None
        The selected ``title`` Series is printed, nearest movie first.

    Raises
    ------
    ValueError
        If ``labels`` does not contain exactly one value per axis.
    """
    axes = ['E', 'I', 'S', 'N', 'F', 'T', 'P', 'J']

    # The normalised target does not depend on the movie, so compute it once.
    target = np.asarray(Label_Normalize(labels), dtype=float).ravel()
    if target.shape[0] != len(axes):
        # Without this check a wrong-length input could silently broadcast.
        raise ValueError(
            "labels must contain %d values (one per axis), got %d"
            % (len(axes), target.shape[0])
        )

    # Select the profile columns by name rather than by position, so extra or
    # reordered columns in `temp` cannot shift the comparison.
    profiles = temp[axes].to_numpy(dtype=float)

    # Euclidean distance from every movie profile to the target, in one shot.
    distances = np.linalg.norm(profiles - target, axis=1)

    # A stable sort keeps row order among equal distances.
    nearest = np.argsort(distances, kind='stable')[:top_n]
    print(temp.iloc[nearest]['title'])

### Examples

In [22]:
# Input order of the eight values: 'E', 'I', 'S', 'N', 'F', 'T', 'P', 'J'
# Strength of each personality axis: 1 = strongly disagree <---> 7 = strongly agree

In [17]:
recommendation([7, 1, 1, 7, 7, 1, 7, 1]) # ENFP: E, N, F, P at 7; I, S, T, J at 1

37       반지의 제왕 왕의 귀환
39          어벤져스 엔드게임
53      반지의 제왕 두 개의 탑
20    잭 스나이더의 저스티스 리그
13        터미네이터 2오리지널
Name: title, dtype: object


In [18]:
recommendation([7, 1, 1, 7, 5, 1, 5, 1]) # ENFP with weaker F and P (5 instead of 7)

53            반지의 제왕 두 개의 탑
37             반지의 제왕 왕의 귀환
39                어벤져스 엔드게임
13              터미네이터 2오리지널
2     디지몬 어드벤처 라스트 에볼루션  인연
Name: title, dtype: object


In [19]:
recommendation([1, 7, 1, 7, 7, 1, 7, 1]) # INFP: I, N, F, P at 7; E, S, T, J at 1

54        주토피아
40         알라딘
35       헌터 킬러
14     나 홀로 집에
8     포드 V 페라리
Name: title, dtype: object


In [20]:
recommendation([1, 5, 1, 5, 7, 1, 7, 1]) # INFP with weaker I and N (5 instead of 7)

54        주토피아
40         알라딘
35       헌터 킬러
19    보헤미안 랩소디
14     나 홀로 집에
Name: title, dtype: object


- 영화 리뷰 데이터 크롤링 아직 덜 함

- 음악 리뷰 데이터도 크롤링 x

- 불용어 사전 추가

- 성향 분류 단어를 어떤 기준으로 선정할지 결정 필요

- **pretrained word2vec에 리뷰 데이터 추가학습 (전이학습)** https://frhyme.github.io/python-libs/gensim0_word2vec_1google_model/

In [769]:
# The package itself must be imported here: earlier cells only run
# `from gensim.models import word2vec`, which does not bind the name `gensim`.
import gensim

# NOTE(review): gensim's load() unpickles the file — only load model files
# obtained from a trusted source.
ko_model = gensim.models.Word2Vec.load('ko.bin')

# Nearest neighbours of one seed word in the pretrained Korean model.
ko_model.wv.most_similar("활발", topn=5)

[('왕성', 0.7961682081222534),
 ('활발히', 0.7213901281356812),
 ('광범위', 0.6626321077346802),
 ('꾸준', 0.6609542369842529),
 ('빈번', 0.6394011974334717)]