• 자신이 원하는 영화의 리뷰 데이터를 수집하여 Word2Vec 모델을 만들고, 특정 단어를 입력하여 유사 단어를 나열해 보시오.

In [None]:
import requests
import pandas as pd 
from bs4 import BeautifulSoup
import urllib.request
import re
from konlpy.tag import Okt
import matplotlib.pyplot as plt

In [None]:
# Review data for the movie "Parasite" (Naver movie code 161967).
#
# Each base URL ends in "page=" so that the page number can be appended
# directly. (Previously the URLs ended in "page=1", so appending the loop
# counter requested pages 11, 12, ..., 1399 instead of 1..399.)
# The first listing is ordered by highest rating, the second by lowest.
pres = [
    'https://movie.naver.com/movie/bi/mi/pointWriteFormList.naver?code=161967&type=after&onlyActualPointYn=N&onlySpoilerPointYn=N&order=highest&page=',
    'https://movie.naver.com/movie/bi/mi/pointWriteFormList.naver?code=161967&type=after&onlyActualPointYn=N&onlySpoilerPointYn=N&order=lowest&page=',
]

review = []   # review text
rate = []     # integer star rating of the review
target = []   # 1 = positive (rate >= 8), 0 = negative (rate <= 4), -1 = neutral

for pre in pres:
    for page in range(1, 400):
        url = pre + str(page)
        try:
            res = requests.get(url, timeout=10)
        except requests.RequestException:
            # Best-effort scraping: one failed page must not discard
            # everything collected so far.
            continue
        soup = BeautifulSoup(res.content, 'html.parser')

        rate_list = soup.select('div.star_score > em')

        # The review spans are looked up by the ids _filtered_ment_0 .. _9.
        for idx in range(10):
            ment = soup.find('span', {'id': '_filtered_ment_' + str(idx)})
            if ment is None or idx >= len(rate_list):
                # Short or empty page: stop here so that review, rate and
                # target always stay the same length.
                break

            r = int(rate_list[idx].get_text().strip())
            review.append(ment.get_text().strip())
            rate.append(r)
            if r >= 8:
                target.append(1)
            elif r <= 4:
                target.append(0)
            else:
                target.append(-1)


df = pd.DataFrame({'review': review, 'rate': rate, 'target': target})
df

In [None]:
# Save the collected reviews. index=False keeps the DataFrame's row index out
# of the file, so reading it back does not produce an extra "Unnamed: 0" column.
df.to_csv('review_data.csv', index=False)

In [None]:
# Reload the saved reviews, drop every row that contains a missing value,
# and print whether any nulls are still present.
train_data = pd.read_csv('review_data.csv').dropna(how='any')
has_nulls = train_data.isnull().values.any()
print(has_nulls)


In [None]:
# Remove every character that is not Hangul (consonant jamo, vowel jamo,
# complete syllables) or a space.
# regex=True is required: without it, pandas >= 2.0 treats the pattern as a
# literal string and the call silently removes nothing.
train_data['review'] = train_data['review'].str.replace("[^ㄱ-ㅎ ㅏ-ㅣ 가-힣]", "", regex=True)
train_data

In [None]:
# Stopwords excluded from the tokenized reviews. Some entries are repeated;
# the list is kept exactly as authored (same items, same order).
stopwords = [
    '으로', '이다', 'ㅋㅋ', '그래서', 'ㅎㅎ', 'ㅠㅠ', '습니다', '있다', '했는지', '있게',
    'ㅠ', '~~', '^^', '이라는걸', '이해', '있었다', '한다', '인다', '않다', '그리고',
    '의', '가', '로', '에서', '진짜', '이고', '없다', '으로', '정말', '영화',
    '하다', '입니다', '합니다', '씩', '때', '에', '차라리', '오다', '왔다', '같다',
    '에요', '니다', '했다', '그냥', '가장', '에게', '까지', '진짜', '싶다', '보다',
    '했습니다', '된다', '봤는데', '본다', '했는데', '아쉽다', '역시', 'ㄹㅇ', 'ㅜ', '처음',
    'ㄷㄷ', ';;;', '그리고', '너무', '느끼다', '부터', '생각', '!!!', '~~~', '근데',
    '팝콘', '정도', '하고', '한테', '아니다', '되어다', '안된다', '인데', '어떻다', '인가',
    '라고', '처럼', '되다', '깊다', '남다', '받다', '대한', '보이다', '남다', '나다',
    '들다', '이렇다', '아주', '때문', '맞다', '가다', '밖에', '나오다', '하나', '보지',
    '라는', '지다', '주다', '내다', '라는', '알다', '에는', '이라', '하지만', '말다',
    '맞다', '넘다', '라는', '자다', '지만', '이렇다', '이나',
]

In [None]:
# Tokenization: split every review into stemmed morphemes with Okt, then keep
# only tokens that are at least two characters long and are not stopwords.
okt = Okt()

# Build the lookup set once so each membership test is O(1) instead of a
# scan over the whole stopword list for every token.
stopword_set = set(stopwords)

tokenized_data = []
for sentence in train_data['review']:
    tokens = okt.morphs(sentence, stem=True)
    tokenized_data.append(
        [word for word in tokens if len(word) > 1 and word not in stopword_set]
    )

In [None]:
# Token count of every review, computed once and reused for the statistics
# and the histogram.
review_lengths = [len(tokens) for tokens in tokenized_data]

# Fall back to 0 when no reviews were collected, instead of crashing on
# max() of an empty sequence or on a division by zero.
print("리뷰의 최대 길이:", max(review_lengths, default=0))
print("리뷰의 평균 길이:", sum(review_lengths) / len(review_lengths) if review_lengths else 0)

# Histogram of review lengths (in tokens).
plt.hist(review_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
from gensim.models import Word2Vec

# Train a Word2Vec model on the tokenized reviews: 100-dimensional vectors,
# context window of 5, words seen fewer than 5 times are ignored.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` — confirm the installed version before upgrading.
model = Word2Vec(
    sentences=tokenized_data,
    size=100,
    window=5,
    min_count=5,
    workers=4,
    sg=0,
)

# Display the shape of the learned embedding matrix.
model.wv.vectors.shape

In [None]:
# Ask for 10 query words and print the most similar words for each one.
for _ in range(10):
    query = input().strip()
    try:
        result = model.wv.most_similar([query])
    except KeyError:
        # The word is not in the model's vocabulary (for example it occurred
        # fewer than min_count times). Report it and continue with the next
        # query rather than aborting the whole loop.
        print(f"'{query}' 단어는 모델의 단어 집합에 없습니다.")
    else:
        print(result)
    print("-"*120)

In [None]:
#전처리된 단어들 그래프로 표현하기
from sklearn.decomposition import PCA
from matplotlib import font_manager, rc

In [None]:
# Train a second model that keeps only frequent words (min_count=70) and
# collect its vocabulary together with the matching word vectors.
# NOTE(review): `sentences` holds just the second tokenized review and is not
# passed to Word2Vec below, which trains on all of tokenized_data.
sentences = [tokenized_data[1]]

model = Word2Vec(
    sentences=tokenized_data,
    size=100,
    min_count=70,
    workers=4,
    sg=0,
)
word_vectors = model.wv

# NOTE(review): `.vocab` is the gensim 3.x vocabulary mapping — confirm the
# installed gensim version.
vocabs = word_vectors.vocab.keys()
word_vectors_list = [word_vectors[word] for word in vocabs]

In [None]:
# Print the vocabulary keys of the current model's word vectors.
print(vocabs)

In [None]:
# Reduce the word vectors to two principal components and split the result
# into x and y coordinates for plotting.
pca = PCA(n_components=2)
xys = pca.fit_transform(word_vectors_list)
xs, ys = xys[:, 0], xys[:, 1]

In [None]:
import matplotlib as mpl
# Print the directory matplotlib uses for its cache.
print ('캐시 위치: ', mpl.get_cachedir())

# Notebook shell commands (IPython "!" syntax, not plain Python):
# install the Nanum font package, rebuild the system font cache, and delete
# ~/.cache/matplotlib.
# NOTE(review): presumably the cache is removed so matplotlib picks up the
# newly installed fonts; a kernel restart may also be needed — confirm.
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

In [None]:
import matplotlib.pyplot as plt

# Make NanumBarunGothic the default font family for all following plots
# (presumably so that Korean labels are drawn correctly).
plt.rcParams['font.family'] = 'NanumBarunGothic'

In [None]:
# Read the family name from the NanumGothic font file and set it as the
# default font family.
font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
font = font_manager.FontProperties(fname=font_path).get_name()
rc('font', family=font)

# Scatter the 2-D projected word vectors and label each point with its word.
plt.figure(figsize=(20, 13))
plt.scatter(xs, ys, marker='o')
for word, x, y in zip(vocabs, xs, ys):
    plt.annotate(word, xy=(x, y))

In [None]:
from numpy import dot
from numpy.linalg import norm
import numpy as np


def cos_sim(A, B):
    """Return the cosine similarity of vectors *A* and *B*.

    Computed as dot(A, B) / (norm(A) * norm(B)).

    If either vector has zero norm the similarity is undefined; 0.0 is
    returned in that case instead of performing a 0/0 division (which would
    yield nan and a RuntimeWarning).
    """
    denominator = norm(A) * norm(B)
    if denominator == 0:
        return 0.0
    return dot(A, B) / denominator


doc1 = np.array([3, 4])
doc2 = np.array([-1, 2])

print(cos_sim(doc1, doc2))