## 1. 필요 라이브러리 설치

In [1]:
# 한글 형태소 분석기인 라이노(RHINO) 라이브러리를 사용
!pip install rhinoMorph

Collecting rhinoMorph
  Downloading rhinoMorph-4.0.1.12-py3-none-any.whl (3.0 MB)
Installing collected packages: rhinoMorph
Successfully installed rhinoMorph-4.0.1.12


In [2]:
!pip install JPype1

Collecting JPype1
  Downloading JPype1-1.4.1-cp39-cp39-win_amd64.whl (345 kB)
Installing collected packages: JPype1
Successfully installed JPype1-1.4.1


In [3]:
!pip install scikit-learn



In [193]:
import pandas as pd
import numpy as np
import rhinoMorph
import re
from math import sqrt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

import warnings
warnings.filterwarnings("ignore")

## 2. 데이터 로드

In [4]:
df = pd.read_csv("..data/review_data.csv")

In [5]:
df

Unnamed: 0,id,times,rating,reviews,제품명
0,인누,2023.12.05,5,용량이 커서 부담없이 쓰기 좋아요. 자극 없이 쓸 수 있어서 좋습니다. 추천합니다.,아벤느 오 떼르말
1,얼라이부,2023.12.03,5,대용량이라서 막 쓰기 좋고 무엇보다 향이 없어서 제일 좋아요!,아벤느 오 떼르말
2,양송이캐기,2023.12.03,5,겨울이라 스킨로션 전 단계에서 써주고 있습니다쓰기 전 보다 속 건조가 줄어서 좋아오...,아벤느 오 떼르말
3,핫도그7385,2023.12.01,5,좋다는 말 많이 들어서 써봤는데 온천수리 그런지 자극없이 너무 좋아요 양도 많고요...,아벤느 오 떼르말
4,영팡,2023.11.30,5,세수하고 나오면 제일 먼저 뿌리는 미스트에요대용량 너무 편하고 좋아요,아벤느 오 떼르말
...,...,...,...,...,...
5393,mj0****,2021.05.10,1,매장 직원분이 증정품 들어있는 거라길래 샀는데 안 들어있음 ㅋㅋㅋ 그럼 굳이 같은 ...,skinfood
5394,ciel****,2021.12.14,1,효과도 보기전에 향때문에 속이 안좋아지네요 향에 민감한편 아닌데도 저랑은 정말 안...,skinfood
5395,kd****,2021.09.16,1,패드가 너무 두꺼워서 얆게 착 붙어있는 느낌이 아니라 뭔가 얹어놓은것 같은 느낌이에...,skinfood
5396,thwldus****,2021.08.01,1,사실 기대하고 구매했는데 쓰자마자 얼굴 화끈거림과 간지러움이 느껴졌어요. 그리고 다...,skinfood


In [7]:
df_all = df.copy()

## 3. 논문(5-1단계) 리뷰 데이터 수집(Data Collecting)
#### 평점이 1점에서 3점의 경우인 리뷰는 0(부정)으로, 4점에서 5점은 1(긍정)로 레이블(Label)을 부여한다.

In [8]:
# 리뷰에 따라 긍부정 라벨링, 함수
def label_review(score):
    if score <= 3:
        return 0
    else:
        return 1

In [9]:
# 라벨링 적용
df_all['label'] = df_all['rating'].apply(label_review)

In [10]:
df_all

Unnamed: 0,id,times,rating,reviews,제품명,label
0,인누,2023.12.05,5,용량이 커서 부담없이 쓰기 좋아요. 자극 없이 쓸 수 있어서 좋습니다. 추천합니다.,아벤느 오 떼르말,1
1,얼라이부,2023.12.03,5,대용량이라서 막 쓰기 좋고 무엇보다 향이 없어서 제일 좋아요!,아벤느 오 떼르말,1
2,양송이캐기,2023.12.03,5,겨울이라 스킨로션 전 단계에서 써주고 있습니다쓰기 전 보다 속 건조가 줄어서 좋아오...,아벤느 오 떼르말,1
3,핫도그7385,2023.12.01,5,좋다는 말 많이 들어서 써봤는데 온천수리 그런지 자극없이 너무 좋아요 양도 많고요...,아벤느 오 떼르말,1
4,영팡,2023.11.30,5,세수하고 나오면 제일 먼저 뿌리는 미스트에요대용량 너무 편하고 좋아요,아벤느 오 떼르말,1
...,...,...,...,...,...,...
5393,mj0****,2021.05.10,1,매장 직원분이 증정품 들어있는 거라길래 샀는데 안 들어있음 ㅋㅋㅋ 그럼 굳이 같은 ...,skinfood,0
5394,ciel****,2021.12.14,1,효과도 보기전에 향때문에 속이 안좋아지네요 향에 민감한편 아닌데도 저랑은 정말 안...,skinfood,0
5395,kd****,2021.09.16,1,패드가 너무 두꺼워서 얆게 착 붙어있는 느낌이 아니라 뭔가 얹어놓은것 같은 느낌이에...,skinfood,0
5396,thwldus****,2021.08.01,1,사실 기대하고 구매했는데 쓰자마자 얼굴 화끈거림과 간지러움이 느껴졌어요. 그리고 다...,skinfood,0


In [11]:
# 라벨링 결과 확인, 
# 긍정 데이터 약 94% 부정 데이텅 약 6%
df_all['label'].value_counts()

label
1    3799
0    1599
Name: count, dtype: int64

## 4. 논문(5-2단계) 리뷰 데이터 전처리(Preprocessing

In [12]:
# 라이노 형태소 분석기 초기화
rhn = rhinoMorph.startRhino()

# 형태소 분석을 위한 전처리 함수
def preprocess_review(review, rhn):
    # 텍스트 정제, 한글 문자와 공백을 제외한 모든 문자
    cleaned_review = re.sub(r'[^가-힣\s]', '', review)

    # 형태소 분석, 필요한 품사(명사, 동사, 형용사)만 추출
    morphs = rhinoMorph.onlyMorph_list(rhn, cleaned_review, pos=['NNG', 'NNP', 'VV', 'VA'], eomi=True)
    
    return ' '.join(morphs)

filepath:  C:\Users\seung\Anaconda3\envs\datawork\Lib\site-packages
classpath:  C:\Users\seung\Anaconda3\envs\datawork\Lib\site-packages\rhinoMorph/lib/rhino.jar
RHINO started!


In [13]:
# 'review' 컬럼에 형태소 분석 적용
df_all['processed_review'] = df_all['reviews'].apply(lambda x: preprocess_review(x, rhn))

In [14]:
df_all

Unnamed: 0,id,times,rating,reviews,제품명,label,processed_review
0,인누,2023.12.05,5,용량이 커서 부담없이 쓰기 좋아요. 자극 없이 쓸 수 있어서 좋습니다. 추천합니다.,아벤느 오 떼르말,1,용량 크다 부담 쓰기 좋다 자극 쓸다 있다 좋다
1,얼라이부,2023.12.03,5,대용량이라서 막 쓰기 좋고 무엇보다 향이 없어서 제일 좋아요!,아벤느 오 떼르말,1,대용량 쓰기 좋다 향이 없다 좋다
2,양송이캐기,2023.12.03,5,겨울이라 스킨로션 전 단계에서 써주고 있습니다쓰기 전 보다 속 건조가 줄어서 좋아오...,아벤느 오 떼르말,1,겨울 스킨로션 단계 쓰다 쓰다 전 속 건조 줄 좋다 오다 흡수 날 되다 같다
3,핫도그7385,2023.12.01,5,좋다는 말 많이 들어서 써봤는데 온천수리 그런지 자극없이 너무 좋아요 양도 많고요...,아벤느 오 떼르말,1,좋다 말 듣다 쓰다 온천수 그렇다 자극 좋다 양도 많다
4,영팡,2023.11.30,5,세수하고 나오면 제일 먼저 뿌리는 미스트에요대용량 너무 편하고 좋아요,아벤느 오 떼르말,1,나오다 뿌리다 미스 트다 대용량 편하다 좋다
...,...,...,...,...,...,...,...
5393,mj0****,2021.05.10,1,매장 직원분이 증정품 들어있는 거라길래 샀는데 안 들어있음 ㅋㅋㅋ 그럼 굳이 같은 ...,skinfood,0,매장 직원 증정품 들다 사다 알다 들다 있다 같다 금액 주다 사다
5394,ciel****,2021.12.14,1,효과도 보기전에 향때문에 속이 안좋아지네요 향에 민감한편 아닌데도 저랑은 정말 안...,skinfood,0,효과 보기 전 향 속 좋다 아지 향 민감 한편 알다 맞다
5395,kd****,2021.09.16,1,패드가 너무 두꺼워서 얆게 착 붙어있는 느낌이 아니라 뭔가 얹어놓은것 같은 느낌이에...,skinfood,0,패드 두껍다 붙다 느낌 얹다 놓다 같다 느낌 요하 쓰다 뾰루지
5396,thwldus****,2021.08.01,1,사실 기대하고 구매했는데 쓰자마자 얼굴 화끈거림과 간지러움이 느껴졌어요. 그리고 다...,skinfood,0,기대다 쓰다 얼굴 간지럽다 느끼다 다음 얼굴 뒤집다 용량 크다 아깝다 못쓰다 성분 보다


## 5. 논문(5-3단계) 사전 구축(Make a dictionary)

In [18]:
# 사용자 정의 불용어 목록
# custom_stop_words = ['아','휴','아이구','아이쿠','아이고','어','나','우리','저희','따라','의해','을','를','에','의','가','으로','로','에게','뿐이다','의거하여','근거하여','입각하여','기준으로','예하면','예를 들면','예를 들자면','저','소인','소생','저희','지말고','하지마','하지마라','다른','물론','또한','그리고','비길수 없다','해서는 안된다','뿐만 아니라','만이 아니다','만은 아니다','막론하고','관계없이','그치지 않다','그러나','그런데','하지만','든간에','논하지 않다','따지지 않다','설사','비록','더라도','아니면','만 못하다','하는 편이 낫다','불문하고','향하여','향해서','향하다','쪽으로','틈타','이용하여','타다','오르다','제외하고','이 외에','이 밖에','하여야','비로소','한다면 몰라도','외에도','이곳','여기','부터','기점으로','따라서','할 생각이다','하려고하다','이리하여','그리하여','그렇게 함으로써','하지만','일때','할때','앞에서','중에서','보는데서','으로써','로써','까지','해야한다','일것이다','반드시','할줄알다','할수있다','할수있어','임에 틀림없다','한다면','등','등등','제','겨우','단지','다만','할뿐','딩동','댕그','대해서','대하여','대하면','훨씬','얼마나','얼마만큼','얼마큼','남짓','여','얼마간','약간','다소','좀','조금','다수','몇','얼마','지만','하물며','또한','그러나','그렇지만','하지만','이외에도','대해 말하자면','뿐이다','다음에','반대로','반대로 말하자면','이와 반대로','바꾸어서 말하면','바꾸어서 한다면','만약','그렇지않으면','까악','툭','딱','삐걱거리다','보드득','비걱거리다','꽈당','응당','해야한다','에 가서','각','각각','여러분','각종','각자','제각기','하도록하다','와','과','그러므로','그래서','고로','한 까닭에','하기 때문에','거니와','이지만','대하여','관하여','관한','과연','실로','아니나다를가','생각한대로','진짜로','한적이있다','하곤하였다','하','하하','허허','아하','거바','와','오','왜','어째서','무엇때문에','어찌','하겠는가','무슨','어디','어느곳','더군다나','하물며','더욱이는','어느때','언제','야','이봐','어이','여보시오','흐흐','흥','휴','헉헉','헐떡헐떡','영차','여차','어기여차','끙끙','아야','앗','아야','콸콸','졸졸','좍좍','뚝뚝','주룩주룩','솨','우르르','그래도','또','그리고','바꾸어말하면','바꾸어말하자면','혹은','혹시','답다','및','그에 따르는','때가 되어','즉','지든지','설령','가령','하더라도','할지라도','일지라도','지든지','몇','거의','하마터면','인젠','이젠','된바에야','된이상','만큼\t어찌됏든','그위에','게다가','점에서 보아','비추어 보아','고려하면','하게될것이다','일것이다','비교적','좀','보다더','비하면','시키다','하게하다','할만하다','의해서','연이서','이어서','잇따라','뒤따라','뒤이어','결국','의지하여','기대여','통하여','자마자','더욱더','불구하고','얼마든지','마음대로','주저하지 않고','곧','즉시','바로','당장','하자마자','밖에 안된다','하면된다','그래','그렇지','요컨대','다시 말하자면','바꿔 말하면','즉','구체적으로','말하자면','시작하여','시초에','이상','허','헉','허걱','바와같이','해도좋다','해도된다','게다가','더구나','하물며','와르르','팍','퍽','펄렁','동안','이래','하고있었다','이었다','에서','로부터','까지','예하면','했어요','해요','함께','같이','더불어','마저','마저도','양자','모두','습니다','가까스로','하려고하다','즈음하여','다른','다른 방면으로','해봐요','습니까','했어요','말할것도 없고','무릎쓰고','개의치않고','하는것만 못하다','하는것이 낫다','매','매번','들','모','어느것','어느','로써','갖고말하자면','어디','어느쪽','어느것','어느해','어느 년도','라 해도','언젠가','어떤것','어느것','저기','저쪽','저것','그때','그럼','그러면','요만한걸','그래','그때','저것만큼','그저','이르기까지','할 줄 안다','할 힘이 있다','너','너희','당신','어찌','설마','차라리','할지언정','할지라도','할망정','할지언정','구토하다','게우다','토하다','메쓰겁다','옆사람','퉤','쳇','의거하여','근거하여','의해','따라','힘입어','그','다음','버금','두번째로','기타','첫번째로','나머지는','그중에서','견지에서','형식으로 쓰여','입장에서','위해서','단지','의해되다','하도록시키다','뿐만아니라','반대로','전후','전자','앞의것','잠시','잠깐','하면서','그렇지만','다음에','그러한즉','그런즉','남들','아무거나','어찌하든지','같다','비슷하다','예컨대','이럴정도로','어떻게','만약','만일','위에서 서술한바와같이','인 듯하다','하지 않는다면','만약에','무엇','무슨','어느','어떤','아래윗','조차','한데','그럼에도 불구하고','여전히','심지어','까지도','조차도','하지 않도록','않기 위하여','때','시각','무렵','시간','동안','어때','어떠한','하여금','네','예','우선','누구','누가 알겠는가','아무도','줄은모른다','줄은 몰랏다','하는 김에','겸사겸사','하는바','그런 까닭에','한 이유는','그러니','그러니까','때문에','그','너희','그들','너희들','타인','것','것들','너','위하여','공동으로','동시에','하기 위하여','어찌하여','무엇때문에','붕붕','윙윙','나','우리','엉엉','휘익','윙윙','오호','아하','어쨋든','만 못하다\t하기보다는','차라리','하는 편이 낫다','흐흐','놀라다','상대적으로 말하자면','마치','아니라면','쉿','그렇지 않으면','그렇지 않다면','안 그러면','아니었다면','하든지','아니면','이라면','좋아','알았어','하는것도','그만이다','어쩔수 없다','하나','일','일반적으로','일단','한켠으로는','오자마자','이렇게되면','이와같다면','전부','한마디','한항목','근거로','하기에','아울러','하지 않도록','않기 위해서','이르기까지','이 되다','로 인하여','까닭으로','이유만으로','이로 인하여','그래서','이 때문에','그러므로','그런 까닭에','알 수 있다','결론을 낼 수 있다','으로 인하여','있다','어떤것','관계가 있다','관련이 있다','연관되다','어떤것들','에 대해','이리하여','그리하여','여부','하기보다는','하느니','하면 할수록','운운','이러이러하다','하구나','하도다','다시말하면','다음으로','에 있다','에 달려 있다','우리','우리들','오히려','하기는한데','어떻게','어떻해','어찌됏어','어때','어째서','본대로','자','이','이쪽','여기','이것','이번','이렇게말하자면','이런','이러한','이와 같은','요만큼','요만한 것','얼마 안 되는 것','이만큼','이 정도의','이렇게 많은 것','이와 같다','이때','이렇구나','것과 같이','끼익','삐걱','따위','와 같은 사람들','부류의 사람들','왜냐하면','중의하나','오직','오로지','에 한하다','하기만 하면','도착하다','까지 미치다','도달하다','정도에 이르다','할 지경이다','결과에 이르다','관해서는','여러분','하고 있다','한 후','혼자','자기','자기집','자신','우에 종합한것과같이','총적으로 보면','총적으로 말하면','총적으로','대로 하다','으로서','참','그만이다','할 따름이다','쿵','탕탕','쾅쾅','둥둥','봐','봐라','아이야','아니','와아','응','아이','참나','년','월','일','령','영','일','이','삼','사','오','육','륙','칠','팔','구','이천육','이천칠','이천팔','이천구','하나','둘','셋','넷','다섯','여섯','일곱','여덟','아홉','령','영']

In [19]:
# TF-IDF 벡터화, custom 불용어 적용
# tfidf_vectorizer = TfidfVectorizer(stop_words=custom_stop_words)
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df_all['processed_review'])

In [20]:
# 고유 단어 목록 가져오기
feature_names = tfidf_vectorizer.get_feature_names_out()

# DataFrame으로 변환
df_tfidf = pd.DataFrame(X.toarray(), columns=feature_names)

# 단어 매트릭스 시각화 확인
df_tfidf

Unnamed: 0,가게,가격,가격대,가까이,가깝다,가꾸다,가너,가늘다,가능,가능성,...,희열감,흰둥이,흰색,히알루론산,히터,히팅,힐러,힘들다,힘없다,힘입다
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# 원본 리뷰 텍스트와 라벨을 함께 분리
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    df_all['processed_review'], df_all['label'], test_size=0.2, random_state=42)

In [22]:
# TF-IDF 벡터화 (훈련 데이터에만 fit_transform 적용)
X_train = tfidf_vectorizer.fit_transform(X_train_texts)
X_test = tfidf_vectorizer.transform(X_test_texts)

In [23]:
# 교차 검증을 위한 파라미터 범위 설정
alphas = np.logspace(-6, 6, 13)

## 6. 모델링 (릿지, 라쏘, 엘라스틱넷 회귀)

In [24]:
# 릿지 회귀
ridge_cv = RidgeCV(alphas=alphas, cv=5)
ridge_cv.fit(X_train, y_train)

In [25]:
# 라쏘 회귀
lasso_cv = LassoCV(alphas=alphas, cv=5)
lasso_cv.fit(X_train, y_train)

In [26]:
# 엘라스틱넷
elastic_cv = ElasticNetCV(alphas=alphas, cv=5)
elastic_cv.fit(X_train, y_train)

In [27]:
# TF-IDF 벡터화 과정에서 생성된 고유 단어 목록을 가져옵니다.
feature_names = tfidf_vectorizer.get_feature_names_out()

In [39]:
# 감정 점수 계산 함수
def calculate_sentiment_score(model_coefficients, feature_names, text):
    score = 0
    words = text.split()
    for word in words:
        if word in feature_names:
            index = np.where(feature_names == word)[0][0]
            score += model_coefficients[index]
    return score

In [40]:
# 모델 성능 평가 함수
def evaluate_model_sentiment(model, X_texts, y, feature_names):
    test_scores = [calculate_sentiment_score(model.coef_, feature_names, text) 
                   for text in X_texts]
    predicted_labels = [1 if score > 0 else 0 for score in test_scores]
    accuracy = accuracy_score(y, predicted_labels)
    return accuracy

## 7. 모델 성능 평가

In [41]:
# 릿지 모델 평가
ridge_test_accuracy = evaluate_model_sentiment(ridge_cv, X_test_texts, y_test, feature_names)
print(f"논문 방식 - 릿지 회귀 정확도: {ridge_test_accuracy}")

논문 방식 - 릿지 회귀 정확도: 0.8666666666666667


In [31]:
# 라쏘 모델 평가
lasso_test_accuracy = evaluate_model_sentiment(lasso_cv, X_test_texts, y_test, feature_names)
print(f"논문 방식 - 라쏘 회귀 정확도: {lasso_test_accuracy}")

논문 방식 - 라쏘 회귀 정확도: 0.8601851851851852


In [32]:
# 엘라스틱넷 모델 평가
elastic_test_accuracy = evaluate_model_sentiment(elastic_cv, X_test_texts, y_test, feature_names)
print(f"논문 방식 - 엘라스틱넷 정확도: {elastic_test_accuracy}")

논문 방식 - 엘라스틱넷 정확도: 0.8648148148148148


## 8. 감정사전 추출

In [42]:
# 감정사전 추출 함수
def extract_sentiment_dictionary(model, feature_names):
    coefficients = model.coef_
    positive_words = feature_names[coefficients > 0]
    negative_words = feature_names[coefficients < 0]
    return positive_words, negative_words

### 1) 릿지 모델 감정사전 

In [43]:
# 릿지 모델 감정사전 추출
positive_words_ridge, negative_words_ridge = extract_sentiment_dictionary(ridge_cv, feature_names)
print("릿지 회귀 - 긍정 사전:", positive_words_ridge)
print("릿지 회귀 - 부정 사전:", negative_words_ridge)

릿지 회귀 - 긍정 사전: ['가격' '가격대' '가까이' ... '히알루론산' '히터' '히팅']
릿지 회귀 - 부정 사전: ['가너' '가늘다' '가능성' ... '흰색' '힘들다' '힘입다']


In [44]:
# 긍정 단어 개수
len(positive_words_ridge)

2486

In [45]:
# 부정 단어 개수
len(negative_words_ridge)

2301

### 2) 라쏘 모델 감정사전 

In [46]:
positive_words_lasso, negative_words_lasso = extract_sentiment_dictionary(lasso_cv, feature_names)
print("라쏘 회귀 - 긍정 사전:", positive_words_lasso)
print("라쏘 회귀 - 부정 사전:", negative_words_lasso)

라쏘 회귀 - 긍정 사전: ['가격' '가격대' '가능' '가라앉다' '가볍다' '가성' '가을' '갈색' '감도' '강추' '같다' '거의' '건조' '겨울'
 '겨울철' '계절' '고민' '곱다' '공병' '관리하다' '괜찮다' '구달' '구매' '구입' '궁금하다' '귀엽다' '그레이'
 '기간' '기대중' '기본' '기초' '기획' '긷다' '까다' '끄다' '끈적거리다' '끈적이다' '나누다' '날씨' '남다'
 '내추럴' '넘어오다' '높다' '늘다' '다가오다' '다음' '닥터' '단계' '단종' '달다' '당근' '당기다' '대박'
 '대신' '데일리' '도움' '독도' '돌다' '돌아오다' '되다' '두다' '두통' '드라마틱' '드릴' '들다' '따끔거리다'
 '따다' '따르다' '뜨거워지다' '라이브' '레티' '롤링' '리본' '마녀' '마스카라' '마음' '막이' '만족' '많다'
 '말해' '매다' '매일' '맨들다' '먹다' '메이크업' '멜론' '모공' '모으다' '무겁다' '무디다' '물기' '묽다'
 '미세' '미치다' '믿다' '바르다' '바이오' '발라' '발레리나' '변하다' '보습' '보습력' '보이다' '부결'
 '부드럽다' '분위기' '붙이다' '블러' '비우다' '비타민' '빠르다' '빨리다' '뽀얗다' '사계절' '사고' '사라지다'
 '사랑하다' '사용' '새다' '샘물' '샤워' '선물' '세럼' '세수' '세안' '세일' '세트' '소량' '소문나다' '속다'
 '속보' '수분' '순하다' '스미다' '스크럽' '스킨' '스트레스' '신기다' '싫어하다' '써보다' '썸머' '쏘다' '쓰기'
 '쓰다' '쓰리다' '쓸다' '아이템' '아지' '아침' '아토피' '아프다' '안다' '애교' '애정' '앰플' '약간'
 '어머니' '어용' '언니' '얹다' '얼마' '엄다' '엄마' '에스' '영세일' '영양' '예쁘다' '예정' '오래가다'
 '오일' '올라가다' '올리다' '요즘' '욕실' '용도' '용량' 

In [47]:
# 긍정 단어 개수
len(positive_words_lasso)

287

In [48]:
#부정 단어 개수
len(negative_words_lasso)

292

### 3) 엘라스틱넷 모델 감정사전 

In [49]:
positive_words_elastic, negative_words_elastic = extract_sentiment_dictionary(elastic_cv, feature_names)
print("엘라스틱넷 - 긍정 사전:", positive_words_elastic)
print("엘라스틱넷 - 부정 사전:", negative_words_elastic)

엘라스틱넷 - 긍정 사전: ['가격' '가격대' '가깝다' '가능' '가대' '가라앉다' '가리다' '가볍다' '가성' '가스' '가을' '가족' '간지'
 '간편' '갈라지다' '갈색' '갈웜' '감도' '강추' '같다' '개다' '거리다' '거부감' '거의' '걱정' '건아' '건조'
 '걸다' '겨울' '겨울철' '계절' '고르다' '고맙다' '고민' '고민중' '고민하' '고보' '고입' '고장' '고정'
 '고통' '골다' '곱다' '공병' '관리하다' '괜찮다' '구달' '구매' '구매사' '구입' '굵다' '궁금하다' '궁합'
 '귀여움' '귀엽다' '그레이' '그립' '극소량' '글로우' '긋다' '기간' '기대중' '기본' '기분' '기억' '기존'
 '기준' '기초' '기회' '기획' '긷다' '김미연' '까다' '까다롭다' '꺼내다' '꼽다' '꾸다' '끄다' '끈적거리다'
 '끈적이다' '끝판' '나누다' '나아지다' '나이' '나이트' '나타나다' '날씨' '남다' '남자' '남편' '내추럴'
 '냉장고' '넘어가다' '넘어오다' '넘치다' '녹색' '높다' '높이다' '누다' '누드' '눈앞' '눕다' '늘다' '니드'
 '니트' '다가오다' '다발' '다양' '다음' '다음날' '다음번' '다이' '다이어트' '닥터' '닦이다' '단계' '단골'
 '단점' '단종' '달다' '달라다' '담날' '담다' '당근' '당기다' '대박' '대신' '덜다' '데일리' '도드라지다'
 '도움' '독도' '독하다' '돌다' '돌아오다' '되다' '두께' '두다' '두들기다' '두통' '드라마틱' '드릴' '들다'
 '따끔거리다' '따다' '따르다' '딸리다' '때다' '뜨거워지다' '라벤더' '라이브' '레트' '레티' '롤링' '루틴'
 '리본' '마구' '마녀' '마스카라' '마음' '막이' '만그' '만족' '만지다' '만하다' '많다' '말리다' '말해'
 '맑다' '맛사지' '망설이다' '매다' '매일' '맨들다' '먹다' '메이크업' 

In [50]:
# 긍정 단어 개수
len(positive_words_elastic)

548

In [51]:
# 부정 단어 개수
len(negative_words_elastic)

585

## 9. 논문(7단계 평점 예측) 베이스 사용자 기반 협업 필터링(User-Based Collaborative Filtering, UBCF) 구축

### 논문 추천 시스템 및 검증 방법 설명 : 논문에서는 크게 추천 시스템을 4가지(사용자기반, 아이템기반, SVD, SVD++ ) 구축하여 비교분석 하고 있습니다.

- 사용자 기반 협업 필터링 (User-based Collaborative Filtering, UBCF): 이 모델은 사용자가 부여한 평점 정보를 이용하여 사용자와 유사한 성향을 갖는 이웃 사용자를 선별합니다. 그런 다음, 이 이웃들이 공통적으로 선호하는 아이템을 해당 사용자에게 추천합니다​​.

- 아이템 기반 협업 필터링 (Item-based Collaborative Filtering, IBCF): 이 방식은 특정 아이템을 기준으로 하여 사용자들에 의해 평가된 점수가 유사한 다른 아이템을 찾아내고, 이 정보를 바탕으로 사용자가 특정 아이템에 대해 가질 것으로 예상되는 평점을 예측합니다.

- Matrix Factorization 기반 모델 (SVD, SVD++): 이 모델들은 Matrix Factorization을 사용하여 사용자와 아이템 간의 관계를 분해하고 예측 평점을 계산합니다. SVD(Singular Value Decomposition)와 SVD++는 이 분야에서 대표적으로 사용되는 알고리즘입니다​

이후 (네가지 모델의 성능) vs (네가지 [모델 + 감정점수 반영] 의 성능) 을 비교분석 합니다.

아래 코드는 네가지 모델중 (사용자기반 협업필터링), (아이템기반 협업필터링) 모델과 감정점수를 반영한 모델의 성능차이를 비교하는 코드입니다.

결과는 논문과는 다르게, 감정점수를 반영한 모델의 성능이 더 낮게 나왔습니다.

이는 평점이 1과 5에 극단적으로 치우쳐져있는 화장품리뷰의 특성때문일 가능성이 높아보이며, 

아래 코드중 사용자가 구매한 제품수의 분포를 보면 데이터의 약 90%(3829명)이 제품을 한개만 구매한것으로 나타납니다. 

이러한 데이터 분포로 협업필터링이 제 기능을 충분히 발휘하지 못하고 있을 가능성이 큽니다.

또한 해당 모델이 기본적인 협업필터링 모델이기 때문에 모델의 알고리즘 고도화 방법에따라 결과가 다소 달라질 수 있어보입니다.

In [135]:
df

Unnamed: 0,id,times,rating,reviews,제품명
0,인누,2023.12.05,5,용량이 커서 부담없이 쓰기 좋아요. 자극 없이 쓸 수 있어서 좋습니다. 추천합니다.,아벤느 오 떼르말
1,얼라이부,2023.12.03,5,대용량이라서 막 쓰기 좋고 무엇보다 향이 없어서 제일 좋아요!,아벤느 오 떼르말
2,양송이캐기,2023.12.03,5,겨울이라 스킨로션 전 단계에서 써주고 있습니다쓰기 전 보다 속 건조가 줄어서 좋아오...,아벤느 오 떼르말
3,핫도그7385,2023.12.01,5,좋다는 말 많이 들어서 써봤는데 온천수리 그런지 자극없이 너무 좋아요 양도 많고요...,아벤느 오 떼르말
4,영팡,2023.11.30,5,세수하고 나오면 제일 먼저 뿌리는 미스트에요대용량 너무 편하고 좋아요,아벤느 오 떼르말
...,...,...,...,...,...
5393,mj0****,2021.05.10,1,매장 직원분이 증정품 들어있는 거라길래 샀는데 안 들어있음 ㅋㅋㅋ 그럼 굳이 같은 ...,skinfood
5394,ciel****,2021.12.14,1,효과도 보기전에 향때문에 속이 안좋아지네요 향에 민감한편 아닌데도 저랑은 정말 안...,skinfood
5395,kd****,2021.09.16,1,패드가 너무 두꺼워서 얆게 착 붙어있는 느낌이 아니라 뭔가 얹어놓은것 같은 느낌이에...,skinfood
5396,thwldus****,2021.08.01,1,사실 기대하고 구매했는데 쓰자마자 얼굴 화끈거림과 간지러움이 느껴졌어요. 그리고 다...,skinfood


In [136]:
# 사용자당 구매한 제품 수
product_user_counts = df.groupby('id')['제품명'].nunique()
product_user_counts

id
00seoyeo****    1
010****         1
01068892****    1
012na****       1
017****         2
               ..
힌동이             1
힌티              1
힐라리아점           2
힐링하기            1
힝구힝구            1
Name: 제품명, Length: 4189, dtype: int64

In [137]:
# 사용자가 구매한 제품수의 분포
user_count_distribution = product_user_counts.value_counts()
user_count_distribution

제품명
1    3829
2     308
3      47
4       4
5       1
Name: count, dtype: int64

In [138]:
# 평점 행렬 생성
rating_matrix = df.pivot_table(values='rating', index='id', columns='제품명', fill_value=0)

In [139]:
rating_matrix

제품명,bioderma2,cosrx,dalba,drg,estilauder,estora,estra2,goodal,numbersin,pisiogel,...,넘버즈인 1번 말끔 순삭 클렌징오일,닥터지 브라이트닝 필링젤,데이지크 섀도우팔레트,마녀공장 퓨어클렌징 오일,바닐라코 클린잇 제로,쏘내추럴 올 데이 메이크업 픽서,아벤느 오 떼르말,에뛰드 컬 픽스 마스카라,정샘물 에센셜 스킨 누더 쿠션,크리니크 치크 팝
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00seoyeo****,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
010****,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01068892****,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
012na****,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
017****,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
힌동이,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
힌티,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
힐라리아점,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
힐링하기,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0


In [140]:
def pearson_similarity(matrix):
    # 사용자 간 유사도 계산
    sim_matrix = np.corrcoef(matrix)
    np.fill_diagonal(sim_matrix, 0)  # 자기 자신과의 유사도는 0으로 설정
    return sim_matrix

In [141]:
similarity_matrix = pearson_similarity(rating_matrix)

In [142]:
print("새로운 유사도 행렬 크기:", similarity_matrix.shape)

새로운 유사도 행렬 크기: (4189, 4189)


In [180]:
# 평점 예측
def predict_rating(user_index, product_index, matrix, similarity):
    # 유저 인덱스와 제품 인덱스가 범위 내에 있는지 확인
    if user_index >= matrix.shape[0] or product_index >= matrix.shape[1]:
        return 0  # 범위를 벗어나는 인덱스에 대해 0을 반환

    user_ratings = matrix[user_index, :]
    sim_scores = similarity[user_index]  # 유사도 점수 수정
    non_zero_indices = user_ratings.nonzero()[0]
    non_zero_ratings = user_ratings[non_zero_indices]
    non_zero_similarities = sim_scores[non_zero_indices]

    if non_zero_similarities.sum() == 0:
        return 0
    return (non_zero_ratings * non_zero_similarities).sum() / non_zero_similarities.sum()

In [144]:
# RMSE 계산
def rmse(actual, predicted):
    return sqrt(mean_squared_error(actual, predicted))

In [145]:
# 실제 평점과 예측 평점 비교
actual = []
predicted = []

for user_index in range(rating_matrix.shape[0]):
    for product_index in range(rating_matrix.shape[1]):
        actual_rating = rating_matrix.iloc[user_index, product_index]
        if actual_rating != 0:  # 실제 평점이 있는 경우에만 비교
            actual.append(actual_rating)
            predicted_rating = predict_rating(user_index, product_index, rating_matrix.values, similarity_matrix)
            predicted.append(predicted_rating)

In [146]:
# RMSE 계산 및 출력
error = rmse(actual, predicted)
print(f'RMSE: {error}')

RMSE: 0.38798160741665655


In [None]:
# MAE 계산 및 출력
mae_error = mean_absolute_error(actual, predicted)
print(f'MAE: {mae_error}')

## 10. 논문(7단계 평점 예측) 베이스 사용자 기반 협업 필터링(User-Based Collaborative Filtering, UBCF) + 감정점수 반영 구축

In [163]:
df_ubcf = df.copy()

### 9.1) 논문(6단계 감정수치를 반영한 평점)


In [189]:
# 9.1) 논문(6단계 감정수치를 반영한 평점)
df_ubcf['sentiment_score'] = df_ubcf['reviews'].apply(lambda x: calculate_sentiment_score(ridge_cv.coef_, feature_names, x))

In [165]:
df_ubcf

Unnamed: 0,id,times,rating,reviews,제품명,sentiment_score
0,인누,2023.12.05,5,용량이 커서 부담없이 쓰기 좋아요. 자극 없이 쓸 수 있어서 좋습니다. 추천합니다.,아벤느 오 떼르말,0.372447
1,얼라이부,2023.12.03,5,대용량이라서 막 쓰기 좋고 무엇보다 향이 없어서 제일 좋아요!,아벤느 오 떼르말,0.177284
2,양송이캐기,2023.12.03,5,겨울이라 스킨로션 전 단계에서 써주고 있습니다쓰기 전 보다 속 건조가 줄어서 좋아오...,아벤느 오 떼르말,-0.106922
3,핫도그7385,2023.12.01,5,좋다는 말 많이 들어서 써봤는데 온천수리 그런지 자극없이 너무 좋아요 양도 많고요...,아벤느 오 떼르말,-0.124151
4,영팡,2023.11.30,5,세수하고 나오면 제일 먼저 뿌리는 미스트에요대용량 너무 편하고 좋아요,아벤느 오 떼르말,0.068373
...,...,...,...,...,...,...
5393,mj0****,2021.05.10,1,매장 직원분이 증정품 들어있는 거라길래 샀는데 안 들어있음 ㅋㅋㅋ 그럼 굳이 같은 ...,skinfood,-0.528820
5394,ciel****,2021.12.14,1,효과도 보기전에 향때문에 속이 안좋아지네요 향에 민감한편 아닌데도 저랑은 정말 안...,skinfood,0.000000
5395,kd****,2021.09.16,1,패드가 너무 두꺼워서 얆게 착 붙어있는 느낌이 아니라 뭔가 얹어놓은것 같은 느낌이에...,skinfood,0.000000
5396,thwldus****,2021.08.01,1,사실 기대하고 구매했는데 쓰자마자 얼굴 화끈거림과 간지러움이 느껴졌어요. 그리고 다...,skinfood,-0.492532


In [166]:
# 평점과 감정 점수 결합 (가중치는 예시로 1을 사용)
df_ubcf['adjusted_rating'] = df_ubcf['rating'] + (df_ubcf['sentiment_score'] * 1)

In [167]:
df_ubcf

Unnamed: 0,id,times,rating,reviews,제품명,sentiment_score,adjusted_rating
0,인누,2023.12.05,5,용량이 커서 부담없이 쓰기 좋아요. 자극 없이 쓸 수 있어서 좋습니다. 추천합니다.,아벤느 오 떼르말,0.372447,5.037245
1,얼라이부,2023.12.03,5,대용량이라서 막 쓰기 좋고 무엇보다 향이 없어서 제일 좋아요!,아벤느 오 떼르말,0.177284,5.017728
2,양송이캐기,2023.12.03,5,겨울이라 스킨로션 전 단계에서 써주고 있습니다쓰기 전 보다 속 건조가 줄어서 좋아오...,아벤느 오 떼르말,-0.106922,4.989308
3,핫도그7385,2023.12.01,5,좋다는 말 많이 들어서 써봤는데 온천수리 그런지 자극없이 너무 좋아요 양도 많고요...,아벤느 오 떼르말,-0.124151,4.987585
4,영팡,2023.11.30,5,세수하고 나오면 제일 먼저 뿌리는 미스트에요대용량 너무 편하고 좋아요,아벤느 오 떼르말,0.068373,5.006837
...,...,...,...,...,...,...,...
5393,mj0****,2021.05.10,1,매장 직원분이 증정품 들어있는 거라길래 샀는데 안 들어있음 ㅋㅋㅋ 그럼 굳이 같은 ...,skinfood,-0.528820,0.947118
5394,ciel****,2021.12.14,1,효과도 보기전에 향때문에 속이 안좋아지네요 향에 민감한편 아닌데도 저랑은 정말 안...,skinfood,0.000000,1.000000
5395,kd****,2021.09.16,1,패드가 너무 두꺼워서 얆게 착 붙어있는 느낌이 아니라 뭔가 얹어놓은것 같은 느낌이에...,skinfood,0.000000,1.000000
5396,thwldus****,2021.08.01,1,사실 기대하고 구매했는데 쓰자마자 얼굴 화끈거림과 간지러움이 느껴졌어요. 그리고 다...,skinfood,-0.492532,0.950747


In [168]:
# 새로운 평점을 사용하여 평점 행렬 생성
adjusted_rating_matrix = df_ubcf.pivot_table(values='adjusted_rating', index='id', columns='제품명', fill_value=0)

In [169]:
adjusted_similarity_matrix = pearson_similarity(adjusted_rating_matrix)

In [170]:
# 실제 평점과 예측 평점 비교
adjusted_actual = []
adjusted_predicted = []

for user_index in range(adjusted_rating_matrix.shape[0]):
    for product_index in range(adjusted_rating_matrix.shape[1]):
        actual_rating = adjusted_rating_matrix.iloc[user_index, product_index]
        if actual_rating != 0:  # 실제 평점이 있는 경우에만 비교
            adjusted_actual.append(actual_rating)
            predicted_rating = predict_rating(user_index, product_index, adjusted_rating_matrix.values, adjusted_similarity_matrix)
            adjusted_predicted.append(predicted_rating)

In [175]:
# RMSE 계산 및 출력
error = rmse(actual, predicted)
# MAE 계산 및 출력
mae_error = mean_absolute_error(actual, predicted)

print(f'사용자 기반 협업필터링 모델 RMSE: {error}')
print(f'사용자 기반 협업필터링 모델 MAE: {mae_error}')

사용자 기반 협업필터링 모델 RMSE: 0.38798160741665655
사용자 기반 협업필터링 모델 MAE: 0.06296729431690172


In [176]:
# RMSE 계산 및 출력
error = rmse(adjusted_actual, adjusted_predicted)
# MAE 계산 및 출력
mae_error = mean_absolute_error(adjusted_actual, adjusted_predicted)

print(f'사용자 기반 협업필터링 모델 + 감정점수 반영 RMSE: {error}')
print(f'사용자 기반 협업필터링 모델 + 감정점수 반영 MAE: {mae_error}')

사용자 기반 협업필터링 모델 + 감정점수 반영 RMSE: 0.38970550970460177
사용자 기반 협업필터링 모델 + 감정점수 반영 MAE: 0.06536649098492607


## 11. 논문(7단계 평점 예측) 베이스 아이템 기반 협업 필터링 (Item-based Collaborative Filtering, IBCF) 구축

In [190]:
df_ibcf = df.copy()

In [191]:
# 아이템 기반 평점 행렬 생성
item_user_matrix = df_ibcf.pivot_table(values='rating', index='제품명', columns='id', fill_value=0)

In [192]:
item_user_matrix

id,00seoyeo****,010****,01068892****,012na****,017****,01KJY,07서연,081****,0p4r****,0yoong****,...,히희히히,히히님,히히랠ㄹ려,히히힣히이,히힣,힌동이,힌티,힐라리아점,힐링하기,힝구힝구
제품명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bioderma2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cosrx,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
dalba,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
drg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
estilauder,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
estora,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
estra2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
goodal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
numbersin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pisiogel,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [194]:
# 아이템 간 코사인 유사도 계산
item_similarity = cosine_similarity(item_user_matrix)
item_similarity_df = pd.DataFrame(item_similarity, index=item_user_matrix.index, columns=item_user_matrix.index)

In [200]:
# 평점 예측 함수
def predict_ratings_ibcf(user_id, item_similarity, item_user_matrix):
    user_ratings = item_user_matrix.loc[:, user_id]
    prediction = pd.Series(0, index=item_user_matrix.index)

    for item, rating in user_ratings.items():
        prediction += item_similarity[item] * rating  # 유사도와 사용자 평점을 곱하여 합산
    
    prediction /= item_similarity.sum(axis=1)  # 정규화
    return prediction

In [201]:
# 모든 사용자에 대한 평점 예측
predicted_ratings = pd.DataFrame(index=item_user_matrix.index, columns=item_user_matrix.columns)
for user_id in item_user_matrix.columns:
    predicted_ratings[user_id] = predict_ratings_ibcf(user_id, item_similarity_df, item_user_matrix)

In [202]:
# RMSE 및 MAE 계산
actual = item_user_matrix.values.flatten()
predicted = predicted_ratings.values.flatten()

rmse = sqrt(mean_squared_error(actual, predicted))
mae = mean_absolute_error(actual, predicted)

print(f'RMSE: {rmse}')
print(f'MAE: {mae}')

RMSE: 0.1848710252299818
MAE: 0.0659201732992044


## 11. 논문(7단계 평점 예측) 베이스 아이템 기반 협업 필터링 (Item-based Collaborative Filtering, IBCF) + 감정점수 반영 구축

In [222]:
df_ibcf_sentiment = df.copy()

In [223]:
# 9.1) 논문(6단계 감정수치를 반영한 평점)
df_ibcf_sentiment['sentiment_score'] = df_ibcf_sentiment['reviews'].apply(lambda x: calculate_sentiment_score(ridge_cv.coef_, feature_names, x))

In [224]:
# 평점과 감정 점수 결합 (가중치는 예시로 1을 사용)
df_ibcf_sentiment['adjusted_rating'] = df_ibcf_sentiment['rating'] + (df_ibcf_sentiment['sentiment_score'] * 1)

In [225]:
# 아이템 기반 평점 행렬 생성
adjusted_item_user_matrix = df_ibcf_sentiment.pivot_table(values='adjusted_rating', index='제품명', columns='id', fill_value=0)

In [226]:
adjusted_item_user_matrix

id,00seoyeo****,010****,01068892****,012na****,017****,01KJY,07서연,081****,0p4r****,0yoong****,...,히희히히,히히님,히히랠ㄹ려,히히힣히이,히힣,힌동이,힌티,힐라리아점,힐링하기,힝구힝구
제품명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bioderma2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cosrx,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.426049
dalba,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
drg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
estilauder,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
estora,0.0,1.05072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
estra2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
goodal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.355645,0.0,0.0,0.0
numbersin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,6.930403,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pisiogel,0.0,0.0,0.0,0.0,5.0,5.050255,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.949918,0.0,0.0,0.0,0.0,0.0


In [227]:
# 아이템 간 코사인 유사도 계산
adjusted_item_similarity = cosine_similarity(adjusted_item_user_matrix)
adjusted_item_similarity_df = pd.DataFrame(item_similarity, index=item_user_matrix.index, columns=item_user_matrix.index)

In [228]:
# 평점 예측 함수
def predict_ratings_ibcf(user_id, item_similarity, item_user_matrix):
    user_ratings = item_user_matrix.loc[:, user_id]
    prediction = pd.Series(0, index=item_user_matrix.index)

    for item, rating in user_ratings.items():
        prediction += item_similarity[item] * rating  # 유사도와 사용자 평점을 곱하여 합산
    
    prediction /= item_similarity.sum(axis=1)  # 정규화
    return prediction

In [230]:
# 모든 사용자에 대한 평점 예측
adjusted_predicted_ratings = pd.DataFrame(index=adusted_item_user_matrix.index, columns=adjusted_item_user_matrix.columns)
for user_id in adusted_item_user_matrix.columns:
    adjusted_predicted_ratings[user_id] = predict_ratings_ibcf(user_id, adjusted_item_similarity_df, adjusted_item_user_matrix)

In [232]:
# RMSE 및 MAE 계산
adjusted_actual = adusted_item_user_matrix.values.flatten()
adjusted_predicted = adjusted_predicted_ratings.values.flatten()

adjusted_rmse = sqrt(mean_squared_error(adjusted_actual, adjusted_predicted))
adjusted_mae = mean_absolute_error(adjusted_actual, adjusted_predicted)

print(f'RMSE: {adjusted_rmse}')
print(f'MAE: {adjusted_mae}')

RMSE: 0.19431834672873172
MAE: 0.06881527868481936
