In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('../data/ALL_DATA_1.csv',encoding = 'cp949')
df2 = pd.DataFrame(df.groupby(['닉네임','제품명'])['평점'].mean())
df2 = df2.reset_index(inplace=False,drop=False)

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
import warnings

warnings.filterwarnings('ignore')

# n개 이상의 데이터 리뷰를 남긴 유저, 제품을 추출하는 함수
def preprocessing(data, n):
    min_id = data['닉네임'].value_counts() >= n
    min_id = min_id[min_id].index.to_list()
    data = data[data['닉네임'].isin(min_id)]

    min_beer = data['제품명'].value_counts() >= n
    min_beer = min_beer[min_beer].index.to_list()
    data = data[data['제품명'].isin(min_beer)]

    return data

data = pd.DataFrame()

# 10번 반복합니다.

print("5개 이상의 제품을 구매한 사람의 데이터만 뽑아오기")
print("사용할 수 있는 데이터 개수 (알고리즘에 사용되는 데이터 개수)")
for i in range(1,10):
    data = preprocessing(df2, 5)
    print(data.shape)

5개 이상의 제품을 구매한 사람의 데이터만 뽑아오기
사용할 수 있는 데이터 개수 (알고리즘에 사용되는 데이터 개수)
(3564, 3)
(3564, 3)
(3564, 3)
(3564, 3)
(3564, 3)
(3564, 3)
(3564, 3)
(3564, 3)
(3564, 3)


###  제품명 컬럼에 공백 제거

In [13]:
data['제품명'] = data['제품명'].str.strip()

### 제품명, 닉네임 간편하게 처리하기 위해 수치 데이터로 인코딩

In [14]:
from sklearn.preprocessing import LabelEncoder
user_Encoder = LabelEncoder()
product_Encoder = LabelEncoder()
user_Encoder.fit(data['닉네임'])
product_Encoder.fit(data['제품명'])

LabelEncoder()

### 인코딩한 데이터들 새로운 컬럼에 담아주기

In [15]:
data['user_id'] = user_Encoder.transform(data['닉네임'])
data['product_id'] = product_Encoder.transform(data['제품명'])

data.head()

Unnamed: 0,닉네임,제품명,평점,user_id,product_id
19,0103****,레츠토이 트럭 잠수함 캐리어 미니카 자동차 장난감 세트,5.0,0,13
20,0103****,스모네오 레고듀플로호환 통큰블록 블럭장난감,4.0,0,24
21,0103****,아이와 트럭모래놀이세트 다양하게 즐기는 장난감 대형구성 야외흙놀이 캠핑놀이 키즈,5.0,0,32
22,0103****,움직이는 강아지 소리나는 장난감 인형,4.5,0,33
23,0103****,캐리와봉봉 장난감 꼬마버스 타요 포코 패트 스피드 빌리 미니카,5.0,0,38


### 필요없는 데이터 삭제

### rating_df(평점 데이터)
###  - 평점,아이디,제품명만 있으면 됨

### product_df(제품 데이터)
###  - 중복 제거한 모든 제품명, 수치로 새롭게 만든 제품ID

In [16]:
rating_df = data.copy()
# del rating_df['닉네임']
del rating_df['제품명']
# del rating_df['product_id']

rating_df.head()

Unnamed: 0,닉네임,평점,user_id,product_id
19,0103****,5.0,0,13
20,0103****,4.0,0,24
21,0103****,5.0,0,32
22,0103****,4.5,0,33
23,0103****,5.0,0,38


In [17]:
product_df = data.drop_duplicates(['제품명'])
del product_df['닉네임']
del product_df['user_id']
del product_df['평점']
product_df.head()

Unnamed: 0,제품명,product_id
19,레츠토이 트럭 잠수함 캐리어 미니카 자동차 장난감 세트,13
20,스모네오 레고듀플로호환 통큰블록 블럭장난감,24
21,아이와 트럭모래놀이세트 다양하게 즐기는 장난감 대형구성 야외흙놀이 캠핑놀이 키즈,32
22,움직이는 강아지 소리나는 장난감 인형,33
23,캐리와봉봉 장난감 꼬마버스 타요 포코 패트 스피드 빌리 미니카,38


In [18]:
product_df.to_excel("Product_name.xlsx",index=None)
rating_df.to_excel("User_rating.xlsx",index=None)

### 피벗 테이블 만들기
### SVD를 사용하여 Matrix Factorization을 진행하기 위함


#### Matrix Factorization 정보
#### https://greeksharifa.github.io/machine_learning/2019/12/20/Matrix-Factorization/

In [19]:
df_user_product_ratings = rating_df.pivot(
    index = 'user_id',
    columns='product_id',
    values = '평점').fillna(0)

In [20]:
df_user_product_ratings.head()

product_id,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
4,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# matrix는 pivot_table 값을 numpy matrix로 만든 것
matrix = df_user_product_ratings.values

# user_ratings_mean은 사용자의 평균 평점
user_ratings_mean = np.mean(matrix, axis = 1)

# R_user_mean : 사용자 - 제품에 대해 사용자 평균 평점을 뺀 것
matrix_user_mean = matrix - user_ratings_mean.reshape(-1,1)


In [22]:
pd.DataFrame(matrix_user_mean, columns = df_user_product_ratings.columns)

product_id,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,-0.550000,-0.550000,-0.550000,-0.550000,-0.550000,-0.550000,-0.550000,-0.550000,-0.550000,-0.550,...,3.450000,-0.550000,-0.550000,-0.550000,-0.550000,-0.550000,-0.550000,-0.550000,-0.550000,-0.550000
1,-1.150000,-1.150000,-1.150000,-1.150000,3.850000,3.850000,-1.150000,-1.150000,-1.150000,-1.150,...,-1.150000,3.850000,-1.150000,3.850000,-1.150000,-1.150000,-1.150000,-1.150000,-1.150000,-1.150000
2,-0.792000,-0.792000,-0.792000,-0.792000,3.208000,-0.792000,-0.792000,-0.792000,-0.792000,-0.792,...,-0.792000,-0.792000,-0.792000,4.208000,-0.792000,-0.792000,-0.792000,-0.792000,-0.792000,-0.792000
3,-0.420000,-0.420000,-0.420000,-0.420000,4.580000,-0.420000,-0.420000,-0.420000,-0.420000,-0.420,...,-0.420000,-0.420000,-0.420000,-0.420000,-0.420000,-0.420000,-0.420000,-0.420000,-0.420000,4.580000
4,-0.650000,-0.650000,-0.650000,-0.650000,4.350000,-0.650000,-0.650000,-0.650000,-0.650000,-0.650,...,-0.650000,-0.650000,-0.650000,-0.650000,-0.650000,-0.650000,-0.650000,-0.650000,-0.650000,-0.650000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,-0.755000,-0.755000,-0.755000,-0.755000,4.245000,-0.755000,-0.755000,-0.755000,-0.755000,-0.755,...,-0.755000,-0.755000,-0.755000,3.995000,-0.755000,-0.755000,-0.755000,-0.755000,-0.755000,-0.755000
438,-0.480000,3.520000,-0.480000,-0.480000,-0.480000,-0.480000,-0.480000,-0.480000,-0.480000,-0.480,...,-0.480000,-0.480000,-0.480000,-0.480000,-0.480000,-0.480000,-0.480000,-0.480000,-0.480000,-0.480000
439,-0.480000,-0.480000,-0.480000,-0.480000,4.520000,-0.480000,-0.480000,-0.480000,-0.480000,-0.480,...,-0.480000,-0.480000,-0.480000,-0.480000,-0.480000,-0.480000,-0.480000,-0.480000,4.520000,-0.480000
440,-0.590000,-0.590000,-0.590000,4.410000,4.410000,-0.590000,-0.590000,-0.590000,-0.590000,-0.590,...,-0.590000,-0.590000,-0.590000,-0.590000,-0.590000,-0.590000,-0.590000,-0.590000,-0.590000,-0.590000


In [23]:
from scipy.sparse.linalg import svds, eigs

# scipy에서 제공해주는 svd
# U 행렬, sigma 행렬, V 전치 행렬 반환

U, sigma, Vt = svds(matrix_user_mean, k = 3)

print(U.shape)
print(sigma.shape)
print(Vt.shape)

(442, 3)
(3,)
(3, 50)


In [24]:
sigma = np.diag(sigma)

sigma.shape

(3, 3)

In [25]:
sigma[1]

array([ 0.        , 51.83310196,  0.        ])

In [26]:
svd_user_predicted_ratings = (np.dot(np.dot(U,sigma),Vt) + 
                            user_ratings_mean.reshape(-1,1))

In [27]:
df_svd_preds = pd.DataFrame(svd_user_predicted_ratings, 
            columns = df_user_product_ratings.columns)
df_svd_preds.head()

product_id,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0.228684,0.272839,0.157472,0.506121,1.086095,0.407669,0.548088,0.347058,0.286772,1.327936,...,0.60627,0.233629,0.417442,0.790186,0.255661,0.192804,0.227271,0.214006,0.379872,0.40865
1,0.114358,0.599328,0.187658,0.923049,3.696974,0.885956,1.046346,0.551959,0.483713,0.677,...,1.558442,0.314824,0.71593,1.804169,0.447057,0.2179,0.332294,0.295977,0.642386,3.337806
2,-0.174972,-0.009273,-0.164103,0.349154,1.671851,0.546485,0.786119,0.161432,-0.15044,1.048295,...,1.611261,-0.039738,0.45113,1.994968,0.038416,-0.101406,0.051905,-0.022192,0.188566,1.635084
3,-0.215861,0.307025,-0.252231,0.656896,3.682184,0.151205,0.279144,0.125667,0.370256,0.7428,...,-0.077706,-0.118273,0.065121,0.040252,-0.005417,-0.237711,-0.23571,-0.204057,0.271353,2.049134
4,-0.178006,0.265059,-0.325592,0.848226,3.87436,0.256859,0.539408,0.213683,0.3648,2.13037,...,0.19639,-0.121452,0.225521,0.51435,-0.013629,-0.268601,-0.240671,-0.223425,0.37472,1.505801


In [28]:
df_svd_preds.to_excel('prediction_df.xlsx',index=None)

#### 추천 해주는 기능 모아서 함수 만들기

In [29]:
def recommend_product(df_svd_preds, user_id, ori_product_df, ori_ratings_df, 
                     num_recommendations = 3) :
    
    # 현재는 index로 적용이 되어 있으므로 user_id - 1을 해야함
    user_row_number = user_id -1
    
    # 최종적으로 만든 pred_df에서 사용자 index에 따라 제품 데이터 정렬
    # >> 제품 평점이 높은 순으로 정렬된다.
    sorted_user_predictions = df_svd_preds.iloc[user_row_number].sort_values(ascending=False)
    print(sorted_user_predictions.head())
    # 원본 평점 데이터에서 user id에 해당하는 데이터를 뽑아낸다.
    user_data = ori_ratings_df[ori_ratings_df.user_id == user_id]
    
    # 위에서 뽑은 user data와 원본 제품 데이터를 합친다.
    user_history = user_data.merge(ori_product_df, on = 'product_id').sort_values(['평점'],ascending=False)
    
    # 원본 제품 데이터에서 사용자가 본 제품 데이터를 제외한 데이터를 추출
    recommendations = ori_product_df[~ori_product_df['product_id'].isin(user_history['product_id'])]
    
    # 사용자의 제품 평점이 높은 순으로 정렬된 데이터와 위 recommendations을 합친다.
    
    recommendations = recommendations.merge(pd.DataFrame(sorted_user_predictions).reset_index(),on='product_id')
    recommendations = recommendations.drop_duplicates(['product_id'])
    # 컬럼 이름 바꾸고 정렬해서 return
    recommendations = recommendations.rename(columns = {user_row_number: 'Predictions'}).sort_values('Predictions', ascending = False).iloc[:num_recommendations, :]
    
    return user_history, recommendations
    

# 사용자가 구매한 제품, 사용자에게 추천할 제품 뽑아주는 함수

# 입력값  총 5개
## 1. 모든 유저 아이디, 제품명, 평점  , 이 것을
## Matrix Factorization행렬로 변환 한 거
## 2. 총 제품명 정보 (유니크하게)
## 3. 총 평점 정보 (평점,사용자id,제품명)
## 4. 추천할 제품 개수


In [34]:
already_rated , predictions = recommend_product(df_svd_preds,46,product_df,rating_df,10)

product_id
15    1.528531
24    1.339490
4     1.012462
9     0.801961
13    0.763232
Name: 45, dtype: float64


# 출력 1 - 해당 사용자가 구매한 제품 

In [35]:
already_rated 

Unnamed: 0,닉네임,평점,user_id,product_id,제품명
0,cher****,5.0,46,4,국민 아기 텐트 인디언 유아 어린이 아이 놀이 플레이 하우스 장난감집 아기집
1,cher****,5.0,46,7,나혼자산다 LED 비행기 장난감 360도 회전 부메랑 비행기
6,cher****,5.0,46,24,스모네오 레고듀플로호환 통큰블록 블럭장난감
8,cher****,5.0,46,32,아이와 트럭모래놀이세트 다양하게 즐기는 장난감 대형구성 야외흙놀이 캠핑놀이 키즈
9,cher****,5.0,46,39,크리에이트 로드 레일카 4종 공룡 소방 엘레베이터 중장비 트랙 미니카 장난감
10,cher****,5.0,46,44,탑브라이트 원목 자석낚시놀이 숫자학습 4살 5살 선물 유아 아기 낚시장난감
11,cher****,5.0,46,49,헬로카봇 스타피너 헬로카봇 아이언트 비트런 로드세이버 차탄이네윷놀이 펭토킹 호크블...
3,cher****,4.727273,46,15,말하는 따라쟁이 앵무새 인형 말따라하는 장난감
2,cher****,4.666667,46,12,디폼 블럭 8mm 세트 패키지 도안 소근육 발달 놀이 조립 캐릭터 어린이 유아 만들...
4,cher****,4.666667,46,16,"메이크잇업 13종 디럭스, 유아 화장놀이 세트, 어린이 화장품 장난감"


# 출력 2
# 사용자와 가장 비슷한 유형의 유저들이 산 제품 추천 목록 5개

In [36]:
predictions

Unnamed: 0,제품명,product_id,Predictions
21,너프건 19종 다트 리필 4종 모음 해즈브로 오버워치 엘리트 메가 좀비스트라이크 저격총,9,0.801961
0,레츠토이 트럭 잠수함 캐리어 미니카 자동차 장난감 세트,13,0.763232
13,아기 유아 아동 리얼 미니 쇼핑 카트 장난감,30,0.620055
6,봉봉몬스터 물나오는 싱크대 아이와 주방놀이 설거지 장난감,21,0.580986
1,움직이는 강아지 소리나는 장난감 인형,33,0.520205
17,고고다이노 장난감 렉스 비키 토모 핑 로키 스톰 안키 스피노 케루 플레오 페리 포키...,3,0.432488
8,잘나가는 장난감총 전동건 너프건 총알호환 모음전,35,0.429769
15,붐키즈 원목낚시놀이 아이들의 최애템 장난감 완구 교구 돌 아기 유아 가정보육 집콕놀이,22,0.427005
10,타요 중장비 장난감 자동차 맥스 크리스 포코 패트 미니카,43,0.425114
12,신비아파트 고스트볼 더블x 장난감 피규어 피닉스활검 강림이칼 사토룡 장산범 수상한 의뢰,29,0.405203
