In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
# 확률적 경사하강법을 이용한 행렬 분해

#경사하강법 = '점진적으로'반복적인 계산을 통해 \파라미터 값을 업데이트하면서 오류값이 최소가 되는 \파라미터 구하는 방식
#' 데이러를 기반으로 알고리즘이 스스로학습한다',는 머신러닝의 개념을 가능하게해준 핵심기법

def get_rmse(R, P, Q, non_zeros):             # R= 실제행렬값   p,q = 분해된 행렬값(?)  # get_rms = 실제행렬과 예측 행렬의오차를구하는 함수
    error = 0
    # 두개의 분해된 행렬 P와 Q.T의 내적 곱으로 예측 R 행렬 생성
    full_pred_matrix = np.dot(P,Q.T)                                # P와 Q.T를 dot(내적) = 예측 R행렬 
    
    # 실제 R 행렬에서 NULL이 아닌 값의 위치 인덱스 추출하여 실제 R 행렬과 예측 행렬의 RMSE 추출
    x_non_zero_ind = [non_zero[0] for non_zero in non_zeros]   # ( non_zeros안에  non_zero를 0을 제외하고 반복적으로 수행)
    y_non_zero_ind = [non_zero[1] for non_zero in non_zeros]   #(non_zeros안에 non_zero를  0부터 추출한다(?))
    R_non_zeros = R[x_non_zero_ind, y_non_zero_ind]
    
    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind,y_non_zero_ind] # 예측 R행렬 full_pred_matrix에 (x,y)을 넣고
     
    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros) # mse = mse(실제행렬, 예측행렬)
    rmse = np.sqrt(mse)    # rmse 는 mse에 루트
    
    return rmse

In [3]:
# 행렬 분해
# 책에서 이미 설정해놓은것  R  = 사용자-아이템 평점 행렬 k = 잠재요인  learning_rate=학습률 r_lambda = L2규제계수
# matrix_factorization = 행렬 분해 로직 

def matrix_factorization(R, K, steps=100, learning_rate=0.01, r_lambda=0.01):            
    num_users, num_items = R.shape                         # 사용자-아이템 평점 행렬의 컬럼이 num_users, num_items 다(?)
    np.random.seed(1)
    P = np.random.normal(scale=1./K, size = (num_users,K))   #P = 정규분포 (scale=1./K, size = (num_users,K)
    Q = np.random.normal(scale=1./K, size = (num_items,K))    #Q = 정규분포 (scale=1./K, size = (num_users,K)
    
    prev_rmse=10000                   # 설명을 할수가없다 (그냥 대입이라서)
    break_count = 0
    
    # R > 0인 행 위치, 열 위치, 값을 non_zeros 리스트 객체에 저장
    non_zeros = [(i,j,R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] > 0]   # 
    
    # SGD 기법으로 P와 Q 매트릭스를 계속 업데이트   SGD(경사하강법)
    for step in range(steps):  # steps는 SGD의 반복횟수      
        for i, j, r in non_zeros:                                       # 
            # 실제 값과 예측 값의 차이인 오류 값 구함
            eij = r - np.dot(P[i,:],Q[j,:].T)
            # Regularization을 반영한 SGD 업데이트 공식 적용
            P[i,:] = P[i,:] + learning_rate*(eij * Q[j,:] - r_lambda*P[i,:])
            Q[j,:] = Q[j,:] + learning_rate*(eij * P[i,:] - r_lambda*Q[j,:])
            
        rmse = get_rmse(R,P,Q, non_zeros)
        if ( step % 10) == 0: # 10회 반복할 때마다 오류 값 출력
            print(f'iteration step: {step}, rmse: {rmse}')
    return P, Q

In [6]:
q= pd.read_csv('test21.csv',encoding='cp949')

q = q.set_index('고객번호')
q

Unnamed: 0_level_0,14K,2단우산,3단우산,3분요리류,4대 B/D,5 ON THE GO,ACC Bloom (1F),ACC Bloom (3F),AK골프,ANDZ,...,휴대폰기타용품,휴모니아,휴지류,휴지통,흑미,흑미류,흰다리새우,히터,히터기,힐앤토트
고객번호,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19379,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19380,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19382,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [13]:
q1 = q.copy()
q1

Unnamed: 0_level_0,14K,2단우산,3단우산,3분요리류,4대 B/D,5 ON THE GO,ACC Bloom (1F),ACC Bloom (3F),AK골프,ANDZ,...,휴대폰기타용품,휴모니아,휴지류,휴지통,흑미,흑미류,흰다리새우,히터,히터기,힐앤토트
고객번호,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19379,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19380,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19382,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [7]:
q.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19383 entries, 1 to 19383
Columns: 3520 entries, 14K to 힐앤토트
dtypes: float64(3520)
memory usage: 520.7 MB


In [6]:
q.values

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [8]:
q.count() #고객수  물품  횟수

14K       19383
2단우산      19383
3단우산      19383
3분요리류     19383
4대 B/D    19383
          ...  
흑미류       19383
흰다리새우     19383
히터        19383
히터기       19383
힐앤토트      19383
Length: 3520, dtype: int64

In [22]:
q.dtypes

14K       float64
2단우산      float64
3단우산      float64
3분요리류     float64
4대 B/D    float64
           ...   
흑미류       float64
흰다리새우     float64
히터        float64
히터기       float64
힐앤토트      float64
Length: 3520, dtype: object

In [8]:
q = q.astype('int64')
q.dtypes

14K       int64
2단우산      int64
3단우산      int64
3분요리류     int64
4대 B/D    int64
          ...  
흑미류       int64
흰다리새우     int64
히터        int64
히터기       int64
힐앤토트      int64
Length: 3520, dtype: object

In [11]:
q =q.replace([np.inf, -np.inf], 0)
q

Unnamed: 0_level_0,14K,2단우산,3단우산,3분요리류,4대 B/D,5 ON THE GO,ACC Bloom (1F),ACC Bloom (3F),AK골프,ANDZ,...,휴대폰기타용품,휴모니아,휴지류,휴지통,흑미,흑미류,흰다리새우,히터,히터기,힐앤토트
고객번호,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19379,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19380,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19381,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19382,0,0,0,5,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [9]:
q = q.fillna(0)
q

Unnamed: 0_level_0,14K,2단우산,3단우산,3분요리류,4대 B/D,5 ON THE GO,ACC Bloom (1F),ACC Bloom (3F),AK골프,ANDZ,...,휴대폰기타용품,휴모니아,휴지류,휴지통,흑미,흑미류,흰다리새우,히터,히터기,힐앤토트
고객번호,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19379,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19380,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19381,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19382,0,0,0,5,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [12]:
P, Q = matrix_factorization(q.values, K=50, steps=100, learning_rate=0.01,
                           r_lambda = 0.01)
q_matrix = np.dot(P,Q.T)

  Q[j,:] = Q[j,:] + learning_rate*(eij * P[i,:] - r_lambda*Q[j,:])
  P[i,:] = P[i,:] + learning_rate*(eij * Q[j,:] - r_lambda*P[i,:])
  P[i,:] = P[i,:] + learning_rate*(eij * Q[j,:] - r_lambda*P[i,:])
  Q[j,:] = Q[j,:] + learning_rate*(eij * P[i,:] - r_lambda*Q[j,:])


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
q_matrix = pd.DataFrame(data=q_matrix, index=q_matrix.index,
                                  columns = q_matrix.columns)
q_matrix.head(3)

In [None]:
# 사용자가 관람하지 않은 영화
unseen_list = get_unseen_movies(q,9)

# 잠재 요인 협업 필터링
recomm_movies = recomm_movie_by_userid(q_matrix, 9, unseen_list, top_n=10)

# 평점 데이터
recomm_movies = pd.DataFrame(data=recomm_movies.values, index=recomm_movies.index,
                            columns=['pred_score'])
recomm_movies