In [None]:
# Build a recommender system from the events.csv file
# - Includes designing a scoring scheme based on the event value (implementer's choice)
# - Use an SVD or KNN-family model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from surprise import Dataset, Reader

In [2]:
# Load the raw event log; the path is relative to the notebook's working directory.
events = pd.read_csv('data-files/events.csv')

In [3]:
# Summarize the loaded frame: row count, column names/dtypes, and memory usage.
events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2756101 entries, 0 to 2756100
Data columns (total 5 columns):
 #   Column         Dtype  
---  ------         -----  
 0   timestamp      int64  
 1   visitorid      int64  
 2   event          object 
 3   itemid         int64  
 4   transactionid  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 105.1+ MB


In [5]:
# Sanity-check the event column: print the number of missing values,
# then display how often each event type occurs.
event_col = events['event']
print(event_col.isna().sum())
event_col.value_counts()

0


event
view           2664312
addtocart        69332
transaction      22457
Name: count, dtype: int64

In [6]:
# Scoring scheme: a stronger purchase signal gets a higher implicit rating.
EVENT_SCORES = {'view': 1, 'addtocart': 3, 'transaction': 5}

# A dict-based map avoids calling a Python lambda once per row.
# Event values missing from the dict map to NaN, so fill them with 0 (the
# original fallback score) and restore the integer dtype.
events['event_score'] = (
    events['event']
    .map(EVENT_SCORES)
    .fillna(0)
    .astype('int64')
)

In [7]:
# Show how many rows received each score.
events['event_score'].value_counts()

event_score
1    2664312
3      69332
5      22457
Name: count, dtype: int64

In [9]:
# Count events per (visitor, item) pair, then report how many pairs
# have more than one event recorded.
dup_check_data = events.groupby(by=["visitorid", "itemid"])['event'].count()
dup_check_data.gt(1).sum()

333038

In [10]:
# Collapse repeated interactions: one summed score per (visitor, item) pair.
grouped_events = (
    events
    .groupby(by=["visitorid", "itemid"])['event_score']
    .agg('sum')
)
grouped_events.head(5)

visitorid  itemid
0          67045     1
           285930    1
           357564    1
1          72028     1
2          216305    2
Name: event_score, dtype: int64

In [11]:
# Turn the (visitorid, itemid)-indexed Series into a flat three-column frame.
grouped_events2 = grouped_events.to_frame().reset_index()
grouped_events2.head(5)

Unnamed: 0,visitorid,itemid,event_score
0,0,67045,1
1,0,285930,1
2,0,357564,1
3,1,72028,1
4,2,216305,2


In [13]:
# Print summary statistics of the aggregated score, followed by its maximum.
print(
    grouped_events2['event_score'].describe(),
    grouped_events2['event_score'].max(),
    sep='\n',
)

count    2.145179e+06
mean     1.391303e+00
std      1.639133e+00
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      3.080000e+02
Name: event_score, dtype: float64
308


In [20]:
# Restrict the aggregated score to the range 1-9 by capping large sums.
MAX_SCORE = 9

# Number of pairs above the cap. Printed explicitly because only a cell's
# last expression is displayed; re-running this cell after clipping prints 0.
print((grouped_events2['event_score'] > MAX_SCORE).sum())

# Vectorized cap: values above MAX_SCORE become MAX_SCORE, others are unchanged.
grouped_events2['event_score'] = grouped_events2['event_score'].clip(upper=MAX_SCORE)

grouped_events2['event_score'].min(), grouped_events2['event_score'].max()

(1, 9)

In [22]:
# Draw a 10% sample of the data

# Print the index of grouped_events2 to see the row labels available for sampling.
print( grouped_events2.index )

# Unused alternative left for reference:
# import random
# random.randint(0, 2145179) x 2145179 iterations

RangeIndex(start=0, stop=2145179, step=1)


In [26]:
# Keep only 10% of the data while preserving the event_score distribution:
# a 10-fold stratified split produces test folds of about 10% each, so the
# test indices of the first fold are used as the sample.
skfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
_train_idx, idx_to_select = next(
    skfold.split(grouped_events2, grouped_events2['event_score'])
)

# split() yields positional indices, so select rows by position (.iloc)
# instead of relying on the index labels happening to equal positions.
ratings = grouped_events2.iloc[idx_to_select].copy()

In [29]:
# Inspect the sampled frame: size, dtypes and memory usage, then the
# per-column maxima (the largest value each column has to hold).
ratings.info()
ratings.max()

<class 'pandas.core.frame.DataFrame'>
Index: 214518 entries, 7 to 2145178
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   visitorid    214518 non-null  int64
 1   itemid       214518 non-null  int64
 2   event_score  214518 non-null  int64
dtypes: int64(3)
memory usage: 6.5 MB


visitorid      1407579
itemid          466867
event_score          9
dtype: int64

In [None]:
# Reduce memory usage by changing the data type of each attribute (column)

In [None]:
# Wrap the sampled interactions in a surprise Dataset.
# load_from_df expects the columns in (user, item, rating) order, and the
# Reader scale must match the 1-9 range that event_score was capped to.
data = Dataset.load_from_df(
    ratings[["visitorid", "itemid", "event_score"]],
    Reader(rating_scale=(1, 9)),
)