In [1]:
# 패키지 로드
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from collections import defaultdict
import os, random

from scipy import sparse
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
from torch.nn.init import normal_
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F

import plotnine
from plotnine import *

In [8]:
class KModesAlgorithm():

    def __init__(self, k, max_step=5, n_iter=5, random_state=100, init_cci=None):
        self.k = k ## 클러스터 개수
        self.max_step = max_step ## 최대 스텝 수
        self.n_iter = n_iter ## 초기 클러스터 중심 샘플링 횟수
        self.random_state = random_state ## 랜덤 시드
        self.X_categorized = None ## 숫자로 범주화된 데이터
        self.cluster_center = None ## 최종 클러스터 중심
        self.cat_map_dict = None ## 범주를 숫자로 바꾸기 위한 딕셔너리
        self.predicted_cluster = None ## 최종 할당 클러스터
        self.init_cci = init_cci ## 초기 클러스터 인덱스

    def get_mode(self, x):
        return np.bincount(x).argmax()

    def cat_to_num(self, X):
        num_feature = X.shape[1]
        cat_map_dict = dict()
        
        for i in range(num_feature):
            uniq_val = np.unique(X[:,i])
            cat_to_num = dict()
            num_to_cat = dict()
            
            for j, uv in enumerate(uniq_val):
                cat_to_num[uv] = j
                num_to_cat[j] = uv
            cat_map_dict[i] = [cat_to_num, num_to_cat]
        self.cat_map_dict = cat_map_dict

    def dissimilarity(self, x, y):
        return np.sum(x!=y)

    def fit(self, X):
        is_fitting = True
        X = X.copy()
        self.cat_to_num(X)
        k = self.k
        
        ## category to num
        for i in range(X.shape[1]):
            col_cat_map = self.cat_map_dict[i][0]
            X[:, i] = list(map(lambda x: col_cat_map[x], X[:, i]))
        X = X.astype(np.int64)
        self.X_categorized = X
        
        ## choose initial cluster center
        if self.init_cci is None:
            np.random.seed(self.random_state)
            obj_val = np.infty
            for _ in range(self.n_iter):
                cluster_center_idx = np.random.choice(range(X.shape[0]), size=k, replace=False)
                
                ## Assign Cluster
                cluster_center = X[cluster_center_idx, :]
                predicted_cluster = self.predict(X, cluster_center, is_fitting)    
                cur_val = 0

                for i, pc in enumerate(predicted_cluster):
                    cur_val += self.dissimilarity(X[i:], cluster_center[pc])
                if cur_val < obj_val:
                    obj_val = cur_val
                    opt_cci = cluster_center_idx
            self.init_cci = cluster_center_idx

        ## Apply Algorithm
        step = 1
        cur_cluster_center = X[self.init_cci, :]
        cur_predicted_cluster = self.predict(X, cur_cluster_center, is_fitting)

        while step <= self.max_step:

            ## Update Center
            next_cluster_center = []

            for c in np.unique(cur_predicted_cluster):
                c_idx = np.where(cur_predicted_cluster == c)[0]
                temp_cluster_center = np.apply_along_axis(self.get_mode, 0, X[c_idx,:])
                temp_cluster_center = temp_cluster_center.tolist()
                next_cluster_center.append(temp_cluster_center)

            next_cluster_center = np.array(next_cluster_center)
            
            ## Assign Cluster
            next_predicted_cluster = self.predict(X, next_cluster_center, is_fitting)

            ## Stop Criterion
            if len(np.unique(next_predicted_cluster)) != k:
                self.predicted_cluster = cur_predicted_cluster
                self.cluster_center = cur_cluster_center
                break

            if all(cur_predicted_cluster == next_predicted_cluster):
                self.predicted_cluster = cur_predicted_cluster
                self.cluster_center = cur_cluster_center
                break

            else:
                cur_cluster_center = next_cluster_center
                cur_predicted_cluster = next_predicted_cluster
                step += 1

        self.predicted_cluster = next_predicted_cluster
        self.cluster_center = next_cluster_center

        return self

    ## Assign Clusters

    def predict(self, X, cluster_center=None, is_fitting=False):
        if is_fitting:
            return np.array([self._predict(x, cluster_center) for x in X])

        else:
            for i in range(X.shape[1]):
                col_cat_map = self.cat_map_dict[i][0]
                X[:, i] = list(map(lambda x: col_cat_map[x], X[:, i]))
            X = X.astype(np.int64)

            return np.array([self._predict(x, cluster_center) for x in X])

    def _predict(self, x, cluster_center=None, is_fitting=False):

        if cluster_center is None:
            cluster_center = self.cluster_center
        return np.argmin([self.dissimilarity(x, cc) for cc in cluster_center])

In [10]:
# 경로 설정
data_path = './data'
saved_path = './code/saved'
output_path = './code/submission'

In [11]:
history_df = pd.read_csv(os.path.join(data_path, 'history_data.csv'), encoding='utf-8')
profile_df = pd.read_csv(os.path.join(data_path, 'profile_data.csv'), encoding='utf-8')
meta_df = pd.read_csv(os.path.join(data_path, 'meta_data.csv'), encoding='utf-8')
watch_e_df = pd.read_csv(os.path.join(data_path, 'watch_e_data.csv'), encoding='utf-8')
search_df =  pd.read_csv(os.path.join(data_path, 'search_data.csv'), encoding='utf-8')
buy_df = pd.read_csv(os.path.join(data_path, 'buy_data.csv'), encoding='utf-8')
meta_plus_df = pd.read_csv(os.path.join(data_path, 'meta_data_plus.csv'), encoding='utf-8')

In [16]:
import pickle
with open('tag_for_all_id.pickle', 'rb') as f:
    tag = pickle.load(f)

In [17]:
tag

Unnamed: 0,album_id,tag
0,749,타요
1,750,타요
2,2131,타요
3,2625,타요
4,2594,타요
...,...,...
39870,39872,교육
39871,39873,교육
39872,39874,교육
39873,4779,동화


In [22]:
genre_cast = meta_df.drop_duplicates('album_id')[['album_id', 'genre_mid', 'cast_1']]

In [27]:
genre_cast.loc[genre_cast['genre_mid'] == '노래 율동'] = '노래율동'

In [62]:
album_df = pd.merge(genre_cast, tag, on = 'album_id', how = 'left').fillna('unknown')

In [63]:
album_df

Unnamed: 0,album_id,genre_mid,cast_1,tag
0,749,TV만화,타요,타요
1,750,TV만화,타요,타요
2,2131,TV만화,타요,타요
3,2625,TV만화,타요,타요
4,2594,TV만화,타요,타요
...,...,...,...,...
39870,39872,놀이교실,unknown,교육
39871,39873,놀이교실,unknown,교육
39872,39874,놀이교실,unknown,교육
39873,4779,책,unknown,동화


In [56]:
le = LabelEncoder()
album_df['genre_mid'] = le.fit_transform(album_df['genre_mid'])
album_df['cast_1'] = le.fit_transform(album_df['cast_1'])
album_df['tag'] = le.fit_transform(album_df['tag'])

In [66]:
X = album_df[['genre_mid', 'cast_1', 'tag']].values ## 데이터

In [67]:
X

array([['TV만화', '타요', '타요'],
       ['TV만화', '타요', '타요'],
       ['TV만화', '타요', '타요'],
       ...,
       ['놀이교실', 'unknown', '교육'],
       ['책', 'unknown', '동화'],
       ['노래율동', '타요와 친구들', '교육']], dtype=object)

In [76]:
my_kmodes = KModesAlgorithm(k = 3, init_cci=[0, 128, 12588]).fit(X) ## 클래스 초기화 및 클러스터링
pred_cluster = my_kmodes.predict(X) ## 최종 클러스터

In [82]:
X.shape

(39875, 3)

In [88]:
my_kmodes4 = KModesAlgorithm(k = 4, init_cci=[0, 12588, 27669, 33239]).fit(X) ## 클래스 초기화 및 클러스터링
pred_cluster4 = my_kmodes4.predict(X) ## 최종 클러스터
print(pd.Series(pred_cluster4).value_counts())

0    20690
1     7317
3     6447
2     5421
dtype: int64


In [85]:
for i in range(3, 11):
    my_kmodes = KModesAlgorithm(k = i, init_cci=[0, 12588, 27669]).fit(X)
    pred_cluster = my_kmodes.predict(X)
    print( pd.Series(pred_cluster).value_counts() )

0    27137
1     7317
2     5421
dtype: int64
0    27137
1     7317
2     5421
dtype: int64
0    27137
1     7317
2     5421
dtype: int64
0    27137
1     7317
2     5421
dtype: int64
0    27137
1     7317
2     5421
dtype: int64
0    27137
1     7317
2     5421
dtype: int64
0    27137
1     7317
2     5421
dtype: int64
0    27137
1     7317
2     5421
dtype: int64


In [81]:
pd.Series(pred_cluster).value_counts()

0    26102
1    10079
2     3694
dtype: int64