# User-based collaborative filtering

In [None]:
import numpy as np
import scipy as sp
import pandas as pd
import pickle
import itertools
import operator

### get data

In [None]:
def get_data(date):
    """Load the pickled article and comment tables for one day.

    Reads ``./data/article_<date>.plk`` and ``./data/comment_<date>.plk``.
    Article rows that repeat an earlier ``newsid`` are removed (the first
    occurrence is kept); the comment table is returned exactly as loaded.

    NOTE(review): ``pickle.load`` can execute arbitrary code, so these files
    must come from a trusted source.

    Args:
        date: Date string embedded in the file names, e.g. ``"2016-06-01"``.

    Returns:
        Tuple ``(article_df, comment_df)``.
    """
    article_path = "./data/article_" + date + ".plk"
    comment_path = "./data/comment_" + date + ".plk"

    with open(article_path, 'rb') as article_file:
        articles = pickle.load(article_file)

    # keep only the first row of every newsid
    is_repeat = articles.duplicated(subset="newsid")
    articles = articles[~is_repeat]

    with open(comment_path, 'rb') as comment_file:
        comments = pickle.load(comment_file)

    return articles, comments

In [None]:
# Load the article and comment tables for one day; the last line is the
# cell output and shows the number of rows in each table.
date = "2016-06-01"
article_df, comment_df = get_data(date)
len(article_df), len(comment_df)

### make data df

In [None]:
def aritcle_user(article_df, comment_df, search_user_num=0, user_comment_num=0):
    """Collect, for every commenting user, the known articles they commented on.

    Args:
        article_df: DataFrame with a ``newsid`` column.
        comment_df: DataFrame with ``userIdNo`` and ``aid`` columns. Every
            ``aid`` of an inspected user must be convertible with ``int()``.
        search_user_num: Number of users to inspect, taken in order of first
            appearance in ``comment_df``; ``0`` means all users.
        user_comment_num: When non-zero, keep only users with strictly more
            than this many matching comments; ``0`` disables the filter.

    Returns:
        Tuple ``(aid_columns, user_list)``. ``aid_columns`` is the list of
        ``newsid`` values. ``user_list`` holds one dict per kept user:
        ``{"user_id": <id>, "aid": [<int aid>, ...]}`` where the aid list
        contains only ids present in ``aid_columns`` (repeats are kept, one
        per comment).
    """
    aid_columns = list(article_df["newsid"])
    user_id_list = comment_df["userIdNo"].unique()

    # 0 is the sentinel for "inspect every user"
    if search_user_num == 0:
        search_user_num = len(user_id_list)

    # Set membership is O(1); testing against the list was O(#articles)
    # for every single comment.
    known_aids = set(aid_columns)

    # Group the comments once instead of re-scanning the whole comment table
    # with a boolean mask for every user.
    aids_by_user = {
        user_id: group.tolist()
        for user_id, group in comment_df.groupby("userIdNo", sort=False)["aid"]
    }

    user_list = []
    for idx, user_id in enumerate(user_id_list[:search_user_num]):

        # progress output
        if idx % 5000 == 0:
            print(idx, search_user_num)

        # A missing key (e.g. a NaN user id) yields no comments, which is
        # what an equality mask on the column produces as well.
        aid_list = [int(aid) for aid in aids_by_user.get(user_id, [])]

        user_list.append({
            "user_id": user_id,
            "aid": [aid for aid in aid_list if aid in known_aids],
        })

    # 0 is the sentinel for "do not filter by comment count"
    if user_comment_num != 0:
        user_list = [user for user in user_list if len(user["aid"]) > user_comment_num]

    return aid_columns, user_list

In [None]:
def user_aid_count(aid_column, user_aid):
    """Return how many entries of ``user_aid`` are equal to ``aid_column``.

    Args:
        aid_column: The article id to look for.
        user_aid: Iterable of article ids (repeats allowed).

    Returns:
        Number of matching entries; ``0`` when there is none.
    """
    return sum(1 for aid in user_aid if aid_column == aid)

def make_data_set(aid_columns, user_list):
    """Build the user x article comment-count matrix.

    Args:
        aid_columns: Article ids; they become the column labels.
        user_list: Sequence of dicts with an ``"aid"`` key holding the list of
            article ids a user commented on (repeats count individually).

    Returns:
        DataFrame with one row per user, in ``user_list`` order and labelled
        ``0..n-1``. Each cell is the number of times that user's ``"aid"``
        list contains the column's article id. Columns whose total is 0 are
        dropped.
    """
    from collections import Counter

    rows = []

    for idx, user in enumerate(user_list):

        # progress output
        if idx % 5000 == 0:
            print(idx, len(user_list))

        # One counting pass per user replaces a membership scan plus a count
        # scan for every (user, article) cell; a Counter returns 0 for ids
        # the user never commented on.
        counts = Counter(user["aid"])
        rows.append([counts[aid_column] for aid_column in aid_columns])

    # Build the frame in one go: growing it with df.loc[len(df)] copies the
    # data on every insertion.
    df = pd.DataFrame(rows, columns=aid_columns)

    # remove columns whose sum is 0
    remove_column_list = list(df.columns[df.sum(axis=0) == 0])

    return df.drop(remove_column_list, axis=1)

In [None]:
# aritcle_user(article_df, comment_df, number of users, comments per user)
# 0 for either number means "no limit" / "no filter".
%time aid_columns, user_list = aritcle_user(article_df, comment_df, 0, 0)
len(aid_columns), len(user_list)

In [None]:
# Build (and time) the user x article count matrix.
%time data_df = make_data_set(aid_columns, user_list)

In [None]:
# save: pickle data_df to ./data/data_df_<date>.plk
date = "2016-06-01"
with open("./data/data_df_" + date + ".plk", 'wb') as file:
    pickle.dump(data_df, file)

In [None]:
# load: restore data_df from ./data/data_df_<date>.plk and print its row count
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
date = "2016-06-01"

with open("./data/data_df_" + date + ".plk", 'rb') as file:
    data_df = pickle.load(file)
    print(len(data_df))


### predict df

In [None]:
def calc_cosin_dist(v1, v2):
    """Return the cosine distance ``1 - cos(v1, v2)`` of two vectors.

    Args:
        v1: 1-D sequence of numbers.
        v2: 1-D sequence of numbers with the same length as ``v1``.

    Returns:
        ``0`` for vectors pointing the same way, ``1`` for orthogonal vectors
        and ``2`` for opposite vectors (up to floating-point error). When
        either vector has zero length the cosine is undefined; ``1.0`` is
        returned then, i.e. a zero vector is treated as having no similarity
        to anything, itself included.
    """
    # Coerce to float so object-dtype input is handled too.
    v1 = np.asarray(v1, dtype=float)
    v2 = np.asarray(v2, dtype=float)

    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)

    # Dividing by a zero norm would give NaN (plus a RuntimeWarning), and NaN
    # distances cannot be ordered meaningfully by callers that sort them.
    if norm_product == 0:
        return 1.0

    return 1.0 - np.dot(v1, v2) / norm_product

def get_dist_list(df, user_num=0):
    """Compute the cosine distance from one row of ``df`` to every row.

    Args:
        df: DataFrame whose rows are the vectors to compare.
        user_num: Index label of the reference row.

    Returns:
        List of ``(index_label, distance)`` tuples in row order. The reference
        row itself is included. Distances below ``0.00001`` are replaced by
        ``0``.
    """
    target = df.loc[user_num]

    pairs = []
    for label, vector in df.iterrows():
        distance = calc_cosin_dist(target, vector)
        # snap tiny (or slightly negative) values to exactly 0
        pairs.append((label, 0 if distance < 0.00001 else distance))

    return pairs
    
def sort_dists(dists, user_num=0):
    """Return the nearest entries of ``dists``, skipping the very first one.

    Args:
        dists: Iterable of ``(index, distance)`` pairs.
        user_num: Maximum number of entries to return; ``0`` means all.

    Returns:
        List of ``(index, distance)`` tuples ordered by ascending distance,
        without the closest entry and with at most ``user_num`` items.

    NOTE(review): dropping the closest entry presumably removes the query
    user itself. The sort is stable, so among entries tied at the smallest
    distance the one that comes first in ``dists`` is the one dropped.
    """
    limit = user_num if user_num != 0 else len(dists)

    ranked = sorted(dists, key=operator.itemgetter(1))

    return [(i, dist) for i, dist in ranked[1:limit + 1]]

In [None]:
def dists(data_df, neighbor_num=3):
    """Find the nearest neighbours of every user in ``data_df``.

    Args:
        data_df: DataFrame with one row per user.
        neighbor_num: How many neighbours to keep per user (default 3, the
            previously hard-coded value). It is passed straight to
            ``sort_dists``, which treats ``0`` as "all".

    Returns:
        List with one entry per row of ``data_df`` (in index order); each
        entry is the list of ``(index_label, distance)`` tuples returned by
        ``sort_dists`` for that user.
    """
    dist_list = []

    for user in data_df.index:
        pairs = get_dist_list(data_df, user)

        # sort_dists discards the first entry of a stable sort by distance.
        # Another user tied with this user's own distance (0 for any
        # non-zero row) could otherwise be discarded instead, leaving the
        # user as their own neighbour. Moving the user's own entry to the
        # front guarantees it wins the tie. sorted() is stable, so all other
        # entries keep their order.
        pairs = sorted(pairs, key=lambda pair: pair[0] != user)

        dist_list.append(sort_dists(pairs, neighbor_num))

    return dist_list

In [None]:
def predict(data_df):
    """Predict each user's row as the mean of their nearest neighbours' rows.

    Args:
        data_df: User x article DataFrame.

    Returns:
        DataFrame with the same columns and the same index as ``data_df``.
        Row ``i`` is the column-wise mean of the ``data_df`` rows of user
        ``i``'s neighbours, as returned by ``dists``. A user without any
        neighbour (e.g. when ``data_df`` has a single row) gets a row of NaN.
    """
    dist_list = dists(data_df)

    rows = []

    for neighbours in dist_list:
        # A plain list is an unambiguous row selector for .loc, and this
        # also works when the neighbour list is empty.
        neighbour_ids = [idx for idx, _ in neighbours]
        rows.append(data_df.loc[neighbour_ids, :].mean(axis=0).values)

    # Build the frame once instead of growing it row by row. Reusing
    # data_df's index keeps label lookups on both frames consistent; for the
    # default 0..n-1 index the labels are the same as before.
    return pd.DataFrame(rows, columns=data_df.columns, index=data_df.index)

In [None]:
# Compute (and time) the neighbour-based prediction matrix.
%time predict_df = predict(data_df)

### compare test

In [None]:
def compare_test(data_df, predict_df):
    """Return the fraction of cells where prediction and data agree.

    A cell counts as agreeing when both values are exactly 0 or both values
    are greater than 0.

    Args:
        data_df: Observed user x article DataFrame.
        predict_df: Predicted DataFrame; it must contain every index label
            and every column label of ``data_df``.

    Returns:
        Float in ``[0, 1]``; ``0.0`` when ``data_df`` has no cells.
    """
    total = len(data_df.index) * len(data_df.columns)

    # Nothing to compare; avoids dividing by zero.
    if total == 0:
        return 0.0

    # Align the prediction to data_df's labels, then compare whole arrays at
    # once instead of doing chained .loc lookups for every single cell.
    aligned = predict_df.loc[data_df.index, data_df.columns]
    actual = np.asarray(data_df.values, dtype=float)
    predicted = np.asarray(aligned.values, dtype=float)

    both_zero = (actual == 0) & (predicted == 0)
    both_positive = (actual > 0) & (predicted > 0)
    same_count = int(np.count_nonzero(both_zero | both_positive))

    return same_count / total
        

In [None]:
# Score (and time) how well predict_df agrees with data_df.
%time compare_test(data_df, predict_df)

### recommend

In [None]:
def recomend(data_df, predict_df, idx=0):
    """Recommend articles for one user, best prediction first.

    Args:
        data_df: Observed user x article DataFrame.
        predict_df: Predicted DataFrame with the same labels.
        idx: Index label of the user.

    Returns:
        Series of the user's predicted values that are greater than 0,
        excluding articles whose observed value is already greater than 0,
        sorted in descending order.
    """
    predict_datas = predict_df.loc[idx]
    user_datas = data_df.loc[idx]

    predict_datas = predict_datas[predict_datas > 0]
    user_datas = user_datas[user_datas > 0]

    # Remove articles the user already commented on: one vectorised
    # membership test instead of a nested loop with repeated drop() calls.
    already_commented = predict_datas.index.isin(user_datas.index)
    predict_datas = predict_datas[~already_commented]

    return predict_datas.sort_values(ascending=False)

In [None]:
# Recommendations for the user with index label 0.
predict_datas = recomend(data_df, predict_df, 0)

In [None]:
def recomend(predict_df, idx=0, other_idx=1):
    """Compare the predictions of two users.

    NOTE(review): this redefines ``recomend`` with a different signature than
    the three-argument definition earlier in the notebook; once this cell has
    run, the earlier call form no longer works.

    Args:
        predict_df: Predicted user x article DataFrame.
        idx: Index label of the user to recommend for (default 0, the
            previously hard-coded value).
        other_idx: Index label of the user whose positive predictions are
            excluded (default 1, the previously hard-coded value).

    Returns:
        Tuple ``(predict_datas, user_datas)``. ``predict_datas`` holds the
        predictions greater than 0 of row ``idx``, excluding articles that
        are also greater than 0 in row ``other_idx``, sorted in descending
        order. ``user_datas`` holds the predictions greater than 0 of row
        ``other_idx``, in column order.
    """
    predict_datas = predict_df.loc[idx]
    user_datas = predict_df.loc[other_idx]

    predict_datas = predict_datas[predict_datas > 0]
    user_datas = user_datas[user_datas > 0]

    # Drop the articles present in both rows with one vectorised membership
    # test instead of a nested loop with repeated drop() calls.
    shared = predict_datas.index.isin(user_datas.index)
    predict_datas = predict_datas[~shared]

    predict_datas = predict_datas.sort_values(ascending=False)

    return predict_datas, user_datas

In [None]:
# Uses the single-argument recomend defined in the cell above; the last line
# is the cell output (both lengths plus the sorted recommendations).
predict_datas, user_datas = recomend(predict_df)
len(predict_datas), len(user_datas), predict_datas

### content-based => factor => Matrix factorization
- 몇개의 factor를 조합해서 (예: 해피엔딩, 배드엔딩 / 코메디, 새드 ) 몇가지 factor로 데이터를 추론함
- 사용자 10억, 영화1억, 요소 4개 -> 사용자(matrix : 10억*4) * 영화(matrix : 4*1억) = 추천지수

$R \approx Q^TP$ 가 되도록, 오차 $\lVert R - Q^TP \rVert$ 를 최소화하는 Q, P를 찾아야 함

### collaborative filtering
- User-Based : user를 기준으로 비슷한 user를 찾아서 채움
- Item-Based : item을 기준으로 비슷한 item을 찾아서 채움

빈칸 메우기 - Imputation
실제 데이터 - 타이타닉 데이터

pd - groupby transform를 이용해서 비슷한 데이터에 대한 빈칸을 채움

- User-Based
similarity matrix

|   | A | B | C |
|---|---|---|---|
| A | 1 | - | 2 |
| B | 0 | 2 | - |
| C | 2 | 3 | 4 |

사용자에 대해서 하고 sorting이나 ranking을 한다.
한 10명정도에 대한 평균을 낸다.(가중치를 줘서 weighted average를 준다.)

In [None]:
word2vec - 단어 예측기

- distributed word vectorization

gre문제 


- 확률적으로 단어를 나오게 마코프 체인을 많이 사용

