In [1]:
import numpy as np
import scipy as sp
from scipy import spatial
import pandas as pd
import pickle
import itertools
import operator

### set date

In [2]:
date = "2016-06-01"

### article df

In [3]:
def load_article(date):
    file = open("./data/article_" + date + ".plk", 'rb')
    article_df = pickle.load(file)
    article_df = article_df[np.invert(article_df.duplicated(subset="newsid"))] # remove duplication
    article_df = article_df[article_df["comment"] > 1000]
    file.close()
    return article_df

In [4]:
%time article_df = load_article(date)
print(len(article_df))
article_df.tail(2)

CPU times: user 45.8 ms, sys: 13 ms, total: 58.8 ms
Wall time: 57.3 ms
21


Unnamed: 0,newsid,oid,newspaper,title,link,comment,likeit,content,date,category
1939,3179775,23,조선일보,"&quot;미세먼지에 손놓은 정부, 애꿎은 고등어·삼겹살 탓&quot;",http://news.naver.com/main/read.nhn?mode=LSD&m...,2596,1759,"['미세먼지 스트레스'에 빠진 한국] - 무능한 정부에 뿔나""연일 잘못된 예보 남발...",2016-06-01,3
1977,2703707,32,경향신문,"스위스, 성인에 매달 300만원 ‘꿈’ 이뤄질까",http://news.naver.com/main/read.nhn?mode=LSD&m...,1069,461,ㆍ‘기본소득 지급안’ 놓고 5일 국민투표 실시“도둑을 교수형에 처하는 대신 모두에게...,2016-06-01,4


### comment df

In [5]:
def load_comment(date):
    file = open("./data/comment_" + date + ".plk", 'rb')
    comment_df = pickle.load(file)
    comment_df = comment_df[(comment_df["good"] > 0) & (comment_df["bad"] > 0)].reset_index(drop=True) # remove good:0, bad:0 
    comment_df = comment_df[comment_df["userIdNo"].str.len() < 10] # remove userIdNo > 10
    comment_df["aid"] = comment_df["aid"].apply(lambda aid: int(aid)) # change aid data type to int
    file.close()
    return comment_df

In [6]:
%time comment_df = load_comment(date)
print(len(comment_df))
comment_df.tail(2)

CPU times: user 353 ms, sys: 62.6 ms, total: 415 ms
Wall time: 413 ms
54799


Unnamed: 0,category,aid,oid,userIdNo,userName,good,bad,contents,regTime
55866,5,2620983,25,4RWHW,tars****,7.0,2.0,갤럭시앱이랑 플레이스토어만 쓰면 되는데...,2016-06-01 08:19:34
55867,5,2620983,25,13jF4,2858****,96.0,5.0,이거머냐 거부해도 못하게 악고 다른앱쓸려고하면 중요 어플다운로드중이라고하고 다른어플...,2016-06-01 02:25:22


### datas matrix

In [7]:
def make_datas(article_df, comment_df):
    
    # make zeros datas
    unique_user = comment_df["userIdNo"].unique()
    article_list = np.array(article_df["newsid"])
    datas = np.zeros([len(unique_user), len(article_list)])
    
    df = comment_df
    
    for idx, row in df.iterrows():
        
        userIdNo = row["userIdNo"]
        aid = row["aid"]
        
        # continue when no aid in article_list
        if aid not in article_list: 
            continue
            
        # fill values
        row_idx = int(np.where(unique_user==userIdNo)[0])
        aid_idx = int(np.where(article_list==int(aid))[0])
        value = int(datas[row_idx, aid_idx:aid_idx+1])
        datas[row_idx, aid_idx:aid_idx+1] = value + 1
        
    return datas, unique_user, article_list

In [8]:
%time datas, unique_user, article_list = make_datas(article_df, comment_df)

CPU times: user 29.3 s, sys: 113 ms, total: 29.4 s
Wall time: 29.5 s


### reduce sample

In [9]:
# remove user samples comment <= 10 (댓글이 10개 이하인 사용자들 제거)
unique_user = unique_user[datas.sum(axis=1) > 10]
datas = datas[datas.sum(axis=1) > 10]
len(datas), len(unique_user)

(95, 95)

### reduce column

In [10]:
article_list = article_list[datas.sum(axis=0) > 0]
datas = datas[:,datas.sum(axis=0) > 0]
len(article_list), datas.shape

(18, (95, 18))

### predict matrix

In [24]:
def predict_vector(datas, target_idx):
    
    dists = [ 
        (idx, spatial.distance.cosine(datas[target_idx,:], data))
        for idx, data in enumerate(datas) 
        if target_idx != idx
    ]
     
    dist_list = sorted(dists, key=lambda tup: tup[1])
    
    dist_index, dist_value = zip(*dist_list)
    
    # remove value 1 sample
    dist_index = np.array(dist_index)[np.array(dist_value) > 0][:5]
    dist_value = np.array(dist_value)[np.array(dist_value) > 0][:5]
    
    return datas[dist_index,:].mean(axis=0)
    
def make_predict(datas):
    
    predict_vectors = []
    
    for idx, data in enumerate(datas):
        
        if idx%100 == 0:
            print(idx, len(datas))
        
        predict_vectors.append( predict_vector(datas, idx) )
        
    return np.array(predict_vectors)

In [25]:
%time predict = make_predict(datas)

0 95
CPU times: user 650 ms, sys: 35.1 ms, total: 685 ms
Wall time: 664 ms


### MAE (mean absolute error)

In [44]:
def mae(data, predict):
    delta = data[data > 0] - predict[data > 0]
    return np.absolute(delta).sum()/len(delta)

def mae_mean(datas, predict):
    mae_list = []
    for idx in range(len(datas)):
        result_mae = mae(datas[idx,:], predict[idx,:])
        mae_list.append(result_mae)
    return np.array(mae_list).mean()

In [45]:
mae_mean(datas, predict)

3.1252631578947367

### transform to pandas dataframe

In [46]:
datas_df = pd.DataFrame(columns=article_list, data=datas)
datas_df.head()

Unnamed: 0,2086404,145956,2621084,10323763,414993,8444452,8444085,8443941,8442958,2087055,8442575,8442114,2976940,2835768,3179746,2621054,3179775,2703707
0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
predict_df = pd.DataFrame(columns=article_list, data=predict)
predict_df.head()

Unnamed: 0,2086404,145956,2621084,10323763,414993,8444452,8444085,8443941,8442958,2087055,8442575,8442114,2976940,2835768,3179746,2621054,3179775,2703707
0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.2,11.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,11.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.2,0.0,0.2,0.2,0.0,0.0,12.6,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.8,7.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0


### recomend

In [69]:
def recommend(idx, datas, predict):
    recomend_article = article_list[datas[idx, :] == 0]
    recomend_predict = predict[idx, :][datas[idx, :] == 0]
    
    recomend_article = recomend_article[recomend_predict > 0]
    recomend_predict = recomend_predict[recomend_predict > 0]
    
    print(recomend_article)
    print(recomend_predict)
    
    result_list = []
    
    for i in range(len(recomend_article)):
        result_list.append((recomend_article[i], recomend_predict[i]))
        
    return sorted(result_list, key=lambda tup: tup[1])

In [71]:
result_recommend = recommend(2, datas, predict)
recomend_a, point = zip(*result_recommend)

[10323763  8444452  8444085  8442575]
[ 0.2  0.2  0.2  0.4]


In [84]:
for idx in range(len(recomend_a)):
    aritcle = article_df[article_df["newsid"] == recomend_a[idx]]["content"]
    print(aritcle.values, end="\n\n")

[ '<앵커 멘트> 표현의 자유를  용인하는 우리사회의 수준이 어느 정도인지  서울의 한 미술대생이 교내에 실험적 조형물을 설치했습니다. 인터넷상에서  물의를 빚고 있는, 이른바 <일베>를 형상화한  작품인데,  불과 수일 만에 파괴됐습니다.   어떻게 봐야 할까요?  옥유정 기자가 심층취재했습니다. <리포트> 서울 홍익대학교 앞. 한 남성이 커다란 손가락 모양의 조형물을 야구방망이로 부수고.. 밀어서 넘어뜨립니다. 조형물은 굉음을 내며 바닥으로 떨어지면서 부숴졌습니다. 이 조형물은 이 학교 조소과 4학년 학생이 교내 환경조각전에 출품한 작품이었습니다. 작품 설치 이후 일베의 상징물을 예술 소재로 삼은 것이 적절하냐는 논란이 불거졌습니다.<녹취> 김황경(인천시 계양구) : "예술이라고 하는 걸 핑계 삼아서 인식이 별로 안 좋게 보이는 거 다 아는데 만들었으니까 저도 좋진 않았죠."<녹취> 이시진(홍익대 전자전기공학부) : "물론 비판은 해야겠지만 그 표시만 보고서이건 절대악이다 하면서 달려드는 것은 문제가 있는 것 같다." 이 작품을 만든 학생은  사회적 논란이 되고 있는 인터넷 사이트 일베를 예술을 통해  공론화하려는 의도였다고 밝혔습니다.<녹취> 작품 제작자 : "우리 사회에 존재는 하지만 이게 실체는 없어서 저는 이걸 실체로 만들어서 그 일베를 사람들이 어떻게 생각하는지 보여줄 수 있다고 생각했습니다." 작품을 부순 사람은 계획된 행동이었다고 말했습니다.<녹취> 작품 파손자 : "\'표현의 자유다\'라고 말하는 게 제 생각에는 옳지 않은 것 같아서 많은 사람들이 불만을 품고 있다는 것에 대해 어느 정도 경각심을 느끼게 했으면 좋겠다고 생각해서..." 작품 파손에 대해서 홍익대학교 미술대학 교수들은 편가르기식 흑백논리에 의해 작품이 희생된 것은  안타까움을 넘어 걱정스러운 상황이라고 밝혔습니다.<인터뷰> 박창호(숭실대 사회학과 교수) :  "(작품이) 공공성에 위배된다거나공적 질서를 파괴하는 것으로 보지 않는 이상은 결국 물리력 행사를 통해서 파괴한다는 것