In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import scipy.sparse as sparse
import sklearn.metrics as metrics
import sys as sys





In [3]:
#load data
testdata = pd.read_csv("Test_Train_Sets/Reg_test.csv")
traindata = pd.read_csv("Test_Train_Sets/Reg_train.csv")
valdata = pd.read_csv("Test_Train_Sets/Reg_validate.csv")

In [4]:
traindata.shape, testdata.shape, valdata.shape

((10394, 18), (951, 18), (1268, 18))

In [5]:
traindata.head()

Unnamed: 0,Title,ID,Date,Class,Category,Author,ISBN,Publisher,Pub_Date,Order_Time,Count,Cart,Cart_Date,Device,Address1,Address2,user_purchase_count,book_sell_count
0,파스칼의 팡세,299,20141218,주문,종교,<블레즈 파스칼> 저/<조병준> 역,9788998000000.0,샘솟는기쁨,20140526,19,1,N,,기기PID_PC,경상북도,안동시,1,3
1,내 안의 죄 죽이기,736,20140603,주문,종교,<존 오웬> 저/<김창대> 역,9788993000000.0,브니엘,20140211,21,1,N,,기기PID_PC,충청남도,당진시,1,4
2,전능자의 그늘,937,20140912,주문,종교,<엘리자베스 엘리엇> 저/<윤종석> 역,9788990000000.0,복있는사람,20080630,22,1,N,,기기PID_PC,경기도,포천시,2,4
3,gods at war 거짓신들의 전쟁,937,20140912,주문,종교,<카일 아이들먼> 저/<배응준> 역,9788961000000.0,규장,20130227,22,1,N,,기기PID_PC,경기도,포천시,2,4
4,무정부주의와 기독교,1007,20140901,주문,종교,<자끄 엘륄> 저/<이창헌> 역,9788971000000.0,대장간,20111027,17,1,N,,기기PID_PC,경기도,의정부시,2,1


In [6]:
print '%d different books and %d different users in traindata' %(np.unique(traindata['Title']).shape[0],np.unique(traindata['ID']).shape[0])
print '%d different books and %d different users in testdata' %(np.unique(testdata['Title']).shape[0],np.unique(testdata['ID']).shape[0])
print '%d different books and %d different users in valdata' %(np.unique(valdata['Title']).shape[0],np.unique(valdata['ID']).shape[0])


4629 different books and 3815 different users in traindata
757 different books and 317 different users in testdata
971 different books and 317 different users in valdata


In [7]:
#dataframe that contains only useful informations
usefulcol = ['Title','ID','Class','Category','Author','ISBN','Count','Address1','Address2','Publisher']
dftrain = traindata[usefulcol]
dftest = testdata[usefulcol]
dfval = valdata[usefulcol]

In [8]:
#Function that calculates similarity between two strings
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [9]:
title_train = np.unique(dftrain['Title'].values)
author_train = np.unique(dftrain['Author'].values)
publisher_train = np.unique(dftrain['Publisher'].values)
title_train_dict = {i:t for i,t in enumerate(title_train)}
author_train_dict = {i:t for i,t in enumerate(author_train)}
publisher_train_dict = {i:t for i,t in enumerate(publisher_train)}
print title_train.shape[0],author_train.shape[0],publisher_train.shape[0]

4629 3267 729


In [10]:
import re
def author_to_list(author):
    #split the list of authors in a single string into a numpy list of authors. remove brackets, '저', '역', '그림', '/', and etc
    #remove translater
    if author != author:
        newauthorlist = []
    else:
        split_list = np.array(['/'])
        mask = np.array([s in author for s in split_list])
        split_list = split_list[mask]
        
        for s in split_list:
            author = re.sub(s,' ..',author)
        authorlist = author.split(' ..')
        authorlist = np.array(filter(None,authorlist))
        
        translatormask = np.array([(' 역' in a) or (' 공역' in a) or 
                                   (' 편역' in a) or (' 감수' in a) or 
                                   (' 옮김' in a) or (' 등역' in a) or
                                   (' 엮음' in a) for a in authorlist])
        authorlist = authorlist[~translatormask]
        
        remove_str_list = np.array(['<','>',' 그림',' 저',' 공저',' 글', ' 등저',' 공편','편'])
        newauthorlist = np.array([])
        for astr in authorlist:
            alist = np.array(astr.split(','))
            for i,a in enumerate(alist):
                for s in remove_str_list:
                    a = re.sub(s,'',a)
                alist.flat[i]=a
            newauthorlist = np.append(newauthorlist,alist)
                
    return newauthorlist       

In [97]:
#dataframe of all books (no duplicates)
dfbook = dftrain[['Author','Title','Publisher','ISBN']]
dfbook = dfbook[~dfbook.duplicated()]
dfbook = dfbook.reset_index()
dfbook.drop('index', axis=1, inplace=True)

In [13]:
#Make list of all authors
author_entire_list = np.array([])
for a in author_train:
    author_entire_list = np.append(author_entire_list,author_to_list(a))
author_entire_list = np.unique(author_entire_list)
author_entire_list.shape

(2919,)

In [15]:
#Make binary author lists: Nbooks X Nauthors matrix
#Use author_entire_list for index-author matching
Xauthor = np.zeros((dfbook.index.shape[0],author_entire_list.shape[0]))
for ind in dfbook.index:
    alist = author_to_list(dfbook.iloc[ind]['Author'])
    for a in alist:
        aind = np.where(author_entire_list==a)[0][0]
        Xauthor[ind,aind] = 1

In [16]:
#np.savetxt('Xauthor_Rel.txt',Xauthor)

In [17]:
#Make binary publisher lists: Nbooks X Npublihser matrix
#use publisher_train or publisher_train_dict for index matching
Xpublisher = np.zeros((dfbook.index.shape[0],publisher_train.shape[0]))
for ind in dfbook.index:
    pub = dfbook.iloc[ind]['Publisher']
    pind = np.where(publisher_train==pub)[0][0]
    Xpublisher[ind,pind] = 1

In [18]:
#np.savetxt('Xpublisher_Rel.txt',Xpublisher)

In [19]:
%%time
#Make similarity matrix for titles: Nbooks X Nbooks matrix
#Use dfbooks for index-title matching
Nbooks =dfbook.index.shape[0]
simmat_title = np.zeros((Nbooks,Nbooks))
for i in range(Nbooks):
    simmat_title[i,i] = 1.0
    for j in range(i+1,Nbooks):
        simmat_title[i,j] = similar(dfbook.iloc[i]['Title'],dfbook.iloc[j]['Title'])
        simmat_title[j,i] = simmat_title[i,j]


CPU times: user 1h 51min 22s, sys: 2min 25s, total: 1h 53min 47s
Wall time: 1h 59min 25s


In [20]:
#np.savetxt('simmat_title_Rel.txt',simmat_title)

In [21]:
#similarity matrices of author and publisher
simmat_author = Xauthor.dot(Xauthor.T)
simmat_publisher = Xpublisher.dot(Xpublisher.T)

In [30]:
def recommend_based_on_similarity(dftrain,userid,Nrec,simmat,dfbook=dfbook):
    #recommend Nrec books similar to the books that the chosen user has bought so far
    #input: 
    # dftrain: training data
    # userid: id of the user
    # Nrec: Number of books to be recommended
    # dfbook: the dataframes of books
    # simmat: Nbooks X Nbooks similarity matrix
    usertrans = dftrain[dftrain['ID']==userid]  #user transaction
    usertrans = usertrans[['Author','Title','Publisher','ISBN']]
    usertrans = usertrans[~usertrans.duplicated()]
    excludeind = np.array([])
    simvec = np.zeros(simmat.shape[1])
    for ind in usertrans.index:
        bookind = dfbook[(dfbook['Title']==usertrans[usertrans.index == ind]['Title'].values[0]) 
                         & (dfbook['Publisher']==usertrans[usertrans.index == ind]['Publisher'].values[0])].index[0]
        simvec = simvec+simmat[bookind,:]
        excludeind = np.append(excludeind,[bookind])
    simvec = simvec/usertrans.index.shape[0]
    dfsimvec = dfbook
    dfsimvec['Similarity'] = simvec

    dfsimvec = dfsimvec.iloc[filter(lambda x: x not in excludeind,dfsimvec.index)]
    return dfsimvec.sort('Similarity',ascending=False)[:Nrec]
    
    

In [83]:
#Function that estimates the prediction accuracy
def predict_percentage(dftrain,dftest,Nrec,weightdict,simmat_author=simmat_author,
                       simmat_title=simmat_title,simmat_publisher=simmat_publisher,dfbook=dfbook):
    simmat = weightdict['Author']*simmat_author+weightdict['Title']*simmat_title
    +weightdict['Publisher']*simmat_publisher
    testusers = np.unique(dftest['ID'].values)
    totalbooks = 0
    totalsuccess = 0
    for user in testusers:
        dfrec = recommend_based_on_similarity(dftrain,user,Nrec,simmat,dfbook=dfbook)
        dftestuser = dftest[dftest['ID']==user]
        totalbooks = totalbooks+Nrec
        for i in range(dfrec.shape[0]):
            if np.any((dftestuser['Title']==dfrec.iloc[i]['Title']) 
                & (dftestuser['Publisher']==dfrec.iloc[i]['Publisher'])):
                totalsuccess=totalsuccess+1
    return float(totalsuccess)/float(totalbooks)
            
        

In [34]:
%%time{
#Grid Search to find best combination of (w_A,w_P,w_T)
stepsize = 0.05

Aws = np.arange(0,1,stepsize)

accuracy_list = []
for A in Aws:
    Tws = np.arange(0,1-A,stepsize)
    for T in Tws:
        P = 1-A-T
        weightdict = {'Author':A,'Publisher':P,'Title':T}
        percentage = predict_percentage(dftrain,dfval,4,weightdict)
        accuracy = weightdict
        accuracy['accuracy'] = percentage
        accuracy_list = accuracy_list+[accuracy]
        

CPU times: user 1h 33min 34s, sys: 3min 8s, total: 1h 36min 43s
Wall time: 1h 47min 7s


In [35]:
import json
with open('accuracy_rel.json', 'w') as fp:
    json.dump(accuracy_list, fp)


In [98]:
accuracy_df = pd.DataFrame(accuracy_list)
accuracy_df = accuracy_df.sort('accuracy',ascending=False)

In [76]:
bestweightdict = dict(accuracy_df[accuracy_df['accuracy']==accuracy_df['accuracy'].max()].mean())
print bestweightdict

{'Publisher': 0.33333333333333337, 'Title': 0.16111111111111109, 'accuracy': 0.048895899053627803, 'Author': 0.50555555555555554}


In [79]:
#merge train and validation set
dftrainandval = pd.concat([dftrain,dfval])
dftrainandval = dftrainandval.reset_index()
dftrainandval.drop('index', axis=1, inplace=True)

#dataframe of all books in train and val data(no duplicates)
dfbookall = dftrainandval[['Author','Title','Publisher','ISBN']]
dfbookall = dfbookall[~dfbookall.duplicated()]
dfbookall = dfbookall.reset_index()
dfbookall.drop('index', axis=1, inplace=True)


In [80]:
author_entire_list_all = np.array([])
for a in np.unique(dfbookall['Author']):
    author_entire_list_all = np.append(author_entire_list_all,author_to_list(a))
author_entire_list_all = np.unique(author_entire_list_all)
author_entire_list_all.shape

(3125,)

In [81]:
#make new simmat for merged dataframe
#Make binary author lists: Nbooks X Nauthors matrix
#Use author_entire_list for index-author matching
Xauthor_all = np.zeros((dfbookall.index.shape[0],author_entire_list_all.shape[0]))
for ind in range(dfbookall.shape[0]):
    alist = author_to_list(dfbookall.iloc[ind]['Author'])
    for a in alist:
        aind = np.where(author_entire_list_all==a)[0][0]
        Xauthor_all[ind,aind] = 1
        
#Make binary publisher lists: Nbooks X Npublihser matrix
#use publisher_train or publisher_train_dict for index matching
publisher_list_all = np.unique(dfbookall['Publisher'])
Xpublisher_all = np.zeros((dfbookall.index.shape[0],publisher_list_all.shape[0]))
for ind in range(dfbookall.shape[0]):
    pub = dfbookall.iloc[ind]['Publisher']
    pind = np.where(publisher_list_all==pub)[0][0]
    Xpublisher_all[ind,pind] = 1

In [82]:
%%time
#Update similarity matrix for titles:
Nbooks_all =dfbookall.index.shape[0]
simmat_title_all = np.zeros((Nbooks_all,Nbooks_all))
Nbooks = dfbook.index.shape[0]
for i in range(Nbooks,Nbooks_all):
    for j in range(Nbooks):
        simmat_title_all[i,j] = similar(dfbookall.iloc[i]['Title'],dfbookall.iloc[j]['Title'])
        simmat_title_all[j,i] = simmat_title_all[i,j]
        
for i in range(Nbooks,Nbooks_all):
    simmat_title_all[i,i] = 1.0
    for j in range(i+1,Nbooks_all):
        simmat_title_all[i,j] = similar(dfbookall.iloc[i]['Title'],dfbookall.iloc[j]['Title'])
        simmat_title_all[j,i] = simmat_title_all[i,j]


CPU times: user 25min 23s, sys: 10.6 s, total: 25min 34s
Wall time: 31min 30s


In [84]:
simmat_author_all = Xauthor_all.dot(Xauthor_all.T)
simmat_publisher_all = Xpublisher_all.dot(Xpublisher_all.T)

In [85]:
#Estimate Test error
testerror = predict_percentage(dftrainandval,dftest,3,bestweightdict,
                               simmat_author=simmat_author_all,simmat_title=simmat_title_all,
                               simmat_publisher=simmat_publisher_all,dfbook=dfbookall)
print testerror

0.0399579390116


In [99]:
dfbook[dfbook['Title']=='무정부주의와 기독교']

Unnamed: 0,Author,Title,Publisher,ISBN
4,<자끄 엘륄> 저/<이창헌> 역,무정부주의와 기독교,대장간,9788971000000.0


In [100]:
simmat = bestweightdict['Author']*simmat_author+bestweightdict['Publisher']*simmat_publisher+bestweightdict['Title']*simmat_title
simvec = simmat[4,:]

In [101]:
print dfbook.iloc[4]
dfsimvec = dfbook
dfsimvec['similarity'] = simvec

Author       <자끄 엘륄> 저/<이창헌> 역
Title               무정부주의와 기독교
Publisher                  대장간
ISBN              9.788971e+12
Name: 4, dtype: object


In [102]:
#Recommend 30 books similar to the chosen book
dfsimvec = dfsimvec.sort('similarity',ascending=False)
dfsimvec = (dfsimvec[['ISBN','similarity','Title','Author','Publisher']])
dfsimvec.iloc[0:30].to_csv('무정부주의와기독교_similarity.csv')