In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import scipy.sparse as sparse
import sklearn.metrics as metrics
import sys as sys





In [2]:
testdata = pd.read_csv("Test_Train_Sets/Sel_test.csv")
traindata = pd.read_csv("Test_Train_Sets/Sel_train.csv")
valdata = pd.read_csv("Test_Train_Sets/Sel_validate.csv")

In [3]:
traindata.shape, testdata.shape, valdata.shape

((15808, 18), (1194, 18), (1592, 18))

In [4]:
print '%d different books and %d different users in traindata' %(np.unique(traindata['Title']).shape[0],np.unique(traindata['ID']).shape[0])
print '%d different books and %d different users in testdata' %(np.unique(testdata['Title']).shape[0],np.unique(testdata['ID']).shape[0])
print '%d different books and %d different users in valdata' %(np.unique(valdata['Title']).shape[0],np.unique(valdata['ID']).shape[0])


2528 different books and 7204 different users in traindata
559 different books and 398 different users in testdata
679 different books and 398 different users in valdata


In [5]:
#dataframe that contains only useful informations
usefulcol = ['Title','ID','Class','Category','Author','ISBN','Count','Address1','Address2','Publisher']
dftrain = traindata[usefulcol]
dftest = testdata[usefulcol]
dfval = valdata[usefulcol]

In [6]:
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [7]:
title_train = np.unique(dftrain['Title'].values)
author_train = np.unique(dftrain['Author'].values)
publisher_train = np.unique(dftrain['Publisher'].values)
title_train_dict = {i:t for i,t in enumerate(title_train)}
author_train_dict = {i:t for i,t in enumerate(author_train)}
publisher_train_dict = {i:t for i,t in enumerate(publisher_train)}
print title_train.shape[0],author_train.shape[0],publisher_train.shape[0]

2528 2128 681


In [8]:
import re
def author_to_list(author):
    #split the list of authors in a single string into a numpy list of authors. remove brackets, '저', '역', '그림', '/', and etc
    #remove translater
    if author != author:
        newauthorlist = []
    else:
        split_list = np.array(['/'])
        mask = np.array([s in author for s in split_list])
        split_list = split_list[mask]
        
        for s in split_list:
            author = re.sub(s,' ..',author)
        authorlist = author.split(' ..')
        authorlist = np.array(filter(None,authorlist))
        
        translatormask = np.array([(' 역' in a) or (' 공역' in a) or 
                                   (' 편역' in a) or (' 감수' in a) or 
                                   (' 옮김' in a) or (' 등역' in a) or
                                   (' 엮음' in a) for a in authorlist])
        authorlist = authorlist[~translatormask]
        
        remove_str_list = np.array(['<','>',' 그림',' 저',' 공저',' 글', ' 등저',' 공편','편'])
        newauthorlist = np.array([])
        for astr in authorlist:
            alist = np.array(astr.split(','))
            for i,a in enumerate(alist):
                for s in remove_str_list:
                    a = re.sub(s,'',a)
                alist.flat[i]=a
            newauthorlist = np.append(newauthorlist,alist)
                
    return newauthorlist       

In [9]:
#dataframe of all books (no duplicates)
dfbook = dftrain[['Author','Title','Publisher','ISBN']]
dfbook = dfbook[~dfbook.duplicated()]
dfbook = dfbook.reset_index()
dfbook.drop('index', axis=1, inplace=True)

In [10]:
author_entire_list = np.array([])
for a in author_train:
    author_entire_list = np.append(author_entire_list,author_to_list(a))
author_entire_list = np.unique(author_entire_list)
author_entire_list.shape

(2253,)

In [11]:
author_entire_list_dict = {i:a for i,a in enumerate(author_entire_list)}

In [12]:
#Make binary author lists: Nbooks X Nauthors matrix
#Use author_entire_list for index-author matching
Xauthor = np.zeros((dfbook.index.shape[0],author_entire_list.shape[0]))
for ind in dfbook.index:
    alist = author_to_list(dfbook.iloc[ind]['Author'])
    for a in alist:
        aind = np.where(author_entire_list==a)[0][0]
        Xauthor[ind,aind] = 1

In [13]:
np.savetxt('Xauthor_Sel.txt',Xauthor)

In [14]:
#Make binary publisher lists: Nbooks X Npublihser matrix
#use publisher_train or publisher_train_dict for index matching
Xpublisher = np.zeros((dfbook.index.shape[0],publisher_train.shape[0]))
for ind in dfbook.index:
    pub = dfbook.iloc[ind]['Publisher']
    pind = np.where(publisher_train==pub)[0][0]
    Xpublisher[ind,pind] = 1

In [15]:
np.savetxt('Xpublisher_Sel.txt',Xpublisher)

In [17]:
%%time
#Make similarity matrix for titles: Nbooks X Nbooks matrix
#Use dfbooks for index-title matching
Nbooks =dfbook.index.shape[0]
simmat_title = np.zeros((Nbooks,Nbooks))
for i in range(Nbooks):
    simmat_title[i,i] = 1.0
    for j in range(i+1,Nbooks):
        simmat_title[i,j] = similar(dfbook.iloc[i]['Title'],dfbook.iloc[j]['Title'])
        simmat_title[j,i] = simmat_title[i,j]


CPU times: user 26min 25s, sys: 27.9 s, total: 26min 53s
Wall time: 26min 53s


In [18]:
np.savetxt('simmat_title_Sel.txt',simmat_title)

In [19]:
simmat_author = Xauthor.dot(Xauthor.T)
simmat_publisher = Xpublisher.dot(Xpublisher.T)

In [20]:
simmat = 0.5*simmat_title+0.4*simmat_author+0.1*simmat_publisher

In [21]:
def recommend_based_on_similarity(dftrain,userid,Nrec,simmat,dfbook=dfbook):
    #recommend Nrec books similar to the books that the chosen user has bought so far
    #input: 
    # dftrain: training data
    # userid: id of the user
    # Nrec: Number of books to be recommended
    # dfbook: the dataframes of books
    # simmat: Nbooks X Nbooks similarity matrix
    usertrans = dftrain[dftrain['ID']==userid]  #user transaction
    usertrans = usertrans[['Author','Title','Publisher','ISBN']]
    usertrans = usertrans[~usertrans.duplicated()]
    excludeind = np.array([])
    simvec = np.zeros(simmat.shape[1])
    for ind in usertrans.index:
        bookind = dfbook[(dfbook['Title']==usertrans[usertrans.index == ind]['Title'].values[0]) 
                         & (dfbook['Publisher']==usertrans[usertrans.index == ind]['Publisher'].values[0])].index[0]
        simvec = simvec+simmat[bookind,:]
        excludeind = np.append(excludeind,[bookind])
    simvec = simvec/usertrans.index.shape[0]
    dfsimvec = dfbook
    dfsimvec['Similarity'] = simvec

    dfsimvec = dfsimvec.iloc[filter(lambda x: x not in excludeind,dfsimvec.index)]
    return dfsimvec.sort('Similarity',ascending=False)[:Nrec]
    


In [22]:
recommend_based_on_similarity(dftrain,dftest['ID'].values[10],3,simmat)

Unnamed: 0,Author,Title,Publisher,ISBN,Similarity
1400,<이지성> 저,[염가한정판매] 리딩으로 리드하라,문학동네,9788955000000.0,0.242798
1818,<샘 혼> 저/<이상원> 역,함부로 말하는 사람과 대화하는 법,갈매나무,9788994000000.0,0.238239
19,<샘 혼> 저/<이상원> 역,적을 만들지 않는 대화법,갈매나무,9788996000000.0,0.237193


In [30]:
def predict_percentage(dftrain,dftest,Nrec,weightdict,simmat_author=simmat_author,
                       simmat_title=simmat_title,simmat_publisher=simmat_publisher,dfbook=dfbook):
    simmat = weightdict['Author']*simmat_author+weightdict['Title']*simmat_title
    +weightdict['Publisher']*simmat_publisher
    testusers = np.unique(dftest['ID'].values)
    totalbooks = 0
    totalsuccess = 0
    for user in testusers:
        dfrec = recommend_based_on_similarity(dftrain,user,Nrec,simmat,dfbook=dfbook)
        dftestuser = dftest[dftest['ID']==user]
        totalbooks = totalbooks+Nrec
        for i in range(dfrec.shape[0]):
            if np.any((dftestuser['Title']==dfrec.iloc[i]['Title']) 
                & (dftestuser['Publisher']==dfrec.iloc[i]['Publisher'])):
                totalsuccess=totalsuccess+1
    return float(totalsuccess)/float(totalbooks)
            
        

In [24]:
%%time
weightdict = {'Author':0.4,'Title':0.5,'Publisher':0.1}
print predict_percentage(dftrain,dfval,4,weightdict)

0.0270100502513
CPU times: user 16.1 s, sys: 129 ms, total: 16.2 s
Wall time: 16.5 s


In [25]:
%%time
stepsize = 0.05

Aws = np.arange(0,1,stepsize)

accuracy_list = []
for A in Aws:
    Tws = np.arange(0,1-A,stepsize)
    for T in Tws:
        P = 1-A-T
        weightdict = {'Author':A,'Publisher':P,'Title':T}
        percentage = predict_percentage(dftrain,dfval,4,weightdict)
        accuracy = weightdict
        accuracy['accuracy'] = percentage
        accuracy_list = accuracy_list+[accuracy]
        

CPU times: user 1h 7min, sys: 27.6 s, total: 1h 7min 28s
Wall time: 1h 14min 42s


In [26]:
import json
with open('accuracy_sel.json', 'w') as fp:
    json.dump(accuracy_list, fp)


In [27]:
accuracy_df = pd.DataFrame(accuracy_list)
accuracy_df = accuracy_df.sort('accuracy',ascending=False)
accuracy_df

Unnamed: 0,Author,Publisher,Title,accuracy
122,0.35,0.50,0.15,0.028266
179,0.60,0.15,0.25,0.028266
177,0.60,0.25,0.15,0.028266
176,0.60,0.30,0.10,0.028266
175,0.60,0.35,0.05,0.028266
108,0.30,0.55,0.15,0.028266
40,0.10,0.85,0.05,0.028266
107,0.30,0.60,0.10,0.028266
106,0.30,0.65,0.05,0.028266
170,0.55,0.20,0.25,0.028266


In [28]:
bestweightdict = dict(accuracy_df[accuracy_df['accuracy']==accuracy_df['accuracy'].max()].mean())
print bestweightdict

{'Publisher': 0.33157894736842108, 'Title': 0.12894736842105259, 'accuracy': 0.028266331658291455, 'Author': 0.53947368421052611}


In [33]:
#merge train and validation set
dftrainandval = pd.concat([dftrain,dfval])
dftrainandval = dftrainandval.reset_index()
dftrainandval.drop('index', axis=1, inplace=True)

#dataframe of all books in train and val data(no duplicates)
dfbookall = dftrainandval[['Author','Title','Publisher','ISBN']]
dfbookall = dfbookall[~dfbookall.duplicated()]
dfbookall = dfbookall.reset_index()
dfbookall.drop('index', axis=1, inplace=True)


In [34]:
author_entire_list_all = np.array([])
for a in np.unique(dfbookall['Author']):
    author_entire_list_all = np.append(author_entire_list_all,author_to_list(a))
author_entire_list_all = np.unique(author_entire_list_all)
author_entire_list_all.shape

(2351,)

In [35]:
#make new simmat for merged dataframe
#Make binary author lists: Nbooks X Nauthors matrix
#Use author_entire_list for index-author matching
Xauthor_all = np.zeros((dfbookall.index.shape[0],author_entire_list_all.shape[0]))
for ind in range(dfbookall.shape[0]):
    alist = author_to_list(dfbookall.iloc[ind]['Author'])
    for a in alist:
        aind = np.where(author_entire_list_all==a)[0][0]
        Xauthor_all[ind,aind] = 1
        
#Make binary publisher lists: Nbooks X Npublihser matrix
#use publisher_train or publisher_train_dict for index matching
publisher_list_all = np.unique(dfbookall['Publisher'])
Xpublisher_all = np.zeros((dfbookall.index.shape[0],publisher_list_all.shape[0]))
for ind in range(dfbookall.shape[0]):
    pub = dfbookall.iloc[ind]['Publisher']
    pind = np.where(publisher_list_all==pub)[0][0]
    Xpublisher_all[ind,pind] = 1

In [36]:
%%time
#Update similarity matrix for titles:
Nbooks_all =dfbookall.index.shape[0]
simmat_title_all = np.zeros((Nbooks_all,Nbooks_all))
Nbooks = dfbook.index.shape[0]
for i in range(Nbooks,Nbooks_all):
    for j in range(Nbooks):
        simmat_title_all[i,j] = similar(dfbookall.iloc[i]['Title'],dfbookall.iloc[j]['Title'])
        simmat_title_all[j,i] = simmat_title_all[i,j]
        
for i in range(Nbooks,Nbooks_all):
    simmat_title_all[i,i] = 1.0
    for j in range(i+1,Nbooks_all):
        simmat_title_all[i,j] = similar(dfbookall.iloc[i]['Title'],dfbookall.iloc[j]['Title'])
        simmat_title_all[j,i] = simmat_title_all[i,j]


CPU times: user 2min 45s, sys: 2.45 s, total: 2min 48s
Wall time: 2min 51s


In [37]:
simmat_author_all = Xauthor_all.dot(Xauthor_all.T)
simmat_publisher_all = Xpublisher_all.dot(Xpublisher_all.T)

In [38]:
#Estimate Test error
testerror = predict_percentage(dftrainandval,dftest,3,bestweightdict,
                               simmat_author=simmat_author_all,simmat_title=simmat_title_all,
                               simmat_publisher=simmat_publisher_all,dfbook=dfbookall)
print testerror

0.0242881072027


In [None]:
predict_percentage(dftrain,dftest,3,bestweightdict)

0.023450586264656615