In [26]:
import os
import sys

import pandas as pd
import numpy as np

from lxml import html
import requests

In [2]:
# Relative directory used for the CSV files read and written through os.path.join below.
DATA_DIR = 'data/'

def getPaper(id):
    """Fetch the title and abstract of a CiteULike article.

    Downloads ``http://www.citeulike.org/article-posts/<id>`` and extracts the
    text of the first ``<h1>`` (title) and of the first ``<p>`` inside the
    ``abstract-body`` blockquote (abstract).

    Parameters
    ----------
    id : int or str
        Article id; it is converted with ``str()`` and appended to the URL.

    Returns
    -------
    dict
        ``{'title': ..., 'abstract': ...}``. A field is ``''`` when the page
        contains no matching element.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or does not answer within the timeout.

    Notes
    -----
    The HTTP status code is not checked; whatever HTML comes back is parsed.
    """
    # A timeout keeps one unresponsive request from hanging the whole scrape.
    page = requests.get('http://www.citeulike.org/article-posts/' + str(id), timeout=30)
    tree = html.fromstring(page.content)

    # Run each XPath query once and reuse the result for both the emptiness
    # check and the value.
    title_nodes = tree.xpath('//h1/text()')
    abstract_nodes = tree.xpath('//*[@id="abstract-body"]/blockquote/p/text()')

    title = title_nodes[0] if title_nodes else ''
    abstract = abstract_nodes[0] if abstract_nodes else ''
    return {'title': title, 'abstract': abstract}

In [9]:
# Load the raw CSV; its first line is used as the header row.
csv_path = os.path.join(DATA_DIR, 'citeulike_5k.csv')
raw_data = pd.read_csv(csv_path, header=0, sep=",")

# Remove the columns at positions 2 and 3 ('tag used' and 'datetime posted').
unwanted_columns = raw_data.columns[[2, 3]]
raw_data = raw_data.drop(unwanted_columns, axis=1)

In [10]:
# Show the first few rows of raw_data as a quick sanity check.
raw_data.head()

Unnamed: 0,Id,Username
0,42,61baaeba8de136d9c1aa9c18ec3860e8
1,42,61baaeba8de136d9c1aa9c18ec3860e8
2,42,61baaeba8de136d9c1aa9c18ec3860e8
3,42,61baaeba8de136d9c1aa9c18ec3860e8
4,43,61baaeba8de136d9c1aa9c18ec3860e8


In [11]:
# Initially we won't worry about the "tags" feature.
# The data is laid out with a new row for each tag a user added to a paper,
# so rows referring to the same user and item are collapsed into one row.
#
# `duplicated` flags every repeat of an (Id, Username) pair after its first
# occurrence, wherever the repeat sits in the frame, in a single vectorised pass.
is_repeat = raw_data.duplicated(subset=['Id', 'Username'])

# Positional indices of the rows that get removed.
drops = np.flatnonzero(is_repeat.values).tolist()

filtered_data = raw_data.drop(raw_data.index[drops])
filtered_data.head()

Unnamed: 0,Id,Username
0,42,61baaeba8de136d9c1aa9c18ec3860e8
4,43,61baaeba8de136d9c1aa9c18ec3860e8
7,44,61baaeba8de136d9c1aa9c18ec3860e8
11,45,61baaeba8de136d9c1aa9c18ec3860e8
14,46,61baaeba8de136d9c1aa9c18ec3860e8


In [12]:
# Store the ids with object dtype, then summarise the column.
id_as_object = filtered_data['Id'].astype(object)
filtered_data['Id'] = id_as_object
filtered_data['Id'].describe()

count     2201
unique    1950
top       1223
freq         8
Name: Id, dtype: int64

In [13]:
# Summarise the Username column (count, unique, top, freq).
filtered_data['Username'].describe()

count                                 2201
unique                                 310
top       071632f8c267c2efc3a84ce0a61bb9b1
freq                                   235
Name: Username, dtype: object

In [14]:
# Filter out users with fewer than the minimum number of items in their library.
# The threshold has its own name so that the builtin `min` is not shadowed.
MIN_ITEMS_PER_USER = 10

# Number of rows each user has in filtered_data.
items_per_user = filtered_data['Username'].value_counts()

# `>=` keeps users who have exactly the minimum; only users with fewer are dropped.
good_users = items_per_user[items_per_user >= MIN_ITEMS_PER_USER].index.tolist()

filtered_data = filtered_data[filtered_data['Username'].isin(good_users)].reset_index(drop=True)


In [15]:
# Summarise every column of filtered_data after the user filter.
filtered_data.describe()

Unnamed: 0,Id,Username
count,1524,1524
unique,1389,39
top,3247,071632f8c267c2efc3a84ce0a61bb9b1
freq,5,235


In [16]:
# TODO: split into training/validation and test sets.

# Preview the first rows only instead of rendering the whole frame.
filtered_data.head(10)

Unnamed: 0,Id,Username
0,42,61baaeba8de136d9c1aa9c18ec3860e8
1,43,61baaeba8de136d9c1aa9c18ec3860e8
2,44,61baaeba8de136d9c1aa9c18ec3860e8
3,45,61baaeba8de136d9c1aa9c18ec3860e8
4,46,61baaeba8de136d9c1aa9c18ec3860e8
5,47,61baaeba8de136d9c1aa9c18ec3860e8
6,48,61baaeba8de136d9c1aa9c18ec3860e8
7,49,61baaeba8de136d9c1aa9c18ec3860e8
8,50,61baaeba8de136d9c1aa9c18ec3860e8
9,51,61baaeba8de136d9c1aa9c18ec3860e8


In [17]:
# Generate a lookup table of document words: item id -> UTF-8 encoded
# "title abstract" text fetched with getPaper.
item_words = {}

# Each distinct id is fetched once; unique() keeps the order of first appearance.
unique_item_ids = filtered_data['Id'].unique()
n_items = len(unique_item_ids)

for position, item_id in enumerate(unique_item_ids):
    # Float arithmetic so the percentage is not truncated to 0 by integer
    # division; "\r" rewrites the same output line on each iteration.
    sys.stdout.write("%.1f percent complete \r" % (100.0 * position / n_items))
    sys.stdout.flush()

    paper = getPaper(item_id)
    # Join with a space so the last word of the title and the first word of
    # the abstract stay separate tokens; empty parts are skipped.
    text = ' '.join(part for part in (paper['title'], paper['abstract']) if part)
    item_words[item_id] = text.encode("utf-8")



In [18]:
# Export the document words to CSV; the dictionary keys become the row index.
words_frame = pd.DataFrame.from_dict(item_words, orient="index")
words_frame.to_csv('item_words.csv')

In [19]:
# Convert user and item ids into matrix indices.
# factorize numbers the distinct values 0, 1, 2, ... in order of first
# appearance and encodes the whole column in one call.
# NOTE: a missing value would be encoded as -1 by factorize.
user_codes, user_labels = pd.factorize(filtered_data['Username'])
item_codes, item_labels = pd.factorize(filtered_data['Id'])

# Original value -> matrix index lookups.
u_map = dict(zip(user_labels, range(len(user_labels))))
i_map = dict(zip(item_labels, range(len(item_labels))))

# Next unused index on each axis, i.e. the number of distinct users / items.
u_index = len(u_map)
i_index = len(i_map)

# Replace the original identifiers with their indices.
filtered_data['Username'] = user_codes
filtered_data['Id'] = item_codes

In [21]:
# Rename by column name rather than by position, so the labels cannot end up
# swapped if the column order ever changes.
filtered_data = filtered_data.rename(columns={'Id': 'iid', 'Username': 'uid'})
filtered_data.head()

Unnamed: 0,iid,uid
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [22]:
# Write filtered_data to CSV without the row index.
ratings_path = os.path.join(DATA_DIR, 'user_ratings.csv')
filtered_data.to_csv(ratings_path, index=False)

In [39]:
# Spot check: the 'iid' value stored in the row labelled 0.
filtered_data.loc[0, 'iid']

0

In [50]:
# Spot check: the largest value in the 'uid' column.
max(filtered_data['uid'])

38

In [46]:
# Read the exported file back as a round-trip check; preview the first rows
# only instead of rendering the whole frame.
pd.read_csv(os.path.join(DATA_DIR, 'user_ratings.csv')).head(10)

Unnamed: 0,iid,uid
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
5,5,0
6,6,0
7,7,0
8,8,0
9,9,0
