In [1]:
import numpy as np
import pandas as pd
import psycopg2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from scipy.spatial.distance import cosine

In [2]:
import pyspark as ps
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.sql import SQLContext

In [4]:
# connect to database
conn = psycopg2.connect(dbname='thingiscrape', user='robert', host='/tmp')
c = conn.cursor()

In [5]:
# query database
num_records = 1000
SQL = "SELECT DISTINCT * FROM thingi_items ORDER BY item_id LIMIT({})".format(num_records)
c.execute(SQL)
full_df = pd.DataFrame(c.fetchall(), columns = ['item_id','item_name','description','username'])

In [6]:
full_df['docs'] = full_df.pop('item_name') + ' ' + full_df.pop('description')

In [7]:
full_df.head()

Unnamed: 0,item_id,username,docs
0,2,replicator,"MakerBot Screwdriver It's a small screwdriver,..."
1,3,replicator,"MakerBot Plate It's an oblong sheet, about fou..."
2,5,replicator,MakerBot Hook It's a double-ended hook. Rather...
3,7,phooky,1st stellation of a dodecahedron This is a slo...
4,8,replicator,"MakerBot Knob This is a small, slightly blocky..."


In [8]:
sc = SparkContext('local[4]')
sqlContext = SQLContext(sc)

In [9]:
docs_rdd = sc.parallelize(full_df['docs'])

In [10]:
docs_rdd.take(2)

["MakerBot Screwdriver It's a small screwdriver, extruded as a single piece. Probably no good for heavy jobs, but still handy (Did you know that the Phillips-head screwdriver was named after the legendary Crafty King Phillip of ancient times, whose sons went on to invent the bandsaw and the pre-sliced banana? Although, now that you look, this screwdriver is a Flathead. Never mind.) ",
 "MakerBot Plate It's an oblong sheet, about four inches long, slightly flared at the ends. You're not sure what its purpose is. One face is decorated with strange ridges, and the whole is pierced by several square holes, irregularly placed Attractive, though, isn't it? Sort of like a butterfly. "]

In [11]:
hashingTF = HashingTF(numFeatures=500)
tf = hashingTF.transform(docs_rdd)

idf = IDF().fit(tf)

In [12]:
tfidf = idf.transform(tf)

In [13]:
tfidf.take(2)

[SparseVector(500, {1: 0.5523, 14: 0.6492, 24: 0.6685, 31: 1.0887, 41: 1.4024, 44: 0.5541, 81: 0.643, 84: 1.1364, 104: 0.2482, 114: 0.7057, 121: 0.1924, 144: 0.7246, 154: 0.5597, 161: 0.069, 181: 0.8084, 194: 0.613, 201: 0.6646, 211: 0.445, 231: 0.9459, 234: 0.1142, 254: 2.0489, 271: 0.243, 274: 1.1341, 284: 0.8877, 294: 0.7646, 301: 1.9361, 311: 0.8829, 314: 1.6587, 321: 0.0, 324: 1.5551, 344: 0.052, 351: 0.1002, 374: 2.2936, 384: 0.2386, 391: 0.0841, 401: 0.9792, 411: 0.7885, 424: 0.04, 431: 1.439, 441: 1.8531, 451: 1.1983, 454: 0.5509, 464: 0.184}),
 SparseVector(500, {1: 0.2599, 4: 1.5428, 14: 1.9475, 24: 0.6685, 41: 0.9641, 44: 0.9973, 81: 0.6001, 84: 0.6494, 104: 0.2482, 114: 0.6049, 121: 0.1924, 144: 0.7246, 154: 0.7835, 161: 0.072, 194: 0.4087, 211: 0.445, 231: 0.6306, 234: 0.0902, 254: 2.0489, 264: 1.4492, 271: 0.3037, 274: 1.0147, 301: 0.6454, 314: 0.4976, 321: 0.0, 344: 0.034, 351: 0.0501, 384: 0.2105, 391: 0.0721, 411: 0.7885, 414: 1.4198, 424: 0.028, 431: 0.4111, 441: 2.22

In [14]:
type(tfidf.take(1)[0])

pyspark.mllib.linalg.SparseVector

In [15]:
# transform names and descriptions
# tfidfvect = TfidfVectorizer(stop_words='english', max_features = 1000)
# documents_vec = full_df['item_name'] + ' ' + full_df['description']
# vec_X = tfidfvect.fit_transform(documents_vec).toarray()
# del documents_vec

In [16]:
def get_top_users_and_parts(user_ind, username):
    # df_user_removed = df.drop(user_ind, axis = 0, inplace = False)
    # X_user_removed = np.delete(X, user_ind, axis = 0)
    n_items = 5

    while True:
        similar_users = []
        similar_parts = []
        base_ids = full_df.iloc[user_ind]['item_id']
        
        for i in user_ind:
            # base_item = vec_X[i].reshape(1, -1)
            base_item_id = full_df.iloc[i]['item_id']
            # distances_vector = np.apply_along_axis(lambda x: cosine(x,base_item), 1, vec_X)
            # distances_vector = linear_kernel(base_item, vec_X)
            distances_vector = linear_kernel(vec_X[i:i+1], vec_X).flatten()
            similar_indices = np.argsort(distances_vector)[::-1]
            similar_items = full_df.iloc[np.ravel(similar_indices[:n_items])]

            similar_users.extend(list(similar_items['username']))
            similar_parts.extend(list(similar_items['item_id']))
        
        for item_id in base_ids:
            if item_id in similar_parts:
                similar_parts.remove(item_id)

        similar_users_set = set(similar_users)
        similar_parts_set = set(similar_parts)

        similar_users_set.discard(username)

        if len(similar_users_set) >= 5:
            break
        else:
            n_items += 2

    top_similar_users = sorted(similar_users_set, key = lambda x: similar_users.count(x), reverse = True)[:5]
    top_similar_parts = sorted(similar_parts_set, key = lambda x: similar_parts.count(x), reverse = True)[:5]
    return top_similar_users, top_similar_parts

In [17]:
username = 'phooky'

rec_start = time.time()
print "recommending!!!"
# username = str(request.form['user_input'])
user_ind = np.ravel(np.argwhere(full_df['username']==username))
print user_ind
users, parts = get_top_users_and_parts(user_ind, username)
rec_end = time.time()
print "time to recommend: {}".format(rec_end-rec_start)

NameError: name 'time' is not defined