In [1]:
import time
from collections import Counter

import numpy as np
import pandas as pd
import psycopg2
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
import pyspark as ps
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.sql import SQLContext

In [3]:
# Open a psycopg2 connection to the `thingiscrape` database and create the
# cursor that the query cell executes against.
conn = psycopg2.connect(
    host='/tmp',
    user='robert',
    dbname='thingiscrape',
)
c = conn.cursor()

In [4]:
# Fetch up to 1000 distinct rows of thingi_items, ordered by item_id, and
# label the four result columns positionally.
SQL = "SELECT DISTINCT * FROM thingi_items ORDER BY item_id LIMIT(1000)"
c.execute(SQL)
full_df = pd.DataFrame(
    data=c.fetchall(),
    columns=['item_id', 'item_name', 'description', 'username'],
)

In [5]:
# Merge each item's name and description into one text document per row.
# Guarded so that re-running the cell is a no-op: the pop() calls remove the
# source columns, so an unguarded second run raises KeyError.
if 'docs' not in full_df.columns:
    # Missing names/descriptions become '' so every document is a real string
    # rather than NaN, which the tokenizing/vectorizing steps cannot process.
    full_df['docs'] = (full_df.pop('item_name').fillna('')
                       + ' '
                       + full_df.pop('description').fillna(''))

In [6]:
# Preview the first five rows of the combined frame.
full_df.head(n=5)

Unnamed: 0,item_id,username,docs
0,2,replicator,"MakerBot Screwdriver It's a small screwdriver,..."
1,3,replicator,"MakerBot Plate It's an oblong sheet, about fou..."
2,5,replicator,MakerBot Hook It's a double-ended hook. Rather...
3,7,phooky,1st stellation of a dodecahedron This is a slo...
4,8,replicator,"MakerBot Knob This is a small, slightly blocky..."


In [7]:
# Create the Spark context on the 'local[4]' master, then wrap it in a
# SQLContext.
sc = SparkContext(master='local[4]')
sqlContext = SQLContext(sc)

In [8]:
# Distribute the per-item documents as an RDD with one element per row.
docs_rdd = sc.parallelize(full_df['docs'].tolist())

In [9]:
# HashingTF treats each document as a sequence of terms, so the raw strings
# must be tokenized first; hashing a bare string would count its individual
# characters instead of its words.
hashingTF = HashingTF(numFeatures=1000)
tf = hashingTF.transform(docs_rdd.map(lambda doc: doc.lower().split()))
# tf is read twice (IDF fit here, IDF transform afterwards), so keep it cached.
tf.cache()

idf = IDF().fit(tf)

In [10]:
# Exploratory: print the documentation of the fitted IDF model object to
# check how its transform() method is meant to be called.
help(idf)

Help on IDFModel in module pyspark.mllib.feature object:

class IDFModel(JavaVectorTransformer)
 |  Represents an IDF model that can transform term frequency vectors.
 |  
 |  .. versionadded:: 1.2.0
 |  
 |  Method resolution order:
 |      IDFModel
 |      JavaVectorTransformer
 |      pyspark.mllib.common.JavaModelWrapper
 |      VectorTransformer
 |      __builtin__.object
 |  
 |  Methods defined here:
 |  
 |  idf(self)
 |      Returns the current IDF vector.
 |      
 |      .. versionadded:: 1.4.0
 |  
 |  transform(self, x)
 |      Transforms term frequency (TF) vectors to TF-IDF vectors.
 |      
 |      If `minDocFreq` was set for the IDF calculation,
 |      the terms which occur in fewer than `minDocFreq`
 |      documents will have an entry of 0.
 |      
 |      Note: In Python, transform cannot currently be used within
 |            an RDD transformation or action.
 |            Call transform directly on the RDD instead.
 |      
 |      :param x: an RDD of term freque

In [None]:
# Re-weight the term-frequency vectors with the fitted IDF model. transform()
# is called directly on the whole RDD, which is what the IDFModel help text
# says is required in Python.
tfidf = idf.transform(tf)

In [None]:
# Build the dense TF-IDF matrix that get_top_users_and_parts reads as `vec_X`.
# Row k of vec_X is the vector for row k of full_df (same positional order).
# The combined `docs` column is vectorized because the separate item_name and
# description columns were popped when `docs` was created.
tfidfvect = TfidfVectorizer(stop_words='english', max_features=1000)
vec_X = tfidfvect.fit_transform(full_df['docs']).toarray()

In [None]:
def _rank_by_hits(hits, limit):
    """Return up to `limit` distinct values of `hits`, most frequent first.

    Ties are broken by first appearance in `hits`, so the result is
    deterministic regardless of hashing or dict ordering.
    """
    counts = Counter(hits)
    first_seen = {}
    for pos, key in enumerate(hits):
        first_seen.setdefault(key, pos)
    return sorted(counts, key=lambda k: (-counts[k], first_seen[k]))[:limit]


def get_top_users_and_parts(user_ind, username):
    """Recommend up to five similar users and five similar parts for a user.

    For every item owned by the user, the items with the highest
    linear-kernel similarity to it are collected.  The neighbourhood starts
    at 5 items per owned item and is widened by 2 until at least five
    distinct other users appear, or until it already spans every row of
    ``full_df`` (at which point widening cannot find anyone new).

    Reads the module-level ``full_df`` (columns ``item_id`` and ``username``)
    and ``vec_X`` (one feature row per ``full_df`` row, in the same order).

    :param user_ind: positional row indices in ``full_df`` of the user's items.
    :param username: the user's own name; never returned as a recommendation.
    :return: ``(top_similar_users, top_similar_parts)`` -- two lists with at
        most five entries each, most frequently matched first.  Both lists
        are empty when ``user_ind`` is empty.
    """
    n_top = 5

    if len(user_ind) == 0:
        # Unknown user or user without items: nothing to compare against.
        return [], []

    own_ids = set(full_df.iloc[user_ind]['item_id'])
    n_rows = len(full_df)

    # The similarity ranking of each owned item does not depend on the
    # neighbourhood size, so compute it once rather than on every widening.
    rankings = []
    for i in user_ind:
        scores = linear_kernel(vec_X[i:i + 1], vec_X).flatten()
        rankings.append(np.argsort(scores)[::-1].copy())

    n_items = 5
    while True:
        similar_users = []
        similar_parts = []
        for ranking in rankings:
            neighbours = full_df.iloc[ranking[:n_items]]
            # Drop the user and *every* occurrence of the user's own items,
            # so they can never be recommended back.
            similar_users.extend(u for u in neighbours['username'] if u != username)
            similar_parts.extend(p for p in neighbours['item_id'] if p not in own_ids)

        # Stop once enough distinct users were found, or when the
        # neighbourhood already covers every row and widening cannot help.
        if len(set(similar_users)) >= n_top or n_items >= n_rows:
            break
        n_items += 2

    top_similar_users = _rank_by_hits(similar_users, n_top)
    top_similar_parts = _rank_by_hits(similar_parts, n_top)
    return top_similar_users, top_similar_parts

In [None]:
# Time one recommendation run for a sample user.
username = 'phooky'

rec_start = time.time()
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("recommending!!!")
# username = str(request.form['user_input'])
# Positional row indices of every item uploaded by `username`; .values hands
# NumPy a plain boolean array instead of a pandas Series.
user_ind = np.flatnonzero((full_df['username'] == username).values)
print(user_ind)
users, parts = get_top_users_and_parts(user_ind, username)
rec_end = time.time()
print("time to recommend: {}".format(rec_end - rec_start))