In [0]:
import pandas as pd
import numpy as np
import re
import time
from datasketch import MinHash, MinHashLSHForest

# DATA PREPROCESSING

In [0]:
#Preprocess strips punctuation, lowercases the text, and splits it into individual tokens/shingles on whitespace.
def preprocess(text):
    """Turn a string into a list of lowercase word tokens (shingles).

    Every character that is neither a word character nor whitespace is
    removed, the remainder is lowercased, and the result is split on runs
    of whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens; an empty list when nothing but punctuation and
        whitespace is present.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower().split()

In [7]:
# Quick sanity check: run preprocess on a sample sentence and show the tokens.
text = 'The devil went down to Georgia'
print('The shingles (tokens) are:', preprocess(text))

The shingles (tokens) are: ['the', 'devil', 'went', 'down', 'to', 'georgia']


In [0]:
#CHOOSE YOUR PARAMETERS

#Number of permutations (num_perm) used for the MinHash signatures and the LSH
#forest; the same value is passed to both create_forest and predict below.
permutations = 128

#Number of recommendations to return per query (passed to predict as num_results).
#Note: the query cell further down reassigns this to 5 before calling predict.
num_recommendations = 1

# CREATING MINHASH FOREST

In [0]:
def create_forest(data, perms):
  """Build and index a MinHash LSH forest over the documents in data['text'].

  Each document is tokenized with preprocess, hashed into a MinHash
  signature with `perms` permutations, and stored in the forest under its
  0-based position in iteration order (the same positions predict later
  hands to `iloc`). The elapsed wall-clock time is printed.

  Args:
    data: Object whose 'text' entry iterates over document strings
      (the notebook passes a pandas DataFrame).
    perms: Number of permutations for the signatures and the forest.

  Returns:
    The indexed MinHashLSHForest, ready to be queried.
  """
  started = time.time()

  forest = MinHashLSHForest(num_perm=perms)

  for position, document in enumerate(data['text']):
    signature = MinHash(num_perm=perms)
    for shingle in preprocess(document):
      signature.update(shingle.encode('utf-8'))
    forest.add(position, signature)

  # The forest is indexed once, after all signatures have been added.
  forest.index()
  print('It took %s seconds to build forest.' %(time.time()-started))

  return forest

# Evaluating forest

In [0]:
def predict(text, database, perms, num_results, forest):
  """Look up the titles of the documents the forest returns for `text`.

  The query text is tokenized and hashed the same way the indexed
  documents were, then the forest is asked for up to `num_results` keys.
  Those keys are used as row positions into `database`.

  Args:
    text: Query string.
    database: pandas DataFrame with a 'title' column, in the same row
      order that was used to build the forest.
    perms: Number of permutations; passed to MinHash for the query signature.
    num_results: Maximum number of results requested from the forest.
    forest: An indexed MinHashLSHForest.

  Returns:
    A pandas Series of titles for the returned positions, or None when
    the forest returns no keys (in which case no timing line is printed).
  """
  began = time.time()

  query_signature = MinHash(num_perm=perms)
  for shingle in preprocess(text):
    query_signature.update(shingle.encode('utf8'))

  positions = np.array(forest.query(query_signature, num_results))

  # Nothing came back from the forest: signal "no recommendation" with None.
  if not len(positions):
    return None

  titles = database.iloc[positions]['title']

  print('It took %s seconds to query forest.' %(time.time()-began))

  return titles

# Testing Our Recommendation Engine on NIPS Conference Papers

In [0]:
# Colab-only setup: fetch the NIPS papers dataset through the Kaggle CLI.
!pip install kaggle
from google.colab import files
# Opens an upload prompt; the lines below expect a file named kaggle.json
# (the Kaggle API credentials file) to be uploaded here.
files.upload()
# Copy the credentials into ~/.kaggle/ and restrict them to the owner (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
# Download the dataset archive; the next cell unzips it as nips-papers.zip.
!kaggle datasets download -d benhamner/nips-papers

In [0]:
!unzip nips-papers.zip

In [22]:
df = pd.read_csv("papers.csv")
# A missing title or abstract is NaN in pandas, and NaN + str stays NaN; that
# float would make preprocess fail (re.sub requires a string) while building
# the forest. Treat missing fields as empty strings so every row has text.
df['text'] = df['title'].fillna('') + " " + df['abstract'].fillna('')
forest = create_forest(df, permutations)

It took 13.716139793395996 seconds to build forest.


In [29]:
# Ask for the five nearest papers to a short free-text query.
num_recommendations = 5
title = 'Artificial Network'
result = predict(
    text=title,
    database=df,
    perms=permutations,
    num_results=num_recommendations,
    forest=forest,
)
# result is None when the forest returned no keys.
print('\n Top Recommendation(s) is(are) \n', result)

It took 0.007211923599243164 seconds to query forest.

 Top Recommendation(s) is(are) 
 5443    A Generalization of Submodular Cover via the D...
4485                                   Compete to Compute
6794        Do Deep Neural Networks Suffer from Crowding?
5365    Inferring Algorithmic Patterns with Stack-Augm...
5918    Theoretical Comparisons of Positive-Unlabeled ...
Name: title, dtype: object
