<a href="https://colab.research.google.com/github/sololeveler/cmpe255-assignment-ann/blob/master/Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import zipfile
import csv

import requests


def _download(url: str, dest_path: str, timeout: float = 60.0):
    """Stream the resource at ``url`` into the file ``dest_path``.

    Args:
        url: Address fetched with an HTTP GET request.
        dest_path: Path of the file that is created (or overwritten) with the
            response body.
        timeout: Seconds handed to ``requests.get`` so a stalled server cannot
            hang the notebook indefinitely.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` for error responses.
    """
    # Using the response as a context manager releases the connection even
    # when writing to disk fails part-way through.
    with requests.get(url, stream=True, timeout=timeout) as req:
        req.raise_for_status()

        with open(dest_path, "wb") as fd:
            # Write in 1 MiB chunks so the archive is never held in memory whole.
            for chunk in req.iter_content(chunk_size=2 ** 20):
                fd.write(chunk)


def get_data():
    """Return CSV readers over the Book-Crossing ratings and books files.

    The zip archive is downloaded to ``data/data.zip`` when that file is not
    present yet. The check is made on the archive itself (not on the ``data``
    directory), so a run that created the directory but failed to download is
    retried on the next call.

    Returns:
        A 2-tuple ``(ratings_reader, books_reader)`` of ``csv.DictReader``
        objects reading ``BX-Book-Ratings.csv`` and ``BX-Books.csv`` with ``;``
        as delimiter. Both are one-shot iterators; call ``get_data()`` again
        for a fresh pass.
    """
    ratings_url = ("http://www2.informatik.uni-freiburg.de/" "~cziegler/BX/BX-CSV-Dump.zip")
    archive_path = "data/data.zip"

    os.makedirs("data", exist_ok=True)

    if not os.path.exists(archive_path):
        # Download under a temporary name and rename afterwards, so that an
        # interrupted transfer never leaves a truncated data.zip behind.
        partial_path = archive_path + ".part"
        _download(ratings_url, partial_path)
        os.replace(partial_path, archive_path)

    with zipfile.ZipFile(archive_path) as archive:
        # archive.open() is the outermost iterable of each generator
        # expression, so both members are opened right here, before the
        # ``with`` block exits; only the decoding of lines is lazy.
        return (
            csv.DictReader(
                (x.decode("utf-8", "ignore") for x in archive.open("BX-Book-Ratings.csv")),
                delimiter=";",
            ),
            csv.DictReader(
                (x.decode("utf-8", "ignore") for x in archive.open("BX-Books.csv")), delimiter=";"
            ),
        )


def get_ratings():
    """Return a fresh reader over the ratings CSV (first item of ``get_data()``)."""
    ratings_reader, _books_reader = get_data()
    return ratings_reader


def get_book_features():
    """Return a fresh reader over the books CSV (second item of ``get_data()``)."""
    _ratings_reader, books_reader = get_data()
    return books_reader

def get_ratings_labels():
    """Return the column names of the ratings CSV."""
    ratings_reader = get_data()[0]
    return ratings_reader.fieldnames

def get_books_labels():
    """Return the column names of the books CSV."""
    books_reader = get_data()[1]
    return books_reader.fieldnames

In [None]:
import json
from itertools import islice

# Open both CSV readers once. They are one-shot iterators, so every row pulled
# from them by a later cell is consumed for good.
ratings, book_features = get_data()

In [None]:
# Preview: pull the first two rating rows and pretty-print each as JSON.
for rating_row in islice(ratings, 2):
    pretty_row = json.dumps(rating_row, indent=4)
    print(pretty_row)

{
    "User-ID": "276725",
    "ISBN": "034545104X",
    "Book-Rating": "0"
}
{
    "User-ID": "276726",
    "ISBN": "0155061224",
    "Book-Rating": "5"
}


In [None]:
# Preview: pretty-print the first book row as JSON.
# (The previous version also stored that JSON string in ``item_labels``; the
# value was never read and the name is rebound to a list of titles later on.)
for book_row in islice(book_features, 1):
    print(json.dumps(book_row, indent=4))

{
    "ISBN": "0195153448",
    "Book-Title": "Classical Mythology",
    "Book-Author": "Mark P. O. Morford",
    "Year-Of-Publication": "2002",
    "Publisher": "Oxford University Press",
    "Image-URL-S": "http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg",
    "Image-URL-M": "http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg",
    "Image-URL-L": "http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg"
}


Obtain the labels for the vectors

In [None]:
# One title per row of the books CSV, in file order.
item_labels = [book['Book-Title'] for book in get_book_features()]
  


In [None]:
# Number of titles collected from the books CSV.
print(len(item_labels))

271379


In [None]:
!pip install LightFM
from lightfm.data import Dataset

# Register every user id and every ISBN that occurs in the ratings file.
# get_ratings() is called once per argument because each reader is one-shot.
dataset = Dataset()
dataset.fit(
    users=(rating['User-ID'] for rating in get_ratings()),
    items=(rating['ISBN'] for rating in get_ratings()),
)

Collecting LightFM
  Downloading lightfm-1.16.tar.gz (310 kB)
[?25l[K     |█                               | 10 kB 25.6 MB/s eta 0:00:01[K     |██▏                             | 20 kB 30.4 MB/s eta 0:00:01[K     |███▏                            | 30 kB 36.6 MB/s eta 0:00:01[K     |████▎                           | 40 kB 21.7 MB/s eta 0:00:01[K     |█████▎                          | 51 kB 17.1 MB/s eta 0:00:01[K     |██████▍                         | 61 kB 13.8 MB/s eta 0:00:01[K     |███████▍                        | 71 kB 13.7 MB/s eta 0:00:01[K     |████████▌                       | 81 kB 14.8 MB/s eta 0:00:01[K     |█████████▌                      | 92 kB 14.9 MB/s eta 0:00:01[K     |██████████▋                     | 102 kB 11.0 MB/s eta 0:00:01[K     |███████████▋                    | 112 kB 11.0 MB/s eta 0:00:01[K     |████████████▊                   | 122 kB 11.0 MB/s eta 0:00:01[K     |█████████████▊                  | 133 kB 11.0 MB/s eta 0:00:01[K 

In [None]:
# Report how many users and items the dataset currently knows about.
num_users, num_items = dataset.interactions_shape()
print(f'Num users: {num_users}, num_items {num_items}.')

Num users: 105283, num_items 340553.


In [None]:
# Extend the existing mappings with the ISBNs listed in the books CSV and
# register each book's author as an item feature.
dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()),
                    item_features=(x['Book-Author'] for x in get_book_features()))

In [None]:
# Build the user x item interaction matrix from (user id, ISBN) pairs.
# The Book-Rating column is not passed along here.
interactions, weights = dataset.build_interactions(
    (rating['User-ID'], rating['ISBN']) for rating in get_ratings()
)

print(repr(interactions))

<105283x341762 sparse matrix of type '<class 'numpy.int32'>'
	with 1149780 stored elements in COOrdinate format>


In [None]:
# Shape of the interaction matrix built above.
print(interactions.shape)

(105283, 341762)


Obtain item features

In [None]:
# Build the item-feature matrix: each ISBN is paired with a one-element
# feature list holding the book's author.
item_features = dataset.build_item_features(
    (book['ISBN'], [book['Book-Author']]) for book in get_book_features()
)
print(repr(item_features))

<341762x443805 sparse matrix of type '<class 'numpy.float32'>'
	with 613141 stored elements in Compressed Sparse Row format>


In [None]:
# Shape of the item-feature matrix built above.
print(item_features.shape)

(341762, 443805)


In [None]:
from lightfm import LightFM

# Train a WARP-loss LightFM model with 64 latent components.
# random_state pins the model's random number generator so that re-running the
# notebook does not silently produce different embeddings.
model = LightFM(learning_rate=0.05, loss='warp', no_components=64, item_alpha=0.001,
                random_state=42)
model.fit(interactions, item_features=item_features)
# One vector per item: the item's feature row multiplied (matrix product) with
# the learned per-feature embeddings. ``@`` makes the matrix product explicit.
item_vectors = item_features @ model.item_embeddings

In [None]:
# Shape of the per-item embedding matrix computed above.
print(item_vectors.shape)

(341762, 64)


In [None]:
import pickle 
!pip install faiss-cpu --no-cache
import faiss

Collecting faiss-cpu
  Downloading faiss_cpu-1.7.1.post2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.4 MB)
[K     |████████████████████████████████| 8.4 MB 8.8 MB/s 
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.1.post2


In [None]:
# Row i of item_vectors belongs to the item whose internal LightFM id is i, and
# that id space also covers ISBNs that only occur in the ratings file. A list of
# titles in books-CSV order is therefore shorter than, and ordered differently
# from, the vector matrix. Build the labels in item-id order instead.
_, _, item_id_map, _ = dataset.mapping()
title_by_isbn = {book['ISBN']: book['Book-Title'] for book in get_book_features()}
vector_labels = [None] * item_vectors.shape[0]
for isbn, row in item_id_map.items():
    # ISBNs without an entry in the books CSV fall back to the ISBN itself.
    vector_labels[row] = title_by_isbn.get(isbn, isbn)

with open('books.pickle', 'wb') as f:
    pickle.dump({"name": vector_labels, "vector": item_vectors}, f)

Describe the vector

In [None]:
def load_data():
    """Unpickle ``books.pickle`` from the working directory and return its content."""
    with open('books.pickle', 'rb') as pickle_file:
        return pickle.load(pickle_file)

data = load_data()
vectors = data["vector"]
names = data["name"]
# Show a compact summary instead of echoing the whole payload, which would
# print every label and the full vector matrix into the cell output.
{"num_names": len(names), "vector_shape": vectors.shape, "first_names": names[:5]}

{'name': ['Classical Mythology',
  'Clara Callan',
  'Decision in Normandy',
  'Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It',
  'The Mummies of Urumchi',
  "The Kitchen God's Wife",
  "What If?: The World's Foremost Military Historians Imagine What Might Have Been",
  'PLEADING GUILTY',
  'Under the Black Flag: The Romance and the Reality of Life Among the Pirates',
  "Where You'll Find Me: And Other Stories",
  'Nights Below Station Street',
  "Hitler's Secret Bankers: The Myth of Swiss Neutrality During the Holocaust",
  'The Middle Stories',
  'Jane Doe',
  "A Second Chicken Soup for the Woman's Soul (Chicken Soup for the Soul Series)",
  'The Witchfinder (Amos Walker Mystery Series)',
  'More Cunning Than Man: A Social History of Rats and Man',
  'Goodbye to the Buttermilk Sky',
  'The Testament',
  'Beloved (Plume Contemporary Fiction)',
  "Our Dumb Century: The Onion Presents 100 Years of Headlines from America's Finest News 

In [None]:
# Let faiss summarise the vector matrix and show its remarks one per list entry.
faiss.MatrixStats(vectors).comments.split("\n")

['analyzing 341762 vectors of size 64',
 'no NaN or Infs in data',
 'all vectors are distinct',
 'range of L2 norms=[3.77163e-08, 1.6998e+07] (0 null vectors)',
 'vectors have very large differences in norms, is this normal?',
 'matrix contains no 0s',
 'no constant dimensions',
 'no dimension has a too large mean',
 'stddevs per dimension are in [110.071 16076.6]',
 '']

Exhaustive Search Using Faiss

In [None]:
# Build a flat L2 index sized to the vector width and add every vector to it.
index = faiss.IndexFlatL2(vectors.shape[1])
index.add(vectors)

In [None]:
# Query with row 80 (the slice keeps it two-dimensional) and ask for 5 results.
search_vector = vectors[80:81]
distances, indices = index.search(search_vector, 5)

In [None]:
# The dataset holds books, so the message says "books" rather than "movies".
print(f"The most similar books to {names[80]} are:\n")
print([names[i] for i in indices[0]])

The most similar movies to Anil's Ghost are:



IndexError: ignored

LSH

In [None]:
# List the keys of the unpickled payload.
data.keys()

In [None]:
class LSHIndex():
    """Pairs a ``faiss.IndexLSH`` with a list of labels, one label per vector row."""

    def __init__(self, vectors, labels):
        """Store the vectors (converted to float32) and their labels.

        Args:
            vectors: 2-D array exposing ``shape`` and ``astype``; one row per item.
            labels: Sequence in which ``labels[i]`` names row ``i`` of ``vectors``.
        """
        self.dimension = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels

    def build(self, num_bits=8):
        """Create the LSH index with ``num_bits`` bits and add all stored vectors."""
        self.index = faiss.IndexLSH(self.dimension, num_bits)
        self.index.add(self.vectors)

    def query(self, vectors, k=10):
        """Return the labels of up to ``k`` neighbours of the first query row.

        Args:
            vectors: 2-D float32 array of queries; only row 0 is answered.
            k: Number of neighbours requested from the index.

        Returns:
            List of labels, possibly shorter than ``k``.
        """
        distances, indices = self.index.search(vectors, k)
        # Only a single query vector is expected, hence row 0.
        # faiss fills missing results with id -1; without the filter those
        # would be looked up as labels[-1] and return an unrelated label.
        return [self.labels[i] for i in indices[0] if i >= 0]

In [None]:
# Wrap the pickled vectors and names in the LSH helper (rebinding ``index``)
# and build it with the default number of bits.
index = LSHIndex(data["vector"], data["name"])
index.build()

In [None]:
# Look up the neighbours of row 80; the slice keeps the query two-dimensional.
index.query(data['vector'][80:81])

Product Quantization

In [None]:
class IVPQIndex():
    """Pairs a ``faiss.IndexIVFPQ`` with a list of labels, one label per vector row."""

    def __init__(self, vectors, labels):
        """Store the vectors (converted to float32) and their labels.

        Args:
            vectors: 2-D array exposing ``shape`` and ``astype``; one row per item.
            labels: Sequence in which ``labels[i]`` names row ``i`` of ``vectors``.
        """
        self.dimention = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels

    def build(self, number_of_partition=8, search_in_x_partitions=2, subvector_size=8):
        """Train the IVF-PQ index on the stored vectors and add them.

        ``faiss.IndexIVFPQ`` takes ``(quantizer, d, nlist, M, nbits)``. The
        number of partitions probed at query time is not a constructor
        argument; it is the ``nprobe`` attribute of the index.

        Args:
            number_of_partition: Number of inverted lists (``nlist``).
            search_in_x_partitions: Partitions visited per query (``nprobe``).
            subvector_size: Dimensions per sub-vector; the vector width must be
                a multiple of it. The number of sub-quantizers ``M`` is
                ``dimension // subvector_size``.

        Raises:
            ValueError: If ``subvector_size`` does not evenly divide the
                vector width.
        """
        if subvector_size <= 0 or self.dimention % subvector_size != 0:
            raise ValueError(
                f"subvector_size={subvector_size} must evenly divide the "
                f"vector dimension {self.dimention}"
            )
        number_of_subvectors = self.dimention // subvector_size
        bits_per_code = 8  # one byte per sub-vector code

        quantizer = faiss.IndexFlatL2(self.dimention)
        self.index = faiss.IndexIVFPQ(quantizer,
                                      self.dimention,
                                      number_of_partition,
                                      number_of_subvectors,
                                      bits_per_code)
        self.index.nprobe = search_in_x_partitions
        self.index.train(self.vectors)
        self.index.add(self.vectors)

    def query(self, vectors, k=10):
        """Return the labels of up to ``k`` neighbours of the first query row.

        Args:
            vectors: 2-D float32 array of queries; only row 0 is answered.
            k: Number of neighbours requested from the index.

        Returns:
            List of labels, possibly shorter than ``k``.
        """
        distances, indices = self.index.search(vectors, k)
        # The probed partitions can hold fewer than k vectors; faiss then pads
        # with id -1, which must not be looked up as labels[-1].
        return [self.labels[i] for i in indices[0] if i >= 0]

In [None]:
# Wrap the pickled vectors and names in the IVF-PQ helper (rebinding ``index``)
# and build it with the default settings.
index = IVPQIndex(data["vector"], data["name"])
index.build()

In [None]:
# Query the index with one book; the slice keeps the query two-dimensional.
book_index = 80
book_vector = data['vector'][book_index:book_index+1]
# Message corrected: the items are books, and "similar" was misspelled.
print(f"The most similar books to {data['name'][book_index]} are:")
index.query(book_vector)

HNSW

In [None]:
class NMSLIBIndex():
    """Pairs an nmslib HNSW index (cosine similarity space) with a list of labels."""

    def __init__(self, vectors, labels):
        """Keep the vector width, a float32 copy of the vectors, and the labels."""
        self.dimention = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels

    def build(self):
        """Initialise the HNSW index, load all stored vectors, and create it."""
        hnsw_index = nmslib.init(method='hnsw', space='cosinesimil')
        self.index = hnsw_index
        hnsw_index.addDataPointBatch(self.vectors)
        hnsw_index.createIndex({'post': 2})

    def query(self, vector, k=5):
        """Return the labels for the ``k`` nearest neighbours of ``vector``."""
        result = self.index.knnQuery(vector, k=k)
        # The first element of the result is iterated as the neighbour ids.
        neighbour_ids = result[0]
        return [self.labels[neighbour_id] for neighbour_id in neighbour_ids]

In [None]:
!pip install nmslib
import nmslib
# Wrap the pickled vectors and names in the nmslib helper (rebinding ``index``)
# and build it.
index = NMSLIBIndex(data["vector"], data["name"])
index.build()

In [None]:
# Query with the vector of row 90 and print the neighbours as a bullet list.
book_vector, book_name = data['vector'][90], data['name'][90]
similar_book_names = '\n* '.join(index.query(book_vector))
# Message corrected: the items are books, not movies.
print(f"The most similar books to {book_name} are:\n* {similar_book_names}")

Trees and Graphs

In [None]:
class AnnoyIndex():
    """Pairs an ``annoy.AnnoyIndex`` with a list of labels, one label per vector row."""

    def __init__(self, vectors, labels):
        """Store the vectors (converted to float32) and their labels.

        Args:
            vectors: 2-D array exposing ``shape`` and ``astype``; one row per item.
            labels: Sequence in which ``labels[i]`` names row ``i`` of ``vectors``.
        """
        self.dimention = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels

    def build(self, number_of_trees=5, metric='angular'):
        """Add every stored vector to a new Annoy index and build its trees.

        Args:
            number_of_trees: Number of trees passed to ``build``.
            metric: Distance metric name handed to ``annoy.AnnoyIndex``. It is
                passed explicitly because relying on Annoy's implicit default
                is deprecated; ``'angular'`` is that former default.
        """
        self.index = annoy.AnnoyIndex(self.dimention, metric)
        for i, vec in enumerate(self.vectors):
            self.index.add_item(i, vec.tolist())
        self.index.build(number_of_trees)

    def query(self, vector, k=5):
        """Return the labels of the ``k`` nearest neighbours of ``vector``.

        Args:
            vector: 1-D array exposing ``tolist``.
            k: Number of neighbours to return.
        """
        indices = self.index.get_nns_by_vector(vector.tolist(), k)
        return [self.labels[i] for i in indices]

In [None]:
!pip install annoy
import annoy
# Wrap the pickled vectors and names in the Annoy helper (rebinding ``index``)
# and build it with the default number of trees.
index = AnnoyIndex(data["vector"], data["name"])
index.build()

In [None]:
# Query with the vector of row 90 and print the neighbours as a bullet list.
book_vector, book_name = data['vector'][90], data['name'][90]
similar_book_names = '\n* '.join(index.query(book_vector))
# Message corrected: the items are books, not movies.
print(f"The most similar books to {book_name} are:\n* {similar_book_names}")