# Lesson 3 - Recommender Systems

In [1]:
# Mount Google Drive at /content/drive so the lesson files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Show where the kernel starts, switch the working directory to the lesson folder
# on Drive (later cells import DLAIUtils and use paths relative to it), then list it.
!pwd
%cd /content/drive/MyDrive/Coursera_new/Building-Applications-with-Vector-Databases/03_Recommender-Systems/
!ls -al

/content
/content/drive/MyDrive/Coursera_new/Building-Applications-with-Vector-Databases/03_Recommender-Systems
total 61
-rw------- 1 root root   748 Mar  4 08:43 DLAIUtils.py
-rw------- 1 root root  6148 Mar  4 08:32 .DS_Store
drwx------ 2 root root  4096 Mar  4 08:33 images
-rw------- 1 root root 50067 Mar  4 09:02 Lesson_3_Recommender_Systems.ipynb


![Project Description](./images/3_recommender_systems.png)

In [3]:
# !pip install -r /content/drive/MyDrive/Coursera_new/Building-Applications-with-Vector-Databases/requirements.txt

In [4]:
!pip install python-dotenv
!pip install numpy
!pip install pandas
!pip install scikit-learn
!pip install sentence-transformers
!pip install matplotlib
!pip install torch
!pip install langchain
!pip install openai
!pip install pinecone-client
!pip install pinecone-datasets
!pip install pinecone-text
!pip install tiktoken
!pip install tqdm
!pip install datasets
!pip install deepface

Collecting urllib3<3,>=1.21.1 (from requests<3.0.0,>=2.25.0->pinecone-text)
  Using cached urllib3-2.2.1-py3-none-any.whl (121 kB)
Installing collected packages: urllib3
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.26.18
    Uninstalling urllib3-1.26.18:
      Successfully uninstalled urllib3-1.26.18
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
botocore 1.31.17 requires urllib3<1.27,>=1.25.4, but you have urllib3 2.2.1 which is incompatible.[0m[31m
[0mSuccessfully installed urllib3-2.2.1
Collecting pyarrow>=12.0.0 (from datasets)
  Using cached pyarrow-15.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (38.3 MB)
Installing collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 11.0.0
    Uninstalling pyarrow-11.0.0:
      Successfully uninstalled pyarrow-11.0.0
[31mERROR: pip's de

### Import the Needed Packages

In [5]:
import warnings
# Suppress every warning for the rest of the session to keep cell output readable.
# Note that this also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm, trange
from DLAIUtils import Utils

import pandas as pd
import time
import os

### Load the Dataset

**Note:** To access the dataset outside of this course, copy the following two commands into a code cell (uncommenting them if they are commented out) and run them:

!wget -q --show-progress -O all-the-news-3.zip "https://www.dropbox.com/scl/fi/wruzj2bwyg743d0jzd7ku/all-the-news-3.zip?rlkey=rgwtwpeznbdadpv3f01sznwxa&dl=1"

!unzip all-the-news-3.zip

In [7]:
# Peek at the first line of the CSV to see which columns the dataset provides.
with open('../data/all-the-news-3.csv', 'r') as csv_file:
    header = csv_file.readline()
print(header)

date,year,month,day,author,title,article,url,section,publication



In [8]:
# Load just the first 99 rows for a quick look at the data; the full embedding
# passes further down re-read the file themselves.
df = pd.read_csv('../data/all-the-news-3.csv', nrows=99)
df.head()

Unnamed: 0,date,year,month,day,author,title,article,url,section,publication
0,2016-12-09 18:31:00,2016,12.0,9,Lee Drutman,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",https://www.vox.com/polyarchy/2016/12/9/138983...,,Vox
1,2016-10-07 21:26:46,2016,10.0,7,Scott Davis,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,https://www.businessinsider.com/colts-gm-ryan-...,,Business Insider
2,2018-01-26 00:00:00,2018,1.0,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ...",https://www.reuters.com/article/us-davos-meeti...,Davos,Reuters
3,2019-06-27 00:00:00,2019,6.0,27,,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...,https://www.reuters.com/article/france-politic...,World News,Reuters
4,2016-01-27 00:00:00,2016,1.0,27,,Paris Hilton: Woman In Black For Uncle Monty's...,Paris Hilton arrived at LAX Wednesday dressed ...,https://www.tmz.com/2016/01/27/paris-hilton-mo...,,TMZ


### Setup Pinecone

In [9]:
# Helper object from the local DLAIUtils module; used below to build the index name.
utils = Utils()
# Original course approach, kept for reference (disabled):
# PINECONE_API_KEY = utils.get_pinecone_api_key()

# Read the Pinecone API key from Colab's user secrets instead, so it is never hardcoded here.
from google.colab import userdata
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')

# Debug aid (disabled): printing the key would leak it into the saved notebook output.
# print(f"PINECONE_API_KEY: {PINECONE_API_KEY}")

In [None]:
# Pinecone client authenticated with the key loaded in the previous cell.
pinecone = Pinecone(api_key=PINECONE_API_KEY)

# Index name produced by the course helper from the 'dl-ai' prefix.
INDEX_NAME = utils.create_dlai_index_name('dl-ai')

# Start from a clean slate: drop any index that already exists under this name.
if any(existing.name == INDEX_NAME for existing in pinecone.list_indexes()):
    pinecone.delete_index(INDEX_NAME)

# dimension=1536 has to match the length of the vectors upserted below
# (embeddings from the model configured in get_embeddings).
pinecone.create_index(
    name=INDEX_NAME,
    dimension=1536,
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-west-2'),
)

# Handle used by the title-embedding cells.
index = pinecone.Index(INDEX_NAME)

### Connect to OpenAI

In [10]:
# Read the OpenAI API key from Colab's user secrets (`userdata` is imported in the
# Pinecone setup cell) rather than hardcoding it in the notebook.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

# Debug aid (disabled): printing the key would leak it into the saved notebook output.
# print(f"OPENAI_API_KEY: {OPENAI_API_KEY}")

# Client used by get_embeddings for every embedding request.
openai_client = OpenAI(api_key=OPENAI_API_KEY)

### 1.  Create Embeddings of the News Titles

In [11]:
def get_embeddings(articles, model="text-embedding-ada-002"):
    """Request embeddings for ``articles`` through the module-level OpenAI client.

    Args:
        articles: Text input(s), forwarded unchanged as the ``input`` argument
            of ``openai_client.embeddings.create``.
        model: Name of the embedding model to use.

    Returns:
        The response object from ``openai_client.embeddings.create``. Callers in
        this notebook read the vectors from ``response.data[i].embedding``.
    """
    response = openai_client.embeddings.create(model=model, input=articles)
    return response

In [None]:
CHUNK_SIZE = 400      # rows read from the CSV and embedded per request
TOTAL_ROWS = 10000    # only the first TOTAL_ROWS rows of the dataset are indexed

# Stream the CSV in CHUNK_SIZE-row pieces so the whole file is never held in memory.
chunks = pd.read_csv('../data/all-the-news-3.csv',
                     chunksize=CHUNK_SIZE,
                     nrows=TOTAL_ROWS)

# The context manager closes the progress bar even if a request fails mid-way.
with tqdm(total=TOTAL_ROWS) as progress_bar:
    for chunk_num, chunk in enumerate(chunks):
        titles = chunk['title'].tolist()

        # Pair every usable title with its global row number (used as the vector id).
        # Missing titles are read by pandas as float NaN and cannot be embedded, so
        # they are skipped; the ids of the remaining rows are unaffected.
        rows = [(chunk_num * CHUNK_SIZE + i, title)
                for i, title in enumerate(titles)
                if isinstance(title, str) and title.strip()]

        if rows:
            embeddings = get_embeddings([title for _, title in rows])
            # embeddings.data is consumed in the same order as the titles that were sent.
            prepped = [{'id': str(row_id),
                        'values': item.embedding,
                        'metadata': {'title': title}}
                       for (row_id, title), item in zip(rows, embeddings.data)]
            # Upsert every non-empty chunk. The previous `len(prepped) >= 200` guard
            # silently discarded any chunk shorter than 200 rows (e.g. a final
            # partial chunk).
            index.upsert(prepped)

        progress_bar.update(len(chunk))

In [None]:
# Sanity check: display the index statistics after the title upserts.
index.describe_index_stats()

### Build the Recommender System

In [None]:
def get_recommendations(pinecone_index, search_term, top_k=10):
    """Query ``pinecone_index`` for the entries closest to ``search_term``.

    Args:
        pinecone_index: Index handle exposing ``query``.
        search_term: Text that is embedded with ``get_embeddings`` and used as
            the query vector.
        top_k: Number of matches to request from the index.

    Returns:
        The query response from ``pinecone_index.query``, requested with
        metadata included.
    """
    query_vector = get_embeddings([search_term]).data[0].embedding
    return pinecone_index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
    )

In [None]:
# Ask the title index for articles related to 'obama' and list score and title.
reco = get_recommendations(index, 'obama')
for match in reco.matches:
    print(f'{match.score} : {match.metadata["title"]}')

### 2.  Create Embeddings of All News Content

In [None]:
# Rebuild the index from scratch: the title vectors stored under INDEX_NAME are
# dropped so the same name can be reused for the article-content vectors.
if any(existing.name == INDEX_NAME for existing in pinecone.list_indexes()):
    pinecone.delete_index(name=INDEX_NAME)

# Same settings as the title index: 1536-dimensional vectors, cosine metric.
pinecone.create_index(
    name=INDEX_NAME,
    dimension=1536,
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-west-2'),
)

# Handle used by the article-embedding cells.
articles_index = pinecone.Index(INDEX_NAME)

In [None]:
def embed(embeddings, title, prepped, embed_num, batch_size=100, target_index=None):
    """Queue one article's chunk embeddings for upsert, flushing full batches.

    Each item of ``embeddings.data`` becomes a record
    ``{'id': str(n), 'values': item.embedding, 'metadata': {'title': title}}``
    appended to ``prepped``. Whenever ``prepped`` reaches ``batch_size`` records
    it is upserted and emptied in place.

    Records left in ``prepped`` when this function returns have NOT been
    upserted yet; the caller must flush them after its last call.

    Args:
        embeddings: Response object whose ``data`` items expose ``.embedding``.
        title: Article title stored as metadata on every record.
        prepped: Caller-owned list used as the pending-upsert buffer (mutated).
        embed_num: Id to assign to the first record of this call.
        batch_size: Number of buffered records that triggers an upsert.
        target_index: Object exposing ``upsert``; defaults to the module-level
            ``articles_index`` when omitted.

    Returns:
        The next unused id, i.e. ``embed_num`` plus the number of records added.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    for item in embeddings.data:
        prepped.append({'id': str(embed_num),
                        'values': item.embedding,
                        'metadata': {'title': title}})
        embed_num += 1

        if len(prepped) >= batch_size:
            # Resolve the global lazily so it is only required when a flush happens.
            sink = articles_index if target_index is None else target_index
            sink.upsert(prepped)
            prepped.clear()

    return embed_num

<p style="background-color:#fff1d7; padding:15px; "> <b>(Note: <code>news_data_rows_num = 100</code>):</b> In this lab, we've initially set <code>news_data_rows_num</code> to 100 for speedier results, allowing you to observe the outcomes faster. Once you've done an initial run, consider increasing this value to 200, 400, 700, and 1000. You'll likely notice better and more relevant results.</p>

In [None]:
news_data_rows_num = 100   # number of articles to index; raise for better results

embed_num = 0       # running id assigned to each embedded text chunk
# Split every article into ~400-character pieces with a 20-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400,
                                               chunk_overlap=20)
prepped = []        # buffer of records waiting to be upserted (filled by embed)
df = pd.read_csv('../data/all-the-news-3.csv', nrows=news_data_rows_num)
articles_list = df['article'].tolist()
titles_list = df['title'].tolist()

for art, title in zip(articles_list, titles_list):
    print(".", end="")
    # Missing articles are read by pandas as float NaN; only real text is embedded.
    if not isinstance(art, str):
        continue
    texts = text_splitter.split_text(art)
    # Nothing to embed for this article; skip the embeddings request.
    if not texts:
        continue
    embeddings = get_embeddings(texts)
    embed_num = embed(embeddings, title, prepped, embed_num)

# embed() only upserts full batches, so whatever is still buffered after the last
# article must be flushed here; otherwise the tail never reaches the index.
if prepped:
    articles_index.upsert(prepped)
    prepped.clear()

In [None]:
# Sanity check: display the index statistics after the article-chunk upserts.
articles_index.describe_index_stats()

### Build the Recommender System

In [None]:
# Several chunks of one article can match the query, so fetch a larger result set
# and print each title only the first time it appears.
reco = get_recommendations(articles_index, 'obama', top_k=100)
seen = {}
for match in reco.matches:
    title = match.metadata['title']
    if title in seen:
        continue
    seen[title] = '.'
    print(f'{match.score} : {title}')