# Imports

In [1]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import DirectoryLoader
from langchain.indexes import SQLRecordManager, index
from pinecone import Pinecone, PodSpec
import os

# Load variables from a .env file into the process environment; the Pinecone
# API key and index name are read back later in this notebook via os.getenv.
from dotenv import load_dotenv
load_dotenv()

True

# Loading Data

In [2]:
# Collect every file matching *.csv under ./data and parse each one with
# CSVLoader, showing a progress bar while the files are read.
loader = DirectoryLoader(
    "./data",
    glob="*.csv",
    loader_cls=CSVLoader,
    show_progress=True,
)
docs = loader.load()

# Sanity check: show the text of one loaded document.
print(docs[20].page_content)

100%|██████████| 104/104 [00:04<00:00, 22.54it/s]

Title: The Saphead
Runtime (minutes): 77
Language: English
Overview: Nick Van Alstyne owns the Henrietta silver mine and is very rich. His son Bertie is naive and spoiled. His daughter Rose is married to shady investor Mark. Mark wrecks Bertie's wedding plans by making him take the blame for Mark's illegitimate daughter. Mark also nearly ruins the family business by selling off Henrietta stock at too low a price. Bertie, of all people, must come to the rescue on the trading floor.
Release Year: 1920
Genre: Drama, Comedy
Keywords: silent film
Actors: Beulah Booker, Buster Keaton, William H. Crane, Edward Jobson, Irving Cummings
Directors: Winchell Smith, Herbert Blaché
Stream: 
Buy: 
Rent: 
Production Companies: Metro Pictures Corporation
Website: Unknown





# Splitting Data

In [3]:
# Chunk the loaded documents: at most 1000 characters per chunk, with
# 200 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

In [4]:
# Spot-check the text of one chunk produced by the splitter.
print(splits[9].page_content)

Title: The Parson's Widow
Runtime (minutes): 94
Language: Swedish
Overview: A young man is elected by a small village to be its parson. As part of his duties, he is required to marry the widow of the parson before him. This poses two problems--first, the widow is old enough to be his grandmother, and second, he is already engaged to another woman.
Release Year: 1920
Genre: Comedy, Drama
Keywords: jealousy, cemetery, wedding, love potion, drunkenness, parson, magic spell
Actors: Einar Röd, Olav Aukrust, Greta Almroth, Hildur Carlberg, Emil Helsengreen
Directors: Carl Theodor Dreyer
Stream: 
Buy: 
Rent: 
Production Companies: SF Studios
Website: Unknown


# Creating Embeddings and Uploading to Pinecone

In [7]:
# Connection settings come from the environment (populated by load_dotenv()).
PINECONE_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_INDEX_NAME = os.getenv('PINECONE_INDEX_NAME')

# Fail early with a readable message instead of an opaque client error.
if not PINECONE_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to the environment or .env file.")

# Single source of truth for the index name: honour PINECONE_INDEX_NAME when
# it is set, otherwise fall back to the previously hard-coded "filmbot".
# The same name is used for index creation, lookup and the record-manager
# namespace so the three can never drift apart.
index_name = PINECONE_INDEX_NAME or "filmbot"

pc = Pinecone(api_key=PINECONE_KEY)

# Create empty index -- uncomment if the index does not exist yet.
# dimension=1536 matches the 'dimension' reported by describe_index_stats()
# for the index used with the text-embedding-ada-002 embeddings below.
# pc.create_index(
#     name=index_name,
#     dimension=1536,
#     metric="cosine",
#     spec=PodSpec(
#         environment="gcp-starter"
#     )
# )

# Target index and check status
pc_index = pc.Index(index_name)
print(pc_index.describe_index_stats())

embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

vectorstore = PineconeVectorStore(
    pc_index, embeddings
)

# Create record manager: a local SQLite file that tracks what has already
# been written to the vector store, keyed by a namespace derived from the
# index name.
namespace = f"pinecone/{index_name}"
record_manager = SQLRecordManager(
    namespace, db_url="sqlite:///record_manager_cache.sql"
)

record_manager.create_schema()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [9]:
def _clear():
    """
    Wipe previously indexed content (hacky helper).

    Runs the indexing routine on an empty document list with
    ``cleanup="full"`` against the notebook-level ``record_manager`` and
    ``vectorstore``.
    """
    index(
        [],
        record_manager,
        vectorstore,
        cleanup="full",
        source_id_key="source",
    )

# Uncomment this line if you want to clear the Pinecone vectorstore first.
# Left disabled by default: wiping the store forces every chunk to be
# re-embedded and re-uploaded on each run, and leaves the store empty if the
# upload below fails part-way (e.g. on a transient 502 from the service).
# _clear()

# Sync the chunks into the vector store through the record manager. With
# cleanup="full" the indexing API is documented to also remove previously
# indexed entries that are absent from `splits`, so a prior wipe is not needed.
index(splits, record_manager, vectorstore,
      cleanup="full", source_id_key="source")

InternalServerError: <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>cloudflare</center>
</body>
</html>

In [10]:
query = "Movies based on novels or books."
# Keep the hits under their own name. Rebinding `docs` here would overwrite
# the documents loaded from ./data, so re-running the splitting/indexing cells
# afterwards would index the search results (and, with cleanup="full", drop
# the rest of the corpus) instead of the original data.
similar_docs = vectorstore.similarity_search(query)
print(similar_docs[0].page_content)

Release Year: 1926
Genre: Drama
Keywords: based on novel or book
Actors: Max Gülstorff, Walter Rilla, Paul Biensfeldt, Lili Damita, Jack Trevor
Directors: Michael Curtiz
Stream: 
Buy: 
Rent: 
Production Companies: Sascha-Film, Phoebus-Film
Website: Unknown


# Creating a Retriever

In [11]:
# Wrap the vector store as a retriever: plain similarity search, six hits per query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6},
)

In [12]:
# Try the retriever on a natural-language question.
retrieved_docs = retriever.invoke(
    "What are some films with about a person who becomes a tyrannical president of Panem?"
)

In [13]:
# Number of documents returned; the retriever was configured with k=6.
len(retrieved_docs)

6

In [14]:
# Show the text of the first retrieved document.
print(retrieved_docs[0].page_content)

Title: Nineteen Eighty-Four
Runtime (minutes): 113
Language: English
Overview: George Orwell's novel of a totalitarian future society in which a man whose daily work is rewriting history tries to rebel by falling in love.
Release Year: 1984
Genre: Drama, Science Fiction, Romance
Keywords: based on novel or book, dystopia, fascism, totalitarian regime, anarchist, brainwashing, orwellian
Actors: John Hurt, Richard Burton, Suzanna Hamilton, Cyril Cusack, Gregor Fisher
Directors: Michael Radford
Stream: Amazon Prime Video, Hoopla
Buy: Apple TV, Amazon Video, Google Play Movies, YouTube, Vudu, Microsoft Store
Rent: Apple TV, Amazon Video, Google Play Movies, YouTube, Vudu, Microsoft Store
Production Companies: Atlantic Releasing Corporation, Umbrella-Rosenblum Film Production, Virgin Benelux, Virgin Schallplatten, Virgin Films, 20th Century Fox
Website: Unknown
