In [3]:
import pandas as pd
import numpy as np
import os
import glob

In [12]:
# Get all csvs from /movie_plots directory.
# glob returns paths in arbitrary (filesystem-dependent) order, so sort them to
# make the resulting document order reproducible across runs and machines.
csvs = sorted(glob.glob('movie_plots/*.csv'))
if not csvs:
    # Fail early with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No CSV files found matching 'movie_plots/*.csv'")

# Open all these csv into a single dataframe.
# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating every file's own row labels.
df = pd.concat([pd.read_csv(f) for f in csvs], ignore_index=True)

# Create a list from the dataframe joining column `title` and `plot`.
# Newlines inside a plot are flattened to spaces so that the only newline in a
# document is the one separating the title from the plot. regex=False keeps this
# a literal replacement regardless of the pandas version's default.
df['plot'] = df['plot'].str.replace('\n', ' ', regex=False)
df['doc'] = df['title'] + '\n' + df['plot']
# A missing title or plot makes the concatenation NaN; such a value would be
# serialised by json.dumps as the invalid JSON token NaN, so leave those rows out.
docs_list = df["doc"].dropna().tolist()

In [13]:
# Preview the first document to check the "<title>\n<plot>" layout.
docs_list[0]

"'68 (film)\nThe father escaped the Soviet invasion of Budapest and now runs a Hungarian restaurant that is not doing well financially. The younger of his two sons is gay and struggling with coming out. His dad disowns him when he finally does. The older son is involved in the counterculture, gets kicked out of college, buys a motorcycle, starts dating a Maoist, and is also disowned by his father. The older of the sons runs afoul of an outlaw motorcycle club; the younger of the two sons gets drafted but is rejected because of his homosexuality. The older one joins his younger brother in a gay rights protest.  Major events of the year such as the assassination of Martin Luther King and the assassination of Robert F. Kennedy are interspersed throughout the plot and depicted in the film using stock footage."

In [19]:
# Add to qdrant database via endpoint: localhost:6000/add_documents
# request body: {"documents": [], "index": "movie_plots"}
# documentModel: document: string, id: optional[str(uuid)], metadata: optional[dict], score: optional[float]
import requests
import json
from typing import List

def load_documents(documents:List[str], index:str="movie_plots", batch_size:int=0):
    """Loads documents into the qdrant database via the add_documents endpoint.

    Each document is wrapped as ``{"document": <text>}`` and POSTed as JSON to
    ``http://localhost:6000/add_documents``. The response body of every request
    is printed.

    Args:
        documents (List[str]): List of documents to be loaded into the database
        index (str): Name of the index the documents are added to.
        batch_size (int): Maximum number of documents per request. ``0`` (the
            default) or a negative value sends all documents in one request.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status
            (raised by ``response.raise_for_status()``); remaining batches are
            not sent.
    """
    url = "http://localhost:6000/add_documents"
    headers = {
        'Content-Type': 'application/json'
    }
    # Materialise once so the input can be sliced into batches.
    documents = list(documents)
    total = len(documents)
    # max(..., 1) keeps the step valid for range() and still issues a single
    # request for an empty list.
    step = batch_size if batch_size > 0 else max(total, 1)
    for start in range(0, max(total, 1), step):
        data = {
            "documents": [{"document": doc} for doc in documents[start:start + step]],
            "index": index
        }
        response = requests.post(url, headers=headers, data=json.dumps(data))
        print(response.text)
        # Surface 4xx/5xx answers instead of letting a failed upload look like success.
        response.raise_for_status()

In [20]:
# Upload the current docs_list to the "movie_plots" index.
# The trailing note presumably records the observed run time (~66 minutes) — confirm.
load_documents(docs_list, index="movie_plots") # 66min

["3773d66c5b6d4c4da75b69dbdbcd95c6","5e1b3f34bcf54df190d0cb7063f17f79","3ec9ff9e54dc4e72a8b88d78b6a57a1b","af80590b7c9b4e5ab23fb8200faec051","980a9e12287540378c5bd4d84a29416f","7072eeb7f37946b18098ff2d6d2272f3","79b2dea00a6d4fcf98a3ff148c70a955","6ce29c5c7ccd4a9aa1b12417a2287101","e8d1e3240b45427eaaa074fd443022f9","359080eb529d4dbda959a4a56c4cb527","6b6de7db2b1043ffad83ede1ebc625ea","0c1a217f1ba342af9d251e4135c79957","0d6171161b144ddf8ace446eab32c264","bda26697465a4935838711903ed83a98","1beeddedd1ca419f86249337a4e409fe","76a3d3469d5a4245ab49f092e4c9c19a","ee68314f64d84a42b989da200581eca8","d19841d04ed446f582dd678fbf241a56","50a75adefba1467fbcf15752540d355e","86f2a94b067a4aaf88a011cb37506845","e09e332fd87941ff964ef4aab0b86c1a","a66c2d0aedc245e0a2f5cfac6ff35187","52ab6e59470142e18c3e36d9a1bc26d6","057c0c2b53a445119dfb4ed7b43f4445","324eaba57f50498eb0fb832b5fd4ce43","4bc170a1b12a4fd09a09596a5a94709e","5a367a51ca744773b3a2029a8f4c28b8","3e4b32929c654b1ab8ebf8eac3986762","fe80518ecc4c4859b2

In [21]:
# Load imdb_movies.csv into a pandas DataFrame.
# NOTE: this rebinds `df`, replacing whatever frame the name held before this cell ran.
import pandas as pd
df = pd.read_csv("imdb_movies.csv")


In [25]:
# Save the dataframe as a list of documents: one text document per row, with
# every column rendered as a "<Column Name>: <value>" line.
df = df.rename(
    columns={
        "names": "title",
        "date_x": "date released",
        "orig_title": "original title",
        "orig_lang": "original language",
        "budget_x":"budget",
        "score": "IMDB score"
    }
) # Rename certain columns
columns = df.columns
# Title-case the labels once instead of once per row.
labels = [col.title() for col in columns]
# Iterate rows positionally: the previous df[col][i] lookup was label-based
# chained indexing, which only works with a default 0..n-1 index and performs a
# slow scalar lookup for every single cell.
docs_list = [
    "".join(f"{label}: {value}\n" for label, value in zip(labels, row))
    for row in df.itertuples(index=False, name=None)
] # Create a list of documents from the dataframe


In [27]:
# Upload the current docs_list to the "imbd_movies" index.
# NOTE(review): "imbd_movies" looks like a typo of "imdb_movies" (cf. imdb_movies.csv);
# confirm which index name consumers expect before changing it.
load_documents(docs_list, "imbd_movies")

["74117064021e4e60860638623f35be82","7f5cb92ae75f478ca2da8ce1e4eee715","32ba567608bf4dc0ad1fd3d7d3fb5fc0","1b5da33930a2457e8ef09c3b287d2e31","f68cedc5fba94ab4a9ab50fa055899ee","893bdf665b8146c49aac904d4278a172","96071d2358994880a926f359d91b8e8f","1e8022c6f52040de81e67ba17302a064","06b1690db85145369f025b99141f9087","6b35f5c9ef94477bbccf1968d9f74821","9039acdc5a144b588ac6ae0c6258b809","958734cf2622401384a9e459f696069f","e9078c49b6ed487582ddf713cdad2952","72411dc144284d1285b606e59bbab087","26d4e73031dd4470adbada5ed45c1f3f","ef33683d3b6e4e1586cda2e06fd7bebb","c831c42745b64175908ba1e4a0209607","fb4b66b3442949149ebf6462886b070f","7065a5f719dc40f7b1e1c4a05cd357af","ce920c48ddd54568a594228b7002a1d5","8757b04a2a3f4b7d9f0cbe56d4c017d8","99c3cf45b64145da9f5e48542e16cfe7","466f9b3de4f14ea6a893baeade2b8d76","a75f2d25b3414e28a25f28bd4b29904d","65e6b8c9e7b54c56a70cdaf8af23b5b4","a052657e64534049bb9c6bd6b1cbbfa8","424cd35641784f0fb02d93bc9e620aa0","5c81c925194b43c4b240692b50943516","b677c04fb38b420ea6