# Load Data into Solr

Before doing this, we need to have Solr running. We downloaded a fresh solr-7.4.0 distribution for this, to prevent any bad interaction with existing indexes.

    cd <solr_home>
    bin/solr start
    
We then create a new core to hold our index.

    bin/solr create -c tmdbindex 
    
Since Solr 6.x, the default similarity is BM25; the old TF-IDF based similarity is still available as `ClassicSimilarity`. So we declare a new field type `text_tfidf` that uses the TF-IDF based similarity. Any field declared (via the dynamic field naming convention) with the suffix `_t` is automatically of type `text_general`, which uses BM25 similarity. In addition, we declare copy-fields that automatically copy the title field `title_t` and the body field `description_t` (both BM25 similarity) to their TF-IDF counterparts `title_tfidf` and `description_tfidf`.

    cd ../scripts
    sh ./solr-schema.sh
    
In case you need to start over, the command to drop the core (run from `<solr_home>`) is as follows.

    bin/solr delete -c tmdbindex
    

In [1]:
import ast
import csv
import json
import os
import sqlite3
import sys

import requests

In [2]:
# Relative path of the directory that contains the input data files.
DATA_DIR = "../../data"

# SQLite lookup database and the movie metadata CSV, both located in DATA_DIR.
LOOKUPS_DB = os.path.join(DATA_DIR, "lookups.db")
MOVIES_DATA = os.path.join(DATA_DIR, "movies_metadata.csv")

# Base URL of the Solr core; endpoint names such as "update" are appended to it.
SOLR_URL = "http://localhost:8983/solr/tmdbindex/"

In [3]:
def get_keywords(conn, movie_id):
    """Return the keywords stored for a movie in the lookup database.

    Reads the ``keywords`` column of the first row of the ``keywords`` table
    whose ``mid`` equals ``movie_id`` and splits it on ``|``.

    Args:
        conn: open sqlite3 connection that exposes the ``keywords`` table.
        movie_id: value matched against the ``mid`` column.

    Returns:
        List of non-empty keyword strings. The list is empty when no row
        matches or when the stored value is NULL or blank.
    """
    cur = conn.cursor()
    try:
        cur.execute("select keywords from keywords where mid = ?", [movie_id])
        # Only the first matching row is used, so do not fetch the rest.
        row = cur.fetchone()
    finally:
        # Release the cursor even when the query raises.
        cur.close()
    if row is None or row[0] is None:
        return []
    # Drop empty tokens so that "" or "a||b" never produces an empty keyword.
    return [keyword for keyword in row[0].split("|") if keyword]


def filter_genres(conn, genres):
    """Keep only the genres that have a matching row in the lookup database.

    Each candidate is looked up by exact ``gname`` match in the ``genres``
    table. Input order and duplicates are preserved.

    Args:
        conn: open sqlite3 connection that exposes the ``genres`` table.
        genres: iterable of candidate genre names.

    Returns:
        List of the candidates for which the lookup found at least one row.
    """
    lookup_sql = "select gname from genres where gname = ?"
    known = []
    cursor = conn.cursor()
    for name in genres:
        cursor.execute(lookup_sql, [name])
        # One matching row is enough to accept the candidate.
        if cursor.fetchone() is not None:
            known.append(name)
    cursor.close()
    return known


# Shared connection to the lookup database; the loading loop further down reuses it.
conn = sqlite3.connect(LOOKUPS_DB)

# Smoke test: show the keywords stored for one movie id.
sample_keywords = get_keywords(conn, 460870)
print(sample_keywords)

['electricity', 'scientific experiment', 'nikola tesla']


In [4]:
def get_float(orig_value, default_value):
    """Convert a raw field value to ``float``, falling back to a default.

    Args:
        orig_value: raw value, normally a string read from the CSV; may be
            ``None`` or already numeric.
        default_value: returned unchanged when ``orig_value`` is ``None``,
            blank, or cannot be parsed as a number.

    Returns:
        ``float(orig_value)`` when the value parses, otherwise
        ``default_value``.
    """
    if orig_value is None:
        return default_value
    if isinstance(orig_value, str) and not orig_value.strip():
        return default_value
    try:
        return float(orig_value)
    except (TypeError, ValueError):
        # A malformed cell (for example text in a numeric column) should not
        # abort the whole load; treat it like a missing value.
        return default_value
    
def parse_genres(genre_json):
    """Extract the genre names from a serialized list of id/name records.

    The value is parsed as JSON first. If that fails it is parsed as a Python
    literal (single-quoted strings), so names that contain an apostrophe or a
    double quote are read correctly instead of being corrupted by a blanket
    quote replacement.

    Args:
        genre_json: string holding a list of mappings that each have a
            ``"name"`` key, or ``None`` / a blank string.

    Returns:
        List of the ``"name"`` values in input order; empty for ``None`` or
        blank input.

    Raises:
        ValueError: if the value is neither valid JSON nor a valid Python
            literal.
    """
    if genre_json is None or not genre_json.strip():
        return []
    try:
        idname_pairs = json.loads(genre_json)
    except ValueError:
        try:
            # literal_eval only accepts literals, so it is safe on file data.
            idname_pairs = ast.literal_eval(genre_json)
        except (ValueError, SyntaxError) as err:
            raise ValueError(
                "unparseable genres value: {!r}".format(genre_json)
            ) from err
    return [idname_pair["name"] for idname_pair in idname_pairs]


def add_record_to_solr(solr_url, doc_id, title, description, popularity,
                       release_date, revenue, runtime, rating, keywords, genres,
                       should_commit=False):
    """Send one movie document, or a bare commit, to the Solr update endpoint.

    When ``doc_id`` is ``None`` no document is sent; the request only asks
    Solr to commit. Otherwise the arguments are posted as a single ``add``
    command to ``solr_url + "update"``.

    Args:
        solr_url: base URL of the Solr core, ending in ``/``.
        doc_id: document id, or ``None`` for a commit-only request.
        title: stored in ``title_t``.
        description: stored in ``description_t``.
        popularity: stored in ``popularity_f``.
        release_date: stored in ``released_dt``.
        revenue: stored in ``revenue_f``.
        runtime: stored in ``runtime_f``.
        rating: stored in ``rating_f``.
        keywords: stored in ``keywords_ss``.
        genres: stored in ``genres_ss``.
        should_commit: whether the add request also asks Solr to commit.
            Ignored for commit-only requests, which always commit.

    Returns:
        ``True`` if Solr answered with a non-error HTTP status, else ``False``.
        Failures are reported on stderr rather than raised, so a single
        rejected document does not abort a bulk load.
    """
    headers = {
        "content-type": "application/json",
        "accept": "application/json"
    }
    update_url = solr_url + "update"
    if doc_id is None:
        # only do a commit
        resp = requests.post(update_url, params={"commit": "true"}, headers=headers)
    else:
        req_body = json.dumps({
            "add": {
                "doc": {
                    "id": doc_id,
                    "title_t": title,
                    "description_t": description,
                    "popularity_f": popularity,
                    "released_dt": release_date,
                    "revenue_f": revenue,
                    "runtime_f": runtime,
                    "rating_f": rating,
                    "keywords_ss": keywords,
                    "genres_ss": genres
                }
            }
        })
        params = {"commit": "true" if should_commit else "false"}
        resp = requests.post(update_url, data=req_body, params=params, headers=headers)
    if not resp.ok:
        # Surface rejected updates instead of dropping them silently; the
        # response body is truncated to keep the log readable.
        sys.stderr.write("Solr update failed for doc {!r}: HTTP {} {}\n".format(
            doc_id, resp.status_code, resp.text[:200]))
    return resp.ok
        

# Load every English-language movie from the CSV into Solr.
# i counts every CSV row read, including the skipped non-English rows, so the
# progress messages report rows processed.
i = 0
should_commit = False
# newline="" lets the csv module handle line breaks inside quoted fields, as
# its documentation requires. The explicit encoding avoids depending on the
# platform default.
# NOTE(review): assumes the CSV is UTF-8 encoded -- confirm.
with open(MOVIES_DATA, "r", newline="", encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        if i % 1000 == 0:
            print("{:d} records ingested into Solr".format(i))
            # Ask for a commit with the next document that is actually sent;
            # the flag survives skipped rows until then.
            should_commit = True
        if row["original_language"] != "en":
            # only stick to english
            i += 1
            continue
        doc_id = int(row["id"])
        title = row["original_title"]
        description = row["overview"]
        popularity = get_float(row["popularity"], 0.0)
        release_date = row["release_date"]
        revenue = get_float(row["revenue"], 0.0)
        runtime = get_float(row["runtime"], 0.0)
        rating = get_float(row["vote_average"], 0.0)
        # look up keywords
        keywords = get_keywords(conn, doc_id)
        # parse out genres, keeping only those present in the lookup table
        genres = filter_genres(conn, parse_genres(row["genres"]))
        # add record to solr
        add_record_to_solr(SOLR_URL, doc_id, title, description, popularity,
                           release_date, revenue, runtime, rating, keywords, genres,
                           should_commit=should_commit)
        should_commit = False
        i += 1

# Final commit-only request (doc_id is None) to flush the documents sent
# since the last periodic commit.
add_record_to_solr(SOLR_URL, None, None, None, None, None, None, None, None, None, None,
                   should_commit=True)
print("{:d} records ingested into Solr, COMPLETE".format(i))

0 records ingested into Solr
1000 records ingested into Solr
2000 records ingested into Solr
3000 records ingested into Solr
4000 records ingested into Solr
5000 records ingested into Solr
6000 records ingested into Solr
7000 records ingested into Solr
8000 records ingested into Solr
9000 records ingested into Solr
10000 records ingested into Solr
11000 records ingested into Solr
12000 records ingested into Solr
13000 records ingested into Solr
14000 records ingested into Solr
15000 records ingested into Solr
16000 records ingested into Solr
17000 records ingested into Solr
18000 records ingested into Solr
19000 records ingested into Solr
20000 records ingested into Solr
21000 records ingested into Solr
22000 records ingested into Solr
23000 records ingested into Solr
24000 records ingested into Solr
25000 records ingested into Solr
26000 records ingested into Solr
27000 records ingested into Solr
28000 records ingested into Solr
29000 records ingested into Solr
30000 records ingested 