In [14]:
# General Imports
import os
import random
import requests
import re
import string
from collections import defaultdict
import json
import airflow

# Scraping and dataframe imports
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import Levenshtein
import tmdbsimple as tmdb

In [15]:
## This is the first of several scripts that scrape movie data and feed an ML model.
## It is split by functionality to make it easier to version and to use in orchestration.

In [16]:
## API Key related functions
def closest_key(query, dictionary):
    """Return the value stored under the key most similar to ``query``.

    Similarity is the Levenshtein edit distance between ``query`` and each
    key. The key with the smallest distance wins; on a tie, the key that
    comes first in the dictionary's iteration order is used.

    Args:
        query: String to compare against the dictionary's keys.
        dictionary: Mapping whose keys are strings.

    Returns:
        The value associated with the nearest key.

    Raises:
        ValueError: If ``dictionary`` is empty (raised by ``min``).
    """
    def edit_distance(candidate):
        return Levenshtein.distance(query, candidate)

    best_match = min(dictionary, key=edit_distance)
    return dictionary[best_match]
def find_repeated_first_elements(tuples_list):
    """Group tuples by their first element and keep only the repeated groups.

    Args:
        tuples_list: Iterable of indexable items (e.g. tuples); item ``[0]``
            is used as the grouping key and must be hashable.

    Returns:
        A dict mapping each first element that occurs more than once to the
        list of items sharing it, in their original order. Keys appear in
        the order in which they were first seen.
    """
    grouped = {}
    for item in tuples_list:
        grouped.setdefault(item[0], []).append(item)

    return {
        first: members
        for first, members in grouped.items()
        if len(members) > 1
    }

In [19]:
### Scraping related functions
def get_soup(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection, which would hang the whole scrape.

    Returns:
        A ``BeautifulSoup`` tree built with the built-in ``html.parser``.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    # The HTTP status is deliberately not checked: error pages are parsed
    # like any other page, as before.
    return BeautifulSoup(response.content, 'html.parser')
def get_all_movies(url):
    """Collect ``(title, href, year)`` tuples from the ``<li>`` items of a page.

    The list items are scanned twice:

    1. Using only the first link of each item, with the year taken from a
       parenthesised four-digit number anywhere in the item's text.
    2. Using every link of each item, with the year taken from a four-digit
       number in the link's own text; when a link has no ``title``
       attribute, the text of the item's first ``<i>`` element is used.

    A trailing " (... film ...)" / " (... movie ...)" qualifier is stripped
    from titles. Entries without a recognisable year are dropped.

    Args:
        url: Address of the list page to scrape.

    Returns:
        A list of unique ``(title, href, year)`` tuples; ``year`` is a
        string. The order is unspecified because duplicates are removed
        through a ``set``.
    """
    film_suffix = re.compile(r' \([^)]*(film|movie)[^)]*\)')

    soup = get_soup(url)
    list_items = soup.find_all('li')
    movie_info = []

    # Pass 1: first link of each item, year from "(YYYY)" in the item text.
    for li in list_items:
        a = li.find('a')
        if not a:
            continue

        title = a.get('title')
        # A link without a usable title attribute cannot be named. Skip it
        # explicitly rather than relying on a bare ``except`` around re.sub.
        if not isinstance(title, str):
            continue

        year_match = re.search(r'\((\d{4})\)', li.text)
        if not year_match:
            continue

        movie_info.append(
            (film_suffix.sub('', title), a.get('href'), year_match.group(1))
        )

    # Pass 2: every link of each item, year from the link text itself.
    for li in list_items:
        italic = li.find('i')
        main_title = italic.text if italic else ''

        for a in li.find_all('a'):
            year_match = re.search(r'(\d{4})', a.text)
            if not year_match:
                continue

            title = film_suffix.sub('', a.get('title', main_title))
            movie_info.append((title, a.get('href', ''), year_match.group(1)))

    return list(set(movie_info))

## Getting snippets off of the wikipedia pages
def chunk_text(text, n, overlap):
    """Split ``text`` into windows of at most ``n`` characters.

    Consecutive windows start ``n - overlap`` characters apart, so each
    window repeats the last ``overlap`` characters of the previous one.
    The final windows may be shorter than ``n``.

    Args:
        text: String to split.
        n: Maximum number of characters per chunk.
        overlap: Number of characters shared by neighbouring chunks; must
            be smaller than ``n``.

    Returns:
        A list of substrings of ``text`` (empty when ``text`` is empty).

    Raises:
        ValueError: If ``overlap`` is not smaller than ``n``. An exception
            is raised instead of returning the message as a string, because
            a returned string would be mistaken for chunk data by callers.
    """
    if overlap >= n:
        raise ValueError("Overlap must be smaller than chunk size")

    step = n - overlap
    # Slicing clamps at the end of the string, so no explicit min() is needed.
    return [text[start:start + n] for start in range(0, len(text), step)]

def get_page_content(href):
    """Fetch a Wikipedia page and return its text as one string.

    The result is the concatenation, in this order, of:

    1. the text of every ``<p>`` inside the ``mw-content-text`` div,
    2. the header and data cell text of each row of the first ``infobox``
       table, each followed by a newline,
    3. the text of the ``<li>`` items of the first ``<ul>`` inside the
       first ``div-col`` div.

    Sections that are not present on the page contribute nothing.

    Args:
        href: Page path relative to ``https://en.wikipedia.org/``, with or
            without a leading slash (e.g. ``/wiki/Alien_(film)``).

    Returns:
        The collected text; an empty string if none of the sections exist.
    """
    # hrefs scraped from Wikipedia start with "/", which would otherwise
    # produce "https://en.wikipedia.org//wiki/...".
    path = str(href).lstrip('/')
    soup = get_soup(f'https://en.wikipedia.org/{path}')

    parts = []

    article_container = soup.find('div', {'id': 'mw-content-text'})
    # find() returns None when the div is missing; skip instead of crashing.
    if article_container is not None:
        parts.extend(paragraph.text for paragraph in article_container.find_all('p'))

    infobox = soup.find('table', {'class': 'infobox'})
    if infobox:
        for row in infobox.find_all('tr'):
            # Header cell of the row, if any.
            th = row.find('th')
            if th:
                parts.append(th.get_text() + "\n")

            # First data cell of the row, if any.
            td = row.find('td')
            if td:
                parts.append(td.get_text() + "\n")

    div_tag = soup.find('div', {'class': 'div-col'})
    ul_tag = div_tag.find('ul') if div_tag else None
    if ul_tag:
        parts.extend(li_tag.get_text() for li_tag in ul_tag.find_all('li'))

    # One join instead of repeated string concatenation.
    return ''.join(parts)

## *** This doesn't seem like it ever gets used? ***
def get_chunked_embeddings(href, embedding_model):
    """Fetch the page at ``href``, split its text, and embed the pieces.

    NOTE(review): ``chunker`` is not defined in the visible part of this
    notebook; it has to be provided elsewhere before this function is
    called. Confirm, or switch to ``chunk_text``.

    Args:
        href: Page path passed on to ``get_page_content``.
        embedding_model: Object exposing ``encode(chunks)``.

    Returns:
        Whatever ``embedding_model.encode`` returns for the chunks.
    """
    text_chunks = chunker(get_page_content(href))
    return embedding_model.encode(text_chunks)

## TMDB scraping related functions
def id2name(id_list):
    """Translate genre ids to names using the global ``movie_genre_dict``.

    Args:
        id_list: Iterable of keys present in ``movie_genre_dict``.

    Returns:
        A list with the mapped value for each id, in input order.

    Raises:
        KeyError: If an id is not in ``movie_genre_dict``.
    """
    names = []
    for genre_id in id_list:
        names.append(movie_genre_dict[genre_id])
    return names

def modify_metadata(metadata):
    """Clean a metadata dict in place and return it.

    Every key whose value is ``None`` is removed, then the ``"genre_ids"``
    entry is replaced by the result of ``id2name`` on it.

    Args:
        metadata: Dict to clean; it is mutated. Must still contain
            ``"genre_ids"`` after the ``None`` values are removed.

    Returns:
        The same ``metadata`` object that was passed in.

    Raises:
        KeyError: If ``"genre_ids"`` is missing (or was ``None``).
    """
    # Collect first: a dict must not change size while being iterated.
    none_keys = [key for key, value in metadata.items() if value is None]
    for key in none_keys:
        metadata.pop(key)

    metadata["genre_ids"] = id2name(metadata["genre_ids"])
    return metadata

In [20]:
# Credentials are read from a local JSON file that holds the API keys for TMDB,
# OpenAI and Pinecone plus the Postgres username/password. Only the TMDB key is used here.
# A context manager closes the file handle as soon as the JSON has been parsed.
with open("./Credentials/api_keys.json", encoding="utf-8") as key_file:
    api_keys = json.load(key_file)
tmdb.API_KEY = api_keys["TMDB_API_KEY"]

In [None]:
# Language codes TMDB uses, mapped to their English names.
# "in" is the legacy code for Indonesian and is kept for compatibility;
# "id" is the current ISO 639-1 code.
language_codes = {"af": "Afrikaans", "sq": "Albanian", "ar": "Arabic", "eu": "Basque", "bg": "Bulgarian",
                  "ca": "Catalan", "cn": "Chinese", "zh": "Chinese", "hr": "Croatian", "cs": "Czech", "da": "Danish", "nl": "Dutch",
                  "en": "English", "et": "Estonian", "fi": "Finnish", "fr": "French", "de": "German", "el": "Greek",
                  "he": "Hebrew", "hi": "Hindi", "hu": "Hungarian", "is": "Icelandic", "in": "Indonesian", "id": "Indonesian",
                  "it": "Italian", "ja": "Japanese", "ko": "Korean", "lv": "Latvian", "lt": "Lithuanian", "mk": "Macedonian",
                  "ms": "Malay", "no": "Norwegian", "pl": "Polish", "pt": "Portuguese", "rm": "Raeto-Romance", "ro": "Romanian",
                  "ru": "Russian", "sr": "Serbian", "sk": "Slovak", "sl": "Slovenian", "es": "Spanish", "sv": "Swedish",
                  "th": "Thai", "tr": "Turkish", "vi": "Vietnamese"}

In [6]:
# We get a list of all movies from Wikipedia, which serves as the source of information used in the semantic
# search aspect of this tool.

In [22]:
# Wikipedia splits its "List of films" index into one page per letter or
# letter range; "%E2%80%93" is the URL-encoded en dash used in the ranges.
all_urls = [
    f"https://en.wikipedia.org/wiki/List_of_films:_{section}"
    for section in (
        "A", "B", "C", "D", "E", "F", "G", "H", "I",
        "J%E2%80%93K",
        "L", "M",
        "N%E2%80%93O",
        "P",
        "Q%E2%80%93R",
        "S", "T",
        "U%E2%80%93W",
        "X%E2%80%93Z",
    )
]

# Scrape every index page and pool the (title, href, year) tuples.
all_movies = []
for url in all_urls:
    all_movies.extend(get_all_movies(url))

In [None]:
# tmdbsimple Search handle; the ingestion loop calls search.movie(query=...) on it.
search = tmdb.Search()

In [None]:
# unembedded_data maps each vector id to the raw text chunk it was built from;
# all_data collects one record per chunk (id, dense values, sparse values, metadata).
unembedded_data = {}
all_data = []
# Each movie is indexed as [0] title, [1] href, [2] year.
# presumably d_subset is a slice of all_movies; it is defined outside this view, so confirm.
for movie in tqdm(d_subset, total=len(d_subset)):
    # Look the title up on TMDB.
    tmdb_responses = search.movie(query=movie[0])["results"]
    if not tmdb_responses:
        print(f"{movie[0]} was not found in db")
        continue

    # Keep the result whose release year matches the scraped year. There is no
    # break, so when several results match, the last one wins.
    metadata = None
    for result in tmdb_responses:
        # release_date may be missing or empty; treat that as "no match"
        # instead of raising KeyError/TypeError.
        if (result.get("release_date") or "")[:4] == movie[2]:
            metadata = result
    if metadata is None:
        continue
    metadata = modify_metadata(metadata)

    page_content = get_page_content(movie[1])
    chunks = chunk_text(page_content, 384, 20)
    if not chunks:
        # Nothing to embed for a page without text.
        continue

    chunked_embeddings = dense_model.encode(chunks)
    input_ids = tokenizer(
        chunks, return_tensors='pt',
        padding=True, truncation=True
    )
    with torch.no_grad():
        # NOTE(review): squeeze() without a dim also drops a size-1 leading
        # dimension, so a page with exactly one chunk may lose the per-chunk
        # axis that sparse_vecs[chunk_index] relies on below. Confirm the
        # shape of 'd_rep'.
        sparse_vecs = sparse_model(
            d_kwargs=input_ids.to(device)
        )['d_rep'].squeeze()

    # A separate name avoids shadowing by the per-chunk counter.
    for chunk_index, embedding in enumerate(chunked_embeddings):
        new_id = f"{metadata['id']}-{chunk_index}"
        dense_embedding = embedding.tolist()
        sparse_vec = sparse_vecs[chunk_index]
        # NOTE(review): with exactly one non-zero entry, squeeze() yields a
        # 0-d tensor and tolist() returns a scalar rather than a list. Confirm
        # whether this can occur.
        indices = sparse_vec.nonzero().squeeze().cpu().tolist()  # positions
        values = sparse_vec[indices].cpu().tolist()  # weights/scores
        # build sparse values dictionary
        sparse_values = {
            "indices": indices,
            "values": values
        }
        # Every chunk of a movie shares the same metadata dict object.
        all_data.append(
            {"id": new_id,
             "values": dense_embedding,
             "sparse_values": sparse_values,
             "metadata": metadata}
        )
        unembedded_data[new_id] = chunks[chunk_index]