In [None]:
%%bash
. ~/.bashrc
python3 -m pip install -r ../requirements.txt
# Install necessary packages

In [138]:
import requests,json,csv,os

Movie_name = '8 mm'

API_key = '2f336ef7520a9304940fd71509ade61c'
#Use this method to return the tmdb-id (as a String) of a movie. It uses the defualt API_key which is
#set to my API_key. The input is a string of searchwords.

def get_tmdb_id_by_name(searchwords: str) -> str:
    #splits the string into each individual search word and creates query, by appending search words in query.
    words = searchwords.split(' ')
    query = 'https://api.themoviedb.org/3/search/movie/?api_key='+API_key+'&query='
    for i in range(len(words) - 1):
        query += words[i] + '+'
    query += words[len(words) - 1]

    #executes query, if response code is 200, then the query was successful
    response =  requests.get(query)
    if response.status_code==200: 
        array = response.json()
        #text contains the query as a json-file
        text = json.dumps(array)
        #print(text)
    else:
        print("No results for searchwords --> error")
        return ("No results for searchwords --> error")
    
    #dataset is json file converted into a list, containing the elements as either lists or maps
    dataset = json.loads(text)
    #extracts the movie id of the very first result
    # print(dataset)
    try:
        tmdb_id = dataset['results'][0]['id']
        return str(tmdb_id)
    except:
        return str(-1)
    # print(tmdb_id)

def get_tmdb_id_by_imdb_id(imdb_id:str) -> str:
    query = 'https://api.themoviedb.org/3/find/'+imdb_id+'?api_key='+API_key+'&external_source=imdb_id'
    response =  requests.get(query)
    if response.status_code==200:
        array = response.json()
        #text contains the query as a json-file
        text = json.dumps(array)
        #print(text)
    else:
        return str(-1)
    dataset = json.loads(text)
    #extracts the movie id of the very first result
    # print(dataset)
    try:
        tmdb_id = dataset['movie_results'][0]['id']
        return str(tmdb_id)
    except:
        return str(-1)

In [139]:
import numpy as np
pg_mapping = np.genfromtxt('./baseline/imdb_id_with_age_rating_list.txt', delimiter=',', dtype=str)

def conv_nan(row):
    if row[1] == 'no age rating found':
        row[1] = 'None'
    return row

# First apply the conv() filtering on the word count
pg_mapping = np.array([conv_nan(row) for row in pg_mapping])
pg_mapping

array([['tt0032138', 'None'],
       ['tt0035423', 'PG-13'],
       ['tt0038650', 'PG'],
       ['tt0045537', 'None']], dtype='<U19')

In [140]:
#Uses tmdb id as input and returns the age certification as a String
def get_metadata_by_tmdb_id(tmdb_id: str) -> str:
    if(tmdb_id == '-1'):
        return 'metadata not found'
    else:
        query = 'https://api.themoviedb.org/3/movie/'+tmdb_id+'?api_key='+API_key
        response =  requests.get(query)
        array = response.json()
        text = json.dumps(array)
        dataset = json.loads(text)
        return dataset

In [141]:
def get_age_certfication_by_tmdb_id(tmdb_id: str) -> str:
    age_rating = ''
    if(tmdb_id != '-1'):
        query = 'https://api.themoviedb.org/3/movie/'+tmdb_id+'/release_dates?api_key='+API_key
        response =  requests.get(query)
        array = response.json()
        text = json.dumps(array)
        dataset = json.loads(text)

        list_of_certs = dataset['results']
        index = -1
        # print(dataset)
        for i in range(len(list_of_certs)):
            if(list_of_certs[i]['iso_3166_1'] == 'US'):
                index = i
                break

        if(index != -1):
            age_rating = dataset['results'][index]['release_dates'][0]['certification']
    return age_rating

In [142]:
# Test query for one movie

def map_tmdb_metadata(imdb_id: str):
    tmdb_id = get_tmdb_id_by_imdb_id(imdb_id)
    if(tmdb_id == '-1'):
        return None
    else:
        metadata = get_metadata_by_tmdb_id(tmdb_id)
        # print(json.dumps(metadata, indent=2))
        if(metadata["id"] == None):
            return None
        else:
            metadata = {
                "tmdb_id": metadata["id"],
                "imdb_id": metadata["imdb_id"],
                "original_title": metadata["original_title"].replace("'", ""),
                "genres": metadata["genres"],
                "tagline": metadata["tagline"].replace("'", ""),
                "overview": metadata["overview"].replace("'", ""),
                "poster_path": metadata["poster_path"],
                "pg_rating": get_age_certfication_by_tmdb_id(tmdb_id),
            }
            return metadata
    # print(json.dumps(metadata, indent=2))

In [143]:
from tqdm import tqdm
inputFile = open('./baseline/movie1K_list.txt').read().split('\n')
elastic_output_file = open("elastic_metadata_init.json", "w")

output = []
result = {}


for idx, line in tqdm(enumerate(inputFile), total=len(inputFile)):
    try:
        imdb_id = line.strip()
        metadata = map_tmdb_metadata(imdb_id)
        output.append(metadata)
        value_str = str(metadata).replace("'", '"')
        elastic_output_file.write('{"index":{"_index": "movie_metadata", "_type":"_doc", "_id": "' + str(idx + 1) + '"}}' + '\n')
        elastic_output_file.write(value_str + '\n')    
    except:
        print("error for {imdb_id}")
    # if len(output) == 5:
    #     break
result = {
    "movie_dataset": output
}

100%|██████████| 1101/1101 [09:12<00:00,  1.99it/s]


In [144]:
resultFile = open('./movie_metadata.json', 'w')
resultFile.write(str(result).replace("'", '"'))

647605

In [145]:
%%bash
. ~/.bashrc
curl -u elasticuser:blaubarsch5 -H "Content-Type:application/x-ndjson" -XPOST https://app.leon-remke.jakob-hennighausen.melkonyan-davit.de/api/_bulk?pretty --data-binary "@elastic_metadata_init.json"

bash: line 1: /Users/leonremke/.bashrc: No such file or directory
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed


{
  "took" : 851,
  "errors" : true,
  "items" : [
    {
      "index" : {
        "_index" : "movie_metadata",
        "_type" : "_doc",
        "_id" : "1",
        "_version" : 2,
        "result" : "updated",
        "_shards" : {
          "total" : 2,
          "successful" : 1,
          "failed" : 0
        },
        "_seq_no" : 5,
        "_primary_term" : 1,
        "status" : 200
      }
    },
    {
      "index" : {
        "_index" : "movie_metadata",
        "_type" : "_doc",
        "_id" : "2",
        "_version" : 2,
        "result" : "updated",
        "_shards" : {
          "total" : 2,
          "successful" : 1,
          "failed" : 0
        },
        "_seq_no" : 6,
        "_primary_term" : 1,
        "status" : 200
      }
    },
    {
      "index" : {
        "_index" : "movie_metadata",
        "_type" : "_doc",
        "_id" : "3",
        "_version" : 2,
        "result" : "updated",
        "_shards" : {
          "total" : 2,
          "successful" :

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



        "status" : 400,
        "error" : {
          "type" : "mapper_parsing_exception",
          "reason" : "failed to parse",
          "caused_by" : {
            "type" : "json_parse_exception",
            "reason" : "Unexpected character ('c' (code 99)): was expecting comma to separate Object entries\n at [Source: (ByteArrayInputStream); line: 1, column: 396]"
          }
        }
      }
    },
    {
      "index" : {
        "_index" : "movie_metadata",
        "_type" : "_doc",
        "_id" : "313",
        "_version" : 1,
        "result" : "created",
        "_shards" : {
          "total" : 2,
          "successful" : 1,
          "failed" : 0
        },
        "_seq_no" : 296,
        "_primary_term" : 1,
        "status" : 201
      }
    },
    {
      "index" : {
        "_index" : "movie_metadata",
        "_type" : "_doc",
        "_id" : "314",
        "_version" : 1,
        "result" : "created",
        "_shards" : {
          "total" : 2,
          "successf

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



        "status" : 201
      }
    },
    {
      "index" : {
        "_index" : "movie_metadata",
        "_type" : "_doc",
        "_id" : "655",
        "_version" : 1,
        "result" : "created",
        "_shards" : {
          "total" : 2,
          "successful" : 1,
          "failed" : 0
        },
        "_seq_no" : 614,
        "_primary_term" : 1,
        "status" : 201
      }
    },
    {
      "index" : {
        "_index" : "movie_metadata",
        "_type" : "_doc",
        "_id" : "656",
        "_version" : 1,
        "result" : "created",
        "_shards" : {
          "total" : 2,
          "successful" : 1,
          "failed" : 0
        },
        "_seq_no" : 615,
        "_primary_term" : 1,
        "status" : 201
      }
    },
    {
      "index" : {
        "_index" : "movie_metadata",
        "_type" : "_doc",
        "_id" : "657",
        "_version" : 1,
        "result" : "created",
        "_shards" : {
          "total" : 2,
          "successful" : 1,

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



      "index" : {
        "_index" : "movie_metadata",
        "_type" : "_doc",
        "_id" : "971",
        "_version" : 1,
        "result" : "created",
        "_shards" : {
          "total" : 2,
          "successful" : 1,
          "failed" : 0
        },
        "_seq_no" : 909,
        "_primary_term" : 1,
        "status" : 201
      }
    },
    {
      "index" : {
        "_index" : "movie_metadata",
        "_type" : "_doc",
        "_id" : "972",
        "_version" : 1,
        "result" : "created",
        "_shards" : {
          "total" : 2,
          "successful" : 1,
          "failed" : 0
        },
        "_seq_no" : 910,
        "_primary_term" : 1,
        "status" : 201
      }
    },
    {
      "index" : {
        "_index" : "movie_metadata",
        "_type" : "_doc",
        "_id" : "973",
        "_version" : 1,
        "result" : "created",
        "_shards" : {
          "total" : 2,
          "successful" : 1,
          "failed" : 0
        },
        "

100 1098k  100  397k  100  700k  53478  94350  0:00:07  0:00:07 --:--:--  106k


In [17]:
from elasticsearch7 import Elasticsearch
# index key for further queries
index = "raw_movies"
# Elasticsearch credentials
user = "elasticuser"
pw = "blaubarsch5"
host = "https://app.leon-remke.jakob-hennighausen.melkonyan-davit.de/api/"

es = Elasticsearch(
    hosts=host,
    http_auth=(user, pw)
)

es.info()

{'name': 'ubuntu-elastic-search',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'C3IT-gleTX-yJIPo-o8IxA',
 'version': {'number': '7.17.8',
  'build_flavor': 'default',
  'build_type': 'deb',
  'build_hash': '120eabe1c8a0cb2ae87cffc109a5b65d213e9df1',
  'build_date': '2022-12-02T17:33:09.727072865Z',
  'build_snapshot': False,
  'lucene_version': '8.11.1',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [18]:
def get_rating(severities, swear_words):
    """Determine the age rating from the severities of the profanity words.
    Args:
        severities (list): List of the severities of the profanity words.
    Returns:
        str: The age rating.
    """
    if severities:
        if severities[0] == 0 and severities[1] <= 1 and severities[2] == 0 and severities[3] == 0:
            return "G"
        elif severities[0] == 0 and severities[1] <= 1 and severities[2] <= 1 and severities[3] == 0:
            return "PG"
        elif severities[0] <= 1 and severities[1] <= 1 and severities[3] <= 2 and len(swear_words) < 2:
            return "PG-13"
        elif severities[0] == 5 and severities[1] == 5 and severities[2] == 5 and severities[3] == 5:
            return "NC-17"
        else:
            return "R"
    else:
        return "G"

In [15]:
import tqdm
index = "movie_metadata"

# Read the textile file containing profanity counts for each document
with open('../baseline/output/imdb_id_with_profanity_list.txt', 'r') as f:
    lines = f.readlines()
success_count = 0
failure_count = 0
# Iterate through the lines and update each document with the new property
for line in lines:
    # Parse the line to extract the identifier and profanity counts
    line_parts = line.strip().split(',')
    imdb_id = line_parts[0]
    profanity_counts = {}
    for part in line_parts[1:]:
        key_value = part.strip().split(':')
        profanity_counts[key_value[0]] = int(key_value[1])
    
    # Define the search query to find the document with the matching identifier property
    search_query = {'query': {'match': {'imdb_id': imdb_id}}}
    
    # Retrieve the matching document from the index
    result = es.search(index=index, body=search_query)
    
    # Check if a single document was returned
    if result['hits']['total']['value'] == 1:
        doc_id = result['hits']['hits'][0]['_id']
        # Add the profanity counts as a new property to the document
        profanity_counts = {'profanity_counts': profanity_counts}
        try:
            es.update(index=index, id=doc_id, body={'doc': profanity_counts})
            success_count += 1
        except(Exception) as e:
            print('Error updating document with id: ' + doc_id + ' with error: ' + str(e))
            failure_count += 1
    else:
        failure_count += 1
        # print('No document found for identifier property: ' + imdb_id)
print('Successfully updated ' + str(success_count) + ' documents')
print('Failed to update ' + str(failure_count) + ' documents')


  result = es.search(index=index, body=search_query)
  es.update(index=index, id=doc_id, body={'doc': profanity_counts})


KeyboardInterrupt: 

In [31]:
import tqdm
import pandas as pd
el_index = "movie_metadata"

# Read the textile file containing profanity counts for each document
model_prediction = pd.read_csv('../../data/results_svm/svm_prediction.csv', sep=',', header='infer')
model_prediction.head()
success_count = 0
failure_count = 0
# Iterate through the lines and update each document with the new property
for index, line in model_prediction.iterrows():
    # Define the search query to find the document with the matching identifier property
    search_query = {'query': {'match': {'imdb_id': line["imdb_id"].strip()}}}
    
    # Retrieve the matching document from the index
    result = es.search(index=el_index, body=search_query)
    
    # Check if a single document was returned
    if result['hits']['total']['value'] == 1:
        doc_id = result['hits']['hits'][0]['_id']
        # Add the profanity counts as a new property to the document
        profanities = result['hits']['hits'][0]['_source'].get('profanity_counts', {})
        profanities = profanities if profanities is not None else []
        prediction_list = [line["prediction_alcohol"], line["prediction_frightening"], line["prediction_nudity"], line["prediction_profanity"], line["prediction_violence"]]
        prediction_dict = {
            "prediction_alcohol": line["prediction_alcohol"], 
            "prediction_frightening": line["prediction_frightening"],
            "prediction_nudity": line["prediction_nudity"],
            "prediction_profanity": line["prediction_profanity"],
            "prediction_violence": line["prediction_violence"],
            "prediction_rating": get_rating(prediction_list, profanities),
        }
       
        try:
            es.update(index=el_index, id=doc_id, body={'doc': prediction_dict})
            success_count += 1
        except(Exception) as e:
            print('Error updating document with id: ' + doc_id + ' with error: ' + str(e))
            failure_count += 1
    else:
        failure_count += 1
        # print('No document found for identifier property: ' + imdb_id)
print('Successfully updated ' + str(success_count) + ' documents')
print('Failed to update ' + str(failure_count) + ' documents')


  result = es.search(index=el_index, body=search_query)
  es.update(index=el_index, id=doc_id, body={'doc': prediction_dict})


Successfully updated 440 documents
Failed to update 35 documents
