In [1]:
import collections
import csv
import json
import os
import sqlite3

In [2]:
DATA_DIR = "../../data"
MOVIES_DATA = os.path.join(DATA_DIR, "movies_metadata.csv")
KEYWORDS_DATA = os.path.join(DATA_DIR, "keywords.csv")

LOOKUPS_DB = os.path.join(DATA_DIR, "lookups.db")

In [3]:
i = 0
unique_genres = set()
with open(MOVIES_DATA, "r") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        if i % 1000 == 0:
            print("{:d} movies read".format(i))
        genres_val = row["genres"]
        genres_val = genres_val.replace("'", "\"")
        genres_json = json.loads(genres_val)
        for genre_idname in genres_json:
            genre_name = genre_idname["name"]
            if len(genre_name.split(" ")) > 1:
                continue
            unique_genres.add(genre_name)
        i += 1
print("{:d} movies read, COMPLETE".format(i))
print(unique_genres)
print(len(unique_genres))

0 movies read
1000 movies read
2000 movies read
3000 movies read
4000 movies read
5000 movies read
6000 movies read
7000 movies read
8000 movies read
9000 movies read
10000 movies read
11000 movies read
12000 movies read
13000 movies read
14000 movies read
15000 movies read
16000 movies read
17000 movies read
18000 movies read
19000 movies read
20000 movies read
21000 movies read
22000 movies read
23000 movies read
24000 movies read
25000 movies read
26000 movies read
27000 movies read
28000 movies read
29000 movies read
30000 movies read
31000 movies read
32000 movies read
33000 movies read
34000 movies read
35000 movies read
36000 movies read
37000 movies read
38000 movies read
39000 movies read
40000 movies read
41000 movies read
42000 movies read
43000 movies read
44000 movies read
45000 movies read
45466 movies read, COMPLETE
{'History', 'Comedy', 'Animation', 'Mystery', 'GoHands', 'Action', 'Aniplex', 'War', 'Family', 'Thriller', 'Horror', 'Crime', 'Drama', 'Adventure', 'Foreign'

In [4]:
def table_exists(conn, table_name):
    cur = conn.cursor()
    cur.execute("select name from sqlite_master where type='table' and name = ?", 
                [table_name])
    rows = cur.fetchall()
    cur.close()
    return len(rows) > 0


def create_genres_table(conn):
    if not table_exists(conn, "genres"):
        cur = conn.cursor()
        create_table = """
            CREATE TABLE genres(
                gid INTEGER NOT NULL PRIMARY KEY,
                gname VARCHAR(32) NOT NULL
            )
        """
        create_index = """
            CREATE UNIQUE INDEX ux1_genres ON genres(gname)
        """
        cur.execute(create_table)
        cur.execute(create_index)
        cur.close()


def insert_genre(conn, gid, gname):
    cur = conn.cursor()
    insert_sql = """
        INSERT INTO genres(gid, gname) VALUES (?, ?)
    """
    cur.execute(insert_sql, [gid, gname])
    cur.close()



conn = sqlite3.connect(LOOKUPS_DB)
create_genres_table(conn)

In [5]:
num_unique_genres = 0
for idx, genre_name in enumerate(list(unique_genres)):
    print(idx, genre_name)
    insert_genre(conn, idx, genre_name)
    num_unique_genres += 1

print("number of unique genres: {:d}".format(num_unique_genres))

conn.commit()
conn.close()

0 History
1 Comedy
2 Animation
3 Mystery
4 GoHands
5 Action
6 Aniplex
7 War
8 Family
9 Thriller
10 Horror
11 Crime
12 Drama
13 Adventure
14 Foreign
15 Western
16 Romance
17 Fantasy
18 Documentary
19 Music
number of unique genres: 20
