# SOEN363 Project

## Drop the DB Indecies **(Only for Debugging Purposes)**

In [None]:
DROP INDEX actors_index;

## Drop the DB Views **(Only for Debugging Purposes)**

In [None]:
DROP VIEW high_ratings;

DROP VIEW low_ratings;

## Drop the DB tables **(Only for Debugging Purposes)**

In [None]:
DROP TABLE actors;

DROP TABLE genres;

DROP TABLE tags;

DROP TABLE tag_names;

DROP TABLE movies;

# Assignment Questions
## 2a

In [None]:
CREATE TABLE movies (
    mid             INTEGER UNIQUE, 
    title           VARCHAR, 
    year            INTEGER, 
    rating          REAL CHECK (rating BETWEEN 0.0 AND 5.0), 
    num_ratings     INTEGER,
    PRIMARY KEY     (mid, title, year)
);

CREATE TABLE actors (
    mid             INTEGER REFERENCES movies(mid), 
    name            VARCHAR, 
    cast_position   INTEGER,
    PRIMARY KEY     (mid, name)
);

CREATE TABLE genres (
    mid             INTEGER REFERENCES movies(mid), 
    genre           VARCHAR,
    PRIMARY KEY     (mid, genre)
);

CREATE TABLE tag_names (
    tid             INTEGER, 
    tag             VARCHAR,
    PRIMARY KEY     (tid)
);

CREATE TABLE tags (
    mid             INTEGER REFERENCES movies(mid), 
    tid             INTEGER,
    PRIMARY KEY     (mid, tid),
    FOREIGN KEY     (tid) REFERENCES tag_names(tid)
);

COPY movies FROM '/srv/soen363/phase1/movies.dat';

COPY actors FROM '/srv/soen363/phase1/actors.dat';

COPY genres FROM '/srv/soen363/phase1/genres.dat';

COPY tag_names FROM '/srv/soen363/phase1/tag_names.dat';

COPY tags FROM '/srv/soen363/phase1/tags.dat';

## 3a

In [None]:
-- Get the movie titles starring "Daniel Craig", sorted in an ascending alphabetical order
SELECT DISTINCT M.title AS "Movie Titles"
FROM actors A, movies M
WHERE
    M.mid = A.mid AND
    A.name = 'Daniel Craig'
ORDER BY M.title ASC;

Movie Titles
A Kid in King Arthur's Court
Archangel
Casino Royale
Elizabeth
Enduring Love
Infamous
Lara Croft: Tomb Raider
Layer Cake
Munich
Quantum of Solace


## 3b

In [None]:
-- Get the cast of the movie titled "The Dark Knight", sorted in ascending alphabetical order
SELECT DISTINCT A.name AS "Cast"
FROM actors A, movies M
WHERE
    M.mid = A.mid AND
    M.title = 'The Dark Knight'
ORDER BY A.name ASC;

Cast
Aaron Eckhart
Adam Kalesperis
Aidan Feore
Andrew Bicknell
Andy Luther
Anthony Michael Hall
Ariyon Bakare
Beatrice Rosen
Bill Smille
Brandon Lambdin


## 3c

In [None]:
-- Get the distinct genres in the database, alongside their number of occurrences (only for 1000+ occurrences), sorted ascending numerically in occurrences
SELECT DISTINCT G.genre, COUNT(M.*) AS number_of_movies_in_genre
FROM genres G, movies M
WHERE G.mid = M.mid
GROUP BY G.genre
HAVING COUNT(M.*) > 1000
ORDER BY COUNT(M.*) ASC;

genre,number_of_movies_in_genre
Adventure,1003
Crime,1086
Action,1445
Romance,1644
Thriller,1664
Comedy,3566
Drama,5076


## 3d

In [None]:
-- For each year, print the movie title, year, and rating, sorted in the ascending order of year and the descending order of movie rating
SELECT M.title, M.year, M.rating
FROM movies M
WHERE M.num_ratings > 0999
ORDER BY M.year ASC, M.rating DESC;

title,year,rating
The Birth of a Nation,1915,3.3
Intolerance: Love's Struggle Throughout the Ages,1916,3.8
Broken Blossoms or The Yellow Man and the Girl,1919,3.7
Das Cabinet des Dr. Caligari.,1920,4.1
"Dr. Mabuse, der Spieler - Ein Bild der Zeit",1922,4.1
"Dr. Mabuse, der Spieler - Ein Bild der Zeit",1922,4.1
"Nosferatu, eine Symphonie des Grauens",1922,3.9
Häxan,1922,3.8
Nanook of the North,1922,3.7
Der letzte Mann,1924,4.1


## 3e

In [None]:
-- Get the titles of movies whose tags include at least one occurrence of the word "bad" and another of "good"
SELECT M.title
FROM movies M, tags T, tag_names TN
WHERE 
    M.mid = T.mid AND T.tid = TN.tid AND
    TN.tag IN (
        SELECT TN.tag 
        FROM tag_names TN
        WHERE TN.tag LIKE 'good%'
        UNION
        SELECT TN.tag 
        FROM tag_names TN
        WHERE TN.tag LIKE 'bad%'
    );

title
Chung Hing sam lam
Down Periscope
Bad Boys
Die Hard: With a Vengeance
Nine Months
Clerks.
Houseguest
Miracle on 34th Street
Pulp Fiction
Pulp Fiction


## 3f

In [None]:
-- i. Get the information for the movies with the highest number of ratings, sorted ascending by movie ID
SELECT M.mid, M.title, M.year, M.rating, M.num_ratings
FROM movies M
WHERE M.num_ratings IN (
    SELECT MAX(M.num_ratings)
    FROM movies M
)
ORDER BY M.mid;

mid,title,year,rating,num_ratings
4201,Pirates of the Caribbean: At World's End,2007,3.8,1768593
53125,Pirates of the Caribbean: At World's End,2007,3.8,1768593


In [None]:
-- ii. Get the information for the movies with the highest rating, sorted ascending by movie ID
SELECT M.mid, M.title, M.year, M.rating, M.num_ratings
FROM movies M
WHERE M.rating IN (
    SELECT MAX(M.rating)
    FROM movies M
)
ORDER BY M.mid;

mid,title,year,rating,num_ratings
4311,1732 Høtten,1998,5.0,5


In [None]:
-- iii. Get the information for the movies with BOTH the highest rating AND the highest number of ratings (if it exists)

From the data returned from the last two queries, we know that this isn't the case; the movie with the highest rating isn't among the movies with the highest number of ratings.

Highest number of ratings: mid 4201/53125, title Pirates of the Caribbean: At World's End

Highest rating: mid 4311, title 1732 Høtten

In [2]:
-- iv. Get the information for the movies with the lowest rating, sorted by ascending movie ID
SELECT M.mid, M.title, M.year, M.rating, M.num_ratings
FROM movies M
WHERE M.rating IN (
    SELECT MIN(M.rating)
    FROM movies M
    WHERE M.num_ratings > 0
)
ORDER BY M.mid;

mid,title,year,rating,num_ratings
4230,Too Much Sleep,1997,1.5,3


In [None]:
-- v. Get the information for the movies with BOTH the lowest AND the highest number of ratings (if it exists)

From the data returned from the last query, we know that the lowest rating (for a movie which has been rated more than 0 times) is 1.5
This means that it is not the case for the movie with the highest number of ratings (currently 1'768'593 ratings at 3.8) to be the movie with the lowest rating (currently 3 ratings at 1.5).

In [None]:
-- vi. Present your findings on the hypothesis: are popular movies (the most ratings) among the best/worst movies (lowest or highest rating score)?

According to the results from the previous queries, it can be safely said that the hypothesis is false for this database.

## 3g

In [25]:
-- Get the year, title and rating of the lowest-rated and highest-rated movies for each year, between 2005 and 2011 inclusively. If tied, use ascending title

SELECT M2.year, M2.title, M2.rating
FROM movies M2
INNER JOIN (
    SELECT M1.year, MAX(M1.rating) as highest_rating
    FROM movies M1
    WHERE M1.year BETWEEN 2005 AND 2011 AND M1.num_ratings > 0
    GROUP BY M1.year
    ORDER BY M1.year ASC
) highest ON (highest.year = M2.year AND highest.highest_rating = M2.rating)

UNION

SELECT M4.year, M4.title, M4.rating
FROM movies M4
INNER JOIN (
    SELECT M3.year, MIN(M3.rating) as lowest_rating
    FROM movies M3
    WHERE M3.year BETWEEN 2005 AND 2011 AND M3.num_ratings > 0
    GROUP BY M3.year
    ORDER BY M3.year ASC
) lowest ON (lowest.year = M4.year AND lowest.lowest_rating = M4.rating)

ORDER BY year ASC, rating ASC, title ASC

year,title,rating
2005,Alone in the Dark,2.1
2005,Son of the Mask,2.1
2005,No Direction Home: Bob Dylan,4.3
2006,Basic Instinct 2,2.5
2006,Bug,2.5
2006,Doogal,2.5
2006,Das Leben der Anderen,4.4
2007,D-War,2.3
2007,Byôsoku 5 senchimêtoru,4.3
2007,No End in Sight,4.3


Only movies with more than 0 ratings were considered in this query. As no ratings were given to movies in 2010 and 2011, no movies from those years appear in the results. Removing the constraint will make the results appear, albeit with their ratings being 0.0.

## 3h

In [None]:
-- High Ratings view to get actors that have participated in a movie with a rating >= 4
CREATE OR REPLACE VIEW high_ratings AS
    SELECT DISTINCT A.name 
    FROM actors A, movies M
    WHERE A.mid = m.mid AND m.rating >= 4; 

-- Low Ratings view to get actors that have participated in a movie with a rating < 4
CREATE OR REPLACE VIEW low_ratings AS
    SELECT DISTINCT A.name 
    FROM actors A, movies M
    WHERE A.mid = m.mid AND m.rating < 4;

-- Get the actors count coming from High Ratings 
SELECT COUNT(*) as result_count
FROM high_ratings;

-- Get the actors count coming from Low Ratings
SELECT COUNT(*) as result_count
FROM low_ratings;

In [None]:
-- Get the number of No Flop actors in the database
SELECT COUNT(*)
FROM high_ratings
WHERE name NOT IN (
    SELECT name
    FROM low_ratings
);

In [None]:
-- Get the name and movie count of the Top 10 No Flop actors
SELECT R.name, COUNT(A.*) as movie_count
FROM high_ratings R, actors A
WHERE 
    R.name NOT IN (
        SELECT name
        FROM low_ratings
    ) AND 
    A.name = R.NAME
GROUP BY R.name
ORDER BY COUNT(A.*) DESC
LIMIT 10;

## 3i

In [None]:
SELECT 
    A.name, 
    MIN(M.year) as start, 
    MAX(M.year) as end, 
    MAX(M.year) - MIN(M.year) as longevity
FROM actors A, movies M
WHERE A.mid = M.mid
GROUP BY A.name 
ORDER BY MAX(M.year) - MIN(M.year) DESC
LIMIT 1;

## 3j

In [4]:
-- Returns the distinct names of actors who played in at least one movie with Annette Nicole.
CREATE VIEW co_actors AS
SELECT DISTINCT name
FROM actors 
WHERE mid IN (SELECT mid FROM actors
            WHERE name = 'Annette Nicole');

-- Print the number of rows in this view. 
SELECT COUNT(name)
FROM co_actors;

count
179


In [5]:
-- Returns all possible combinations of co_actors and the movie ids in which Annette Nicole played.
CREATE VIEW all_combinations AS
SELECT DISTINCT co_actors.name, actors.mid
FROM co_actors, actors
WHERE actors.mid IN (SELECT mid FROM actors
            WHERE name = 'Annette Nicole')
ORDER BY co_actors.name;

-- Print the number of rows in this view. 
SELECT COUNT(name)
FROM all_combinations;

count
537


In [6]:
-- Remove all legitimate pairs from all_combinations.
CREATE VIEW non_existent AS
SELECT * FROM all_combinations
WHERE NOT EXISTS (SELECT * 
FROM actors
WHERE all_combinations.name = actors.name AND all_combinations.mid = actors.mid);

-- Print the number of rows in this view.
SELECT COUNT(name)
FROM non_existent;

count
239


In [3]:
-- Eliminate the distinct actors from co_actors that appear in the view non_existent.
-- Print the names of all co_actors except Annette Nicole.

SELECT * FROM co_actors
WHERE NOT EXISTS (SELECT * 
FROM non_existent
WHERE non_existent.name = co_actors.name) AND NOT co_actors.name = 'Annette Nicole';

name
Christian Perry
Kristen Connolly


## 3k

## 3l

## 3m

In [None]:
-- Detecting duplicates
SELECT title, year, rating, num_ratings, COUNT(*) occurrences
FROM movies
GROUP BY title, year, rating, num_ratings
HAVING COUNT(*) > 1;

title,year,rating,num_ratings,occurrences
Singin' in the Rain,1952,3.9,28033.0,3
The Last American Virgin,1982,3.3,3330.0,2
The Best Man,2005,0.0,0.0,2
Chocolat,2000,3.6,62492.0,2
The Bridges of Madison County,1995,3.6,11573.0,2
Mighty Joe Young,1998,2.8,10755.0,2
Bill & Ted's Bogus Journey,1991,3.0,17201.0,2
The Devil-Doll,1936,0.0,0.0,2
Gegen die Wand,2004,0.0,0.0,2
Stuart Little 2,2002,2.7,10893.0,2


In [None]:
-- Creating a view that contains no duplicates
CREATE VIEW no_duplicates AS 
SELECT DISTINCT title, year, rating, num_ratings
FROM movies
GROUP BY title, year, rating, num_ratings;

## 4a

In [None]:
CREATE INDEX actors_index ON actors(name);

## 4b

In [None]:
-- Profiling Results:

--  TBD

## 4c

## 4d