# SQL Assignment

In [1]:
import pandas as pd
import sqlite3

In [2]:
# Open the assignment database read-only through a SQLite URI.
# With a plain path, sqlite3.connect() silently creates a new empty database
# when the file is missing, and the failure only surfaces later as
# "no such table". With mode=ro a missing or misnamed file raises here,
# and the notebook (which only runs SELECTs) cannot modify the data.
conn = sqlite3.connect("file:Db-IMDB-Assignment.db?mode=ro", uri=True)

## Sample Code

In [3]:
%%time
# Write your sql query below

query = """
        SELECT TRIM(Movie.title) AS 'Movie_Name'
        FROM Movie
        WHERE Movie.rating < 3
        """

q = pd.read_sql_query(query, conn)    
print(q.shape)
q.head()

(85, 1)
Wall time: 102 ms


Unnamed: 0,Movie_Name
0,Mastizaade
1,Dragonball Evolution
2,Loveyatri
3,Race 3
4,Gunday


## Q1 --- List all the directors who directed a 'Comedy' movie in a leap year. (You need to check that the genre is 'Comedy' and the year is a leap year.) Your query should return the director name, the movie name, and the year.

In [4]:
%%time
# Write your sql query below

query = """
        SELECT P.Name AS 'Director', M.title, M."year"
        FROM Person P 
        JOIN M_Director MD ON MD.PID = P.PID 
        JOIN Movie M ON M.MID = MD.MID 
        JOIN M_Genre MG ON MG.MID = M.MID
        JOIN Genre G ON G.GID = MG.GID 
        WHERE G.Name LIKE '%Comedy%'
        AND M."year" % 4 = 0
        """

q1 = pd.read_sql_query(query, conn)    
print(q1.shape)
q1.head()

(246, 3)
Wall time: 82.8 ms


Unnamed: 0,Director,title,year
0,Milap Zaveri,Mastizaade,2016
1,Danny Leiner,Harold & Kumar Go to White Castle,2004
2,Anurag Kashyap,Gangs of Wasseypur,2012
3,Frank Coraci,Around the World in 80 Days,2004
4,Griffin Dunne,The Accidental Husband,2008


## Q2 --- List the names of all the actors who played in the movie 'Anand' (1971)

In [5]:
%%time
# Write your sql query below

query = """
        SELECT P.Name
        FROM Person P 
        JOIN M_Cast MC ON P.PID = TRIM(MC.PID) 
        JOIN Movie M ON M.MID = MC.MID 
        WHERE M.title = 'Anand' AND M."year" = 1971
        """

q2 = pd.read_sql_query(query, conn)    
print(q2.shape)
q2.head()

(17, 1)
Wall time: 215 ms


Unnamed: 0,Name
0,Amitabh Bachchan
1,Rajesh Khanna
2,Brahm Bhardwaj
3,Ramesh Deo
4,Seema Deo


## Q3 --- List all the actors who acted in a film before 1970 and in a film after 1990. (That is: < 1970 and > 1990.)

In [6]:
%%time
# Write your sql query below

query = """
        SELECT P.Name AS 'Actor'
        FROM Person P
        JOIN M_Cast MC ON P.PID = TRIM(MC.PID)
        JOIN Movie M ON M.MID = MC.MID 
        WHERE M."year" < 1970 or M."year" > 1990
        """

q3 = pd.read_sql_query(query, conn)    
print(q3.shape)
q3.head()

(67747, 1)
Wall time: 409 ms


Unnamed: 0,Actor
0,Christian Bale
1,Cate Blanchett
2,John Benfield
3,Lorna Brown
4,Patrick Godfrey


## Q4 --- List all directors who directed 10 movies or more, in descending order of the number of movies they directed. Return the directors' names and the number of movies each of them directed.

In [7]:
%%time
# Write your sql query below

query = """
        SELECT P.Name, VW.Movie_Count 
        FROM Person P 
        JOIN 
        (SELECT MD.PID, COUNT(*) AS Movie_Count 
        FROM M_Director MD 
        GROUP BY MD.PID 
        HAVING COUNT(*) > 10 )VW ON P.PID = VW.PID;
        """

q4 = pd.read_sql_query(query, conn)    
print(q4.shape)
q4.head()

(45, 2)
Wall time: 19 ms


Unnamed: 0,Name,Movie_Count
0,Mahesh Manjrekar,15
1,Satish Kaushik,12
2,Anurag Kashyap,13
3,Yash Chopra,21
4,Subhash Ghai,18


## Q5.a --- For each year, count the number of movies in that year that had only female actors.

In [8]:
%%time
# Write your sql query below

query = """
        SELECT M."year", COUNT(*) AS Movie_Count 
        FROM Movie M
        JOIN
        (
            SELECT DISTINCT MID 
            FROM M_Cast  
            WHERE MID IN 
            (
                SELECT MC.MID 
                FROM M_Cast MC 
                JOIN Person P ON P.PID = TRIM(MC.PID)
                WHERE P.Gender = 'Female'
            )
        )VW ON VW.MID = M.MID
        GROUP BY M."year"
        """

q5a = pd.read_sql_query(query, conn)    
print(q5a.shape)
q5a.head()

(125, 2)
Wall time: 238 ms


Unnamed: 0,year,Movie_Count
0,1931,1
1,1936,3
2,1939,2
3,1941,1
4,1943,1


## Q5.b --- Now include a small change: report for each year the percentage of movies in that year with only female actors, and the total number of movies made that year. For example, one answer will be: 1990 31.81 13522 meaning that in 1990 there were 13,522 movies, and 31.81% had only female actors. You do not need to round your answer.

In [9]:
%%time
# Write your sql query below

query = """
        SELECT M.YEAR, I_VW.FEMALE_ONLY_CAST_MOVIE_COUNT, (I_VW.FEMALE_ONLY_CAST_MOVIE_COUNT*100/(COUNT(M.MID)*1.0)) AS PERCENTAGE_FEMALE_ONLY_CAST_MOVIE
        FROM Movie M 
        LEFT JOIN
        (
            SELECT M.YEAR, COUNT(*) AS FEMALE_ONLY_CAST_MOVIE_COUNT 
            FROM Movie M
            JOIN
            (
                SELECT DISTINCT MID 
                FROM M_Cast  
                WHERE MID IN 
                (
                    SELECT MC.MID 
                    FROM M_Cast MC 
                    JOIN Person P ON P.PID = TRIM(MC.PID)
                    WHERE P.Gender = 'Female'
                )
            )VW ON VW.MID = M.MID
            GROUP BY M.YEAR
        )I_VW ON M.YEAR = I_VW.YEAR
        GROUP BY M.YEAR
        """

q5b = pd.read_sql_query(query, conn)    
print(q5b.shape)
q5b.head()

(125, 3)
Wall time: 267 ms


Unnamed: 0,year,FEMALE_ONLY_CAST_MOVIE_COUNT,PERCENTAGE_FEMALE_ONLY_CAST_MOVIE
0,1931,1,100.0
1,1936,3,100.0
2,1939,2,100.0
3,1941,1,100.0
4,1943,1,100.0


## Q6 --- Find the film(s) with the largest cast. Return the movie title and the size of the cast. By "cast size" we mean the number of distinct actors that played in that movie: if an actor played multiple roles, or if it simply occurs multiple times in casts, we still count her/him only once.

In [10]:
%%time
# Write your sql query below

query = """
        SELECT VW.MID, M.TITLE, MAX(VW.CAST_COUNT) AS CAST_SIZE
        FROM
        (
            SELECT COUNT(*) AS CAST_COUNT, MID 
            FROM M_Cast GROUP BY MID 
        )VW 
        JOIN Movie M ON M.MID = VW.MID 
        """

q6 = pd.read_sql_query(query, conn)    
print(q6.shape)
q6.head()

(1, 3)
Wall time: 59.9 ms


Unnamed: 0,MID,title,CAST_SIZE
0,tt5164214,Ocean's Eight,238


## Q7 --- A decade is a sequence of 10 consecutive years. For example, say in your database you have movie information starting from 1965. Then the first decade is 1965, 1966, ..., 1974; the second one is 1966, 1967, ..., 1975 and so on. Find the decade D with the largest number of films and the total number of films in D.

In [11]:
%%time
# Write your sql query below

query = """
        SELECT DECADE, MAX(MOVIE_COUNTS)
        FROM 
        (
            SELECT DECADE, COUNT(*) AS MOVIE_COUNTS
            FROM
            (
                SELECT M."year", VW.MIN_YEAR, (((M."year" - VW.MIN_YEAR) / 10) + 1) AS DECADE 
                FROM Movie M 
                JOIN 
                (
                    SELECT MIN("year") AS MIN_YEAR 
                    FROM MOVIE
                )VW ON 1=1
            )I_VW
            GROUP BY DECADE
        )O_VW
        """

q7 = pd.read_sql_query(query, conn)    
print(q7.shape)
q7.head()

(1, 2)
Wall time: 4.06 ms


Unnamed: 0,DECADE,MAX(MOVIE_COUNTS)
0,8,1012


## Q8 --- Find all the actors that made more movies with Yash Chopra than any other director.

In [12]:
%%time
# Write your sql query below

query = """
        SELECT TRIM(P.Name) AS ACTOR_NAME,  COUNT(DISTINCT M.MID) AS YASH_CHOPRA_DIRECTED_MOVIES
        FROM Person P 
        JOIN M_Cast MC ON TRIM(MC.PID) = P.PID 
        JOIN Movie M ON M.MID = MC.MID 
        JOIN M_Director MD ON MD.MID = M.MID 
        JOIN Person P1 ON P1.PID = TRIM(MD.PID)
        WHERE TRIM(P1.Name) = 'Yash Chopra'
        GROUP BY TRIM(P.PID)
        ORDER BY COUNT(DISTINCT M.MID) DESC
        """

q8 = pd.read_sql_query(query, conn)    
print(q8.shape)
q8.head()

(430, 2)
Wall time: 442 ms


Unnamed: 0,ACTOR_NAME,YASH_CHOPRA_DIRECTED_MOVIES
0,Jagdish Raj,11
1,Manmohan Krishna,10
2,Iftekhar,9
3,Madan Puri,8
4,Vikas Anand,8


## Q9 --- The Shahrukh number of an actor is the length of the shortest path between the actor and Shahrukh Khan in the "co-acting" graph. That is, Shahrukh Khan has Shahrukh number 0; all actors who acted in the same film as Shahrukh have Shahrukh number 1; all actors who acted in the same film as some actor with Shahrukh number 1 have Shahrukh number 2, etc. Return all actors whose Shahrukh number is 2.

In [13]:
%%time
# Write your sql query below

query = """
        WITH 

        Shahrukh AS (SELECT TRIM(P.PID) PID FROM Person P WHERE Trim(P.Name) like '%Shahrukh%'),
        Shahrukh_movies AS (SELECT DISTINCT TRIM(MC.MID) MID, S.PID FROM M_Cast MC JOIN Shahrukh S ON TRIM(MC.PID) = S.PID),
        Shahrukh_actors AS (SELECT DISTINCT TRIM(MC.PID) PID FROM M_Cast MC JOIN Shahrukh_movies SM ON TRIM(MC.MID) = SM.MID AND TRIM(MC.PID) <> SM.PID),
        Shahrukh_movies_2 AS (SELECT DISTINCT TRIM(MC.MID) MID, SA.PID FROM M_Cast MC JOIN Shahrukh_actors SA ON TRIM(MC.PID) = SA.PID)

        SELECT DISTINCT TRIM(MC.PID) PID, TRIM(P.Name) ACTOR_NAME
        FROM Person P
        JOIN M_Cast MC ON TRIM(MC.PID) = TRIM(P.PID)
        JOIN Shahrukh_movies_2 SM2 ON  TRIM(MC.MID) = SM2.MID AND TRIM(MC.PID) <> SM2.PID
        LIMIT 50;
        """

q9 = pd.read_sql_query(query, conn)    
print(q9.shape)
q9.head()

(50, 2)
Wall time: 16.7 s


Unnamed: 0,PID,ACTOR_NAME
0,nm2951768,Freida Pinto
1,nm6467532,Caroline Christl Long
2,nm6071249,Rajeev Pahuja
3,nm3491108,Michelle Santiago
4,nm7509518,Jandre le Roux
