In [None]:
#Goals 
#1.top 5 most popular movies on IMDb (with most votes)
#2.find the highest-grossing movies (box office), production companies, and directors, with each director's best-selling movie


import numpy as np 
import pandas as pd
import datetime as dt
# Columns of the IMDb movies file that this analysis reads; everything else is skipped.
MOVIE_COLUMNS = [
    'imdb_title_id', 'title', 'original_title', 'year', 'date_published',
    'genre', 'duration', 'country', 'language', 'director',
    'production_company', 'actors', 'votes',
    'budget', 'worlwide_gross_income',
]

# Load the data keyed by IMDb title id. The source file spells the gross-income
# column "worlwide_gross_income"; rename it to the correct spelling right away so
# the rest of the notebook can use "worldwide_gross_income".
movie = pd.read_csv(
    "../input/imdb-extensive-dataset/IMDb movies.csv",
    index_col="imdb_title_id",
    usecols=MOVIE_COLUMNS,
).rename(columns={"worlwide_gross_income": "worldwide_gross_income"})


# The Top 5 Most Popular Movies on IMDb (sort by votes) 
![](https://www.filmsite.org/posters/shawshankredemption.jpg)

In [None]:
# Columns to show for the five movies with the most votes.
popular_columns = [
    "original_title",
    "country",
    "director",
    "actors",
    "votes",
    "budget",
    "worldwide_gross_income",
]
movie[popular_columns].nlargest(5, "votes")

The next goal is to find the highest-grossing movie of all time (the dataset was last updated in 2020). However, the Top 5 Most Popular Movies list above shows that budget and worldwide_gross_income are stored as strings, so they need to be converted to a numeric data type first.

# Data Cleaning to Find the Highest Grossing Movie

In [None]:
# Keep only movies that report a worldwide gross income; this also makes the dataset smaller.
movie = movie.dropna(subset=["worldwide_gross_income"])
# Each value starts with a currency marker; list the distinct markers to see
# which currencies other than the dollar appear.
currency_markers = movie["worldwide_gross_income"].str.split().str[0]
currency_markers.unique()

In [None]:
# US dollars per one unit of each currency marker handled by convert_to_dollar.
# The rates are fixed approximations, not live exchange rates.
_USD_PER_UNIT = {
    "$": 1.0,
    "GBP": 1.39,
    "INR": 0.014,
    "PKR": 0.0064,
}


def convert_to_dollar(box_office, rates=None):
    """Convert a box-office string such as "$ 1000" or "GBP 250" to US dollars.

    Parameters
    ----------
    box_office : str
        Text of the form "<currency marker> <amount>", separated by whitespace.
    rates : dict, optional
        Mapping of currency marker -> US dollars per unit. Defaults to the
        fixed table above ("$", "GBP", "INR", "PKR").

    Returns
    -------
    float
        The amount in US dollars. NaN is returned when the value is not a
        string (e.g. a missing value), does not contain both a marker and an
        amount, or uses a currency that is not in the rate table.

    Raises
    ------
    ValueError
        If the amount part cannot be parsed as a number.
    """
    if rates is None:
        rates = _USD_PER_UNIT
    # Missing values reach pandas as float NaN, which has no .split().
    if not isinstance(box_office, str):
        return float("nan")
    parts = box_office.split()
    if len(parts) < 2:
        return float("nan")
    sign, money = parts[0], parts[1]
    rate = rates.get(sign)
    if rate is None:
        # Unknown currency: return an explicit NaN instead of falling off the
        # end of the function and returning None.
        return float("nan")
    return float(money) * rate
# Replace the text column with its numeric value in US dollars.
gross_in_dollars = movie["worldwide_gross_income"].apply(convert_to_dollar)
movie["worldwide_gross_income"] = gross_in_dollars

# **Top 5 Highest Gross Income Movies (last updated in 2020)**
![](https://writinguntilragnarokhome.files.wordpress.com/2021/01/avengers-endgame-review.jpg)

In [None]:
# The five movies with the largest worldwide gross income, with the gross also
# expressed in billions of dollars.
highest_producer = movie[
    ["title", "date_published", "country", "production_company", "worldwide_gross_income"]
].nlargest(5, "worldwide_gross_income")
highest_producer["billions"] = highest_producer["worldwide_gross_income"] / 1000000000
# Format the numbers explicitly so the gross is shown in full rather than in
# scientific notation. This replaces a style.bar() call whose colour string
# "#ffffff00." carried a stray trailing dot and is not a valid CSS colour.
highest_producer.style.format({"worldwide_gross_income": "{:,.0f}", "billions": "{:.2f}"})

# **Top 10 Highest Gross Income Production Companies (last updated in 2020)**
![](https://www.denofgeek.com/wp-content/uploads/2018/05/warner_bros_logo.jpg?resize=768%2C432)

In [None]:
# Total worldwide gross income per production company.
company_totals = movie.pivot_table(
    index="production_company",
    values="worldwide_gross_income",
    aggfunc="sum",
)
# Ten companies with the largest totals, largest first.
a = company_totals.sort_values(by="worldwide_gross_income", ascending=False).nlargest(
    n=10, columns="worldwide_gross_income"
)
# Same totals expressed in billions of dollars, shown as a bar per row.
a["billions"] = a["worldwide_gross_income"] / 1000000000
a.style.bar(subset=["billions"])

# Data Cleaning to Find the Highest Grossing Directors
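Some movies have more than one director, so the director column is split into one row per director. Each director is credited with the movie's full gross income (it is not divided among co-directors), which gives a list of directors ranked by total box office.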

In [None]:
# Split the comma-separated director field into one column per name (columns 0, 1, ...).
director_names = movie["director"].str.split(", ", expand=True)
# Attach each movie's gross income, matching rows on the imdb_title_id index.
directors = director_names.merge(
    movie["worldwide_gross_income"],
    how="inner",
    on="imdb_title_id",
)
# Preview; in this dataset the split appears to yield at most two director columns.
directors.head()

In [None]:
# Build one row per (director, movie gross): take each of the two director
# columns together with the gross column, give both the same "director" column
# name, and stack them with pd.concat(). Only columns 0 and 1 are used, i.e. at
# most two directors per movie are considered.
# rename() is used in its non-inplace form: renaming a column selection in
# place triggers pandas' SettingWithCopyWarning.
directors_a = directors[[0, "worldwide_gross_income"]].rename(columns={0: "director"})
directors_b = directors[[1, "worldwide_gross_income"]].rename(columns={1: "director"})
# Rows without a second director have no name in column 1; drop them.
split_directors = pd.concat(objs=[directors_a, directors_b]).dropna(subset=["director"])
# Sum the gross income per director and keep the ten largest totals.
highest_directors = split_directors.pivot_table(
    values="worldwide_gross_income",
    index="director",
    aggfunc="sum",
).nlargest(10, "worldwide_gross_income")

In [None]:
# Express each director's total gross in billions of dollars, then display the table.
# (A bare `highest_directors["billions"]` expression used to sit between these
# two lines; it was not the last expression of the cell, so it displayed nothing.)
highest_directors["billions"] = highest_directors["worldwide_gross_income"] / 1000000000
highest_directors

In [None]:
# The Russo brothers appear as two separate rows although they seem to direct
# all of their movies together, so give both rows the same combined label.
highest_directors = highest_directors.rename(
    index={
        "Anthony Russo": "Anthony Russo, Joe Russo",
        "Joe Russo": "Anthony Russo, Joe Russo",
    }
)

# Collapse rows that now share a label, keeping the first one. De-duplicating on
# the label (rather than on the gross value) cannot remove two unrelated
# directors who happen to have equal totals, and still merges the pair if their
# totals differ; the table is ordered by nlargest, so the first row is the
# larger total.
highest_directors = highest_directors[~highest_directors.index.duplicated(keep="first")]

In [None]:
# After merging the two Russo rows the list has only 9 directors, so the
# director ranked 11th by total gross is added as the new 10th entry.
director_totals = split_directors.pivot_table(
    values="worldwide_gross_income",
    index="director",
    aggfunc="sum",
)
# .copy() gives an independent one-row frame, so adding a column to it below
# does not raise SettingWithCopyWarning.
tenth_highest_director = director_totals.nlargest(11, "worldwide_gross_income").iloc[[10]].copy()
tenth_highest_director["billions"] = tenth_highest_director["worldwide_gross_income"] / 1000000000
# Append only if the director is not already listed, so re-running this cell
# does not add the same row twice.
if not tenth_highest_director.index.isin(highest_directors.index).all():
    highest_directors = pd.concat(objs=[highest_directors, tenth_highest_director])

In [None]:
# Display the director labels of the updated highest-grossing list; these labels
# are what the following cells use to look up each director's best-selling movie.
highest_directors.index

In [None]:
# Extract the movies of the top-grossing directors and group them by director.
# NOTE(review): isin() compares the whole "director" field, so a movie is only
# picked up when its field equals one of the labels exactly; movies co-directed
# with someone outside the label are not included here.
top_director_labels = list(highest_directors.index)
directed_by_top = movie["director"].isin(top_director_labels)
director_group = movie[directed_by_top].groupby("director")
# Empty frame that will collect each director's best-selling movie.
df = pd.DataFrame()

In [None]:
# Find the highest-grossing movie of each director and collect them in `df`.
# The one-row frames are gathered in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# also added the same rows again every time the cell was re-run.
best_movie_per_director = [
    data.nlargest(1, "worldwide_gross_income") for _, data in director_group
]
# With no groups there is nothing to concatenate; fall back to an empty frame
# that still has the movie columns.
df = pd.concat(best_movie_per_director) if best_movie_per_director else movie.iloc[:0]

# Top 10 Highest Grossing Directors with Their Best Selling Movies
![](https://cdn.britannica.com/95/176995-050-609666E2/Steven-Spielberg-2013.jpg)![](https://i.pinimg.com/originals/da/d2/50/dad2508a4e1785cbe910b90dbf7856a5.jpg)

In [None]:
# Best-selling movie of each director, indexed by director so it lines up with
# the highest-grossing directors table.
best_movies = df[["director", "original_title", "year", "worldwide_gross_income"]].set_index(keys="director")
# Both tables carry a worldwide_gross_income column, so the merge suffixes them
# with _x (director total) and _y (best movie); rename them to readable headers.
temp = highest_directors.merge(best_movies, left_index=True, right_index=True).rename(
    columns={
        "worldwide_gross_income_x": "total gross income",
        "original_title": "best selling movie",
        "worldwide_gross_income_y": "best selling movie's total gross income",
    }
)
# Share of the director's total that comes from the best-selling movie (stored as a fraction).
temp["percentage of total gross income"] = temp["best selling movie's total gross income"] / temp["total gross income"]
# Number of movies per director among the grouped rows.
movies_per_director = director_group.size().to_frame("number of movies directed")
# Format the numbers explicitly: full dollar amounts instead of scientific
# notation, and the fraction shown as a percentage. This replaces a style.bar()
# call whose colour string "#ffffff00." carried a stray trailing dot and is not
# a valid CSS colour.
temp.merge(movies_per_director, left_index=True, right_index=True).style.format(
    {
        "total gross income": "{:,.0f}",
        "billions": "{:.2f}",
        "best selling movie's total gross income": "{:,.0f}",
        "percentage of total gross income": "{:.1%}",
    }
)
