In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the movies dataset from a CSV file expected in the working directory.
df = pd.read_csv("movies.csv")
# Preview the first rows of the raw data.
df.head()

## Cleaning of Dataset

- Analysing movies from recent years (2000-2020)
- Dropping rows with missing values (if any)
- Parsing the "released" column to extract the release country
- Grouping runtime into 30-minute bins to help with analysis

In [None]:
# Keep only movies from the year 2000 onwards, then discard every row
# that still contains a missing value in any column.
df = df.loc[df["year"] >= 2000].dropna()
def obtainCountry(string):
    """Return the text between the first "(" and the first ")" of ``string``.

    Used to pull the release country out of a "released" value.

    Raises:
        ValueError: if ``string`` contains no "(" or no ")".
    """
    # Locate the opening bracket first, then the closing one.
    open_pos, close_pos = (string.index(bracket) for bracket in "()")
    return string[open_pos + 1:close_pos]

# Derive each row's release country from the parenthesised part of "released".
df["releaseCountry"] = df["released"].map(obtainCountry)

def categoriseRuntime(runtime):
    """Map a runtime in minutes to the label of its 30-minute duration bin.

    Bins are half-open: a runtime belongs to the first bin whose exclusive
    upper bound it is strictly below. Anything not below 210 is labelled
    ">= 210".
    """
    # (exclusive upper bound, label) pairs in ascending order.
    bins = (
        (60.0, "< 60"),
        (90.0, "60 - 89"),
        (120.0, "90 - 119"),
        (150.0, "120 - 149"),
        (180.0, "150 - 179"),
        (210.0, "180 - 209"),
    )
    for upper_bound, label in bins:
        if runtime < upper_bound:
            return label
    return ">= 210"

# Label every movie with its runtime bin, then preview the cleaned frame.
df["duration"] = df["runtime"].map(categoriseRuntime)

df.head()

## Analysing the relationship between run-time and other factors to obtain optimal run-time

- with IMDb score
- with gross revenue

In [None]:
# Duration bins shown on the x-axis, in ascending order.
# NOTE(review): the "< 60" bin produced by categoriseRuntime is not plotted here - confirm this is intended.
x = ["60 - 89", "90 - 119", "120 - 149", "150 - 179", "180 - 209", ">= 210"]

# One Series of IMDb scores per duration bin, in the same order as `x`.
scores = [df.loc[df["duration"] == label, "score"] for label in x]

fig, ax = plt.subplots()
ax.boxplot(scores, showfliers=False)  # outliers hidden to keep the boxes readable
# Label the boxes only after boxplot() has placed them at positions 1..len(x).
# Setting tick labels before any ticks exist triggers a matplotlib warning and
# the labels may be overwritten, depending on the matplotlib version.
ax.set_xticks(list(range(1, len(x) + 1)))
ax.set_xticklabels(x)
ax.set_title("IMDb scores of movies by duration")
ax.set_xlabel("Duration(mins)")
ax.set_ylabel("IMDb Score")
plt.show()

In [None]:
# Duration bins shown on the x-axis, in ascending order.
# NOTE(review): the "< 60" bin produced by categoriseRuntime is not plotted here - confirm this is intended.
x = ["60 - 89", "90 - 119", "120 - 149", "150 - 179", "180 - 209", ">= 210"]

# One Series of gross revenue values per duration bin, in the same order as `x`.
gross = [df.loc[df["duration"] == label, "gross"] for label in x]

fig, ax = plt.subplots()
# Plot the gross revenue data built above; previously this cell plotted
# `scores`, so the "gross revenue" chart actually showed IMDb scores.
ax.boxplot(gross, showfliers=False)  # outliers hidden to keep the boxes readable
# Label the boxes only after boxplot() has placed them at positions 1..len(x).
# Setting tick labels before any ticks exist triggers a matplotlib warning and
# the labels may be overwritten, depending on the matplotlib version.
ax.set_xticks(list(range(1, len(x) + 1)))
ax.set_xticklabels(x)
ax.set_title("Gross revenue of movies by duration")
ax.set_xlabel("Duration(mins)")
ax.set_ylabel("Gross revenue(USD$)")
plt.show()

## Determining the best main actor/actress to engage for the movie

- by comparing number of appearances made in movies with IMDb scores >= 8.0
- by comparing mean revenue of all movies starred in

In [None]:
# Movies with an IMDb score of at least 8.0.
topscores = df.loc[df["score"] >= 8.0]

# Count those movies per star and keep the ten stars with the highest counts.
top10stars_score = (
    topscores[["star", "score"]]
    .groupby("star")
    .count()
    .reset_index()
    .nlargest(10, "score")
    .rename(columns={"score": "appearances"})
    .sort_values("appearances")  # ascending, so the longest bar ends up at the top
)

fig, ax = plt.subplots()
ax.barh(top10stars_score["star"], top10stars_score["appearances"])
ax.set_title("Top 10 stars generating high ratings for movies")
ax.set_xlabel("Number of appearances in highly rated movies")
plt.show()


In [None]:
# Mean gross revenue per star, keeping the ten highest means.
top10stars_gross = (
    df[["star", "gross"]]
    .groupby("star")
    .mean()
    .reset_index()
    .nlargest(10, "gross")
    .sort_values("gross")  # ascending, so the longest bar ends up at the top
)

fig, ax = plt.subplots()
ax.barh(top10stars_gross["star"], top10stars_gross["gross"])
ax.set_title("Top 10 revenue generating stars")
ax.set_xlabel("Mean revenue of movies starred in(USD$)")
plt.show()

## Determining the best director to engage for the movie

In [None]:
# Movies with an IMDb score of at least 8.0.
topscores = df.loc[df["score"] >= 8.0]

# Count those movies per director and keep the ten directors with the highest counts.
top10directors_score = (
    topscores[["director", "score"]]
    .groupby("director")
    .count()
    .reset_index()
    .nlargest(10, "score")
    .rename(columns={"score": "appearances"})
    .sort_values("appearances")  # ascending, so the longest bar ends up at the top
)

fig, ax = plt.subplots()
ax.barh(top10directors_score["director"], top10directors_score["appearances"])
ax.set_title("Top 10 directors generating high ratings for movies")
ax.set_xlabel("Number of highly rated movies directed")
plt.show()

In [None]:
# Mean gross revenue per director, keeping the ten highest means.
top10directors_gross = (
    df[["director", "gross"]]
    .groupby("director")
    .mean()
    .reset_index()
    .nlargest(10, "gross")
    .sort_values("gross")  # ascending, so the longest bar ends up at the top
)

fig, ax = plt.subplots()
ax.barh(top10directors_gross["director"], top10directors_gross["gross"])
ax.set_title("Top 10 revenue generating directors")
ax.set_xlabel("Mean revenue of movies directed(USD$)")
plt.show()

## Determining which country to release the movie in

In [None]:
# Mean gross revenue per release country, keeping the ten highest means.
top10countries_gross = (
    df[["releaseCountry", "gross"]]
    .groupby("releaseCountry")
    .mean()
    .reset_index()
    .nlargest(10, "gross")
    .sort_values("gross")  # ascending, so the longest bar ends up at the top
)

fig, ax = plt.subplots()
ax.barh(top10countries_gross["releaseCountry"], top10countries_gross["gross"])
ax.set_title("Top 10 release countries with highest revenue")
ax.set_xlabel("Mean revenue of movies released(USD$)")
plt.show()