In [None]:
# Build a small movie dataset and explore it with pandas: inspect it, filter it,
# sort it, and compute a few aggregate statistics.
import pandas as pd  # pandas provides the DataFrame (table) structure used below

# Column-oriented movie data: each key is a column name and position i in every
# list describes the same movie.
data = {
    "Movie name": ["Kill Bill", "Titanic", "Avatar", "Interstellar", "Tangled"],
    "Director name": [
        "Quentin Tarantino",
        "James Cameron",
        "James Cameron",
        "Christopher Nolan",
        "Nathan Greno",
    ],
    "Release year": [2003, 1997, 2009, 2014, 2010],
    "Rating": [8.1, 7.8, 7.8, 8.6, 7.7],
    "Genre": ["Action", "Romance", "Sci-Fi", "Sci-Fi", "Animation"],
    "Budget": [30, 200, 237, 165, 260],  # in millions
    "Box Office Revenue": [176, 2187, 2787, 675, 591],  # in millions
}

# Convert the dictionary into a DataFrame and display the whole table.
df = pd.DataFrame(data)
print(df)

# Display the first few rows.
print(df.head())
# Show the structure of the dataset. DataFrame.info() writes its report to
# stdout itself and returns None, so it is called directly: wrapping it in
# print() would emit a stray "None" line after the report.
df.info()
# Summary statistics for the numerical columns.
print(df.describe())

# Mean of the "Rating" column; computed once and reused further down.
average_rating = df["Rating"].mean()
print("Average rating:", average_rating)

# Row(s) with the largest budget. A boolean mask (rather than idxmax) keeps
# every row that ties for the maximum.
print(df[df["Budget"] == df["Budget"].max()])

# Row(s) with the largest box office revenue; reused in the summary below.
highest_grossing_movie = df[
    df["Box Office Revenue"] == df["Box Office Revenue"].max()
]
print(highest_grossing_movie)

# Movies whose rating is strictly greater than 8.0.
high_rated_movies = df[df["Rating"] > 8.0]
print(high_rated_movies)

# Movies whose box office revenue is strictly greater than 500 (million).
high_grossing_movies = df[df["Box Office Revenue"] > 500]
print(high_grossing_movies)

# Sort by release year; ascending=False means newest first.
sorted_by_year = df.sort_values(by="Release year", ascending=False)
print(sorted_by_year)

# Same idea, ordered by box office revenue from highest to lowest.
sorted_by_revenue_desc = df.sort_values(by="Box Office Revenue", ascending=False)
print(sorted_by_revenue_desc)

# Report the average rating computed above.
print("Average movie rating:", average_rating)

# Sum the "Budget" column into a single number (kept in a variable; the
# DataFrame itself is not modified).
total_budget = df["Budget"].sum()
print("Total combined budget:", total_budget, "million")

# Report the highest-grossing movie found above.
print("Highest-grossing movie:")
print(highest_grossing_movie)

# Group the movies by genre and average the ratings within each genre.
genre_average_rating = df.groupby("Genre")["Rating"].mean()
print("Average rating per genre:")
print(genre_average_rating)

# Group the movies by director and add up each director's box office revenue.
director_total_revenue = df.groupby("Director name")["Box Office Revenue"].sum()
print("Total box office revenue per director:")
print(director_total_revenue)

      Movie name      Director name  Release year  Rating      Genre  Budget  \
0      Kill bill  Quentin Tarantino          2003     8.1     Action      30   
1        Titanic      James Cameron          1997     7.8    Romance     200   
2         Avatar      James Cameron          2009     7.8     Sci-Fi     237   
3  Interestellar  Christopher Nolan          2014     8.6     Sci-Fi     165   
4        Tangled       Natham Greno          2010     7.7  Animation     260   

   Box Office Revenue  
0                 176  
1                2187  
2                2787  
3                 675  
4                 591  
      Movie name      Director name  Release year  Rating      Genre  Budget  \
0      Kill bill  Quentin Tarantino          2003     8.1     Action      30   
1        Titanic      James Cameron          1997     7.8    Romance     200   
2         Avatar      James Cameron          2009     7.8     Sci-Fi     237   
3  Interestellar  Christopher Nolan          2014     8