In [2]:
import pandas as pd

# Build a small synthetic movie dataset, one tuple per movie, and persist it
# as CSV so that later cells can load it back with pd.read_csv.
movie_columns = [
    "Movie Title", "Release Year", "Profit", "Producers", "Directors",
    "Actors", "Language", "Genres", "Budget", "Box Office",
]
movie_rows = [
    ("Movie1", 2020, 1000000, "Producer1", "Director1", "Actor1, Actor2",
     "English", "Action, Thriller", 500000, 1500000),
    ("Movie2", 2019, 2000000, "Producer2", "Director2", "Actor2, Actor3",
     "Spanish", "Drama", 700000, 2700000),
    ("Movie3", 2018, 1500000, "Producer1", "Director1", "Actor1, Actor3",
     "French", "Comedy", 600000, 2100000),
    ("Movie4", 2021, 500000, "Producer3", "Director3", "Actor4, Actor5",
     "English", "Horror", 300000, 800000),
]

# Transpose the rows into a column-name -> list-of-values mapping.
data = {
    column: list(values)
    for column, values in zip(movie_columns, zip(*movie_rows))
}

imdb_df = pd.DataFrame(data)
# index=False keeps the positional index out of the file.
imdb_df.to_csv("imdb_dataset.csv", index=False)


In [3]:
import pandas as pd

# Reload the dataset from the CSV file written earlier in the notebook.
imdb_df = pd.read_csv("imdb_dataset.csv")

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
imdb_df.info()
print(imdb_df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Movie Title   4 non-null      object
 1   Release Year  4 non-null      int64 
 2   Profit        4 non-null      int64 
 3   Producers     4 non-null      object
 4   Directors     4 non-null      object
 5   Actors        4 non-null      object
 6   Language      4 non-null      object
 7   Genres        4 non-null      object
 8   Budget        4 non-null      int64 
 9   Box Office    4 non-null      int64 
dtypes: int64(4), object(6)
memory usage: 452.0+ bytes
None
  Movie Title  Release Year   Profit  Producers  Directors          Actors  \
0      Movie1          2020  1000000  Producer1  Director1  Actor1, Actor2   
1      Movie2          2019  2000000  Producer2  Director2  Actor2, Actor3   
2      Movie3          2018  1500000  Producer1  Director1  Actor1, Actor3   
3      Movie4          

In [4]:
# Locate the row whose 'Profit' is the largest and report its key credits.
top_profit_label = imdb_df['Profit'].idxmax()
highest_profit_movie = imdb_df.loc[top_profit_label]
print(
    f"Movie with the highest profit: {highest_profit_movie['Movie Title']}",
    f"Producer(s): {highest_profit_movie['Producers']}",
    f"Director(s): {highest_profit_movie['Directors']}",
    f"Actors: {highest_profit_movie['Actors']}",
    sep="\n",
)


Movie with the highest profit: Movie2
Producer(s): Producer2
Director(s): Director2
Actors: Actor2, Actor3


In [5]:
# Return on investment per movie: profit as a percentage of the budget.
# The column is stored on imdb_df because a later cell groups on it again.
imdb_df['ROI'] = imdb_df['Profit'].div(imdb_df['Budget']).mul(100)

# Average the ROI within each language, then pick the language with the
# highest mean.
mean_roi_by_language = imdb_df.groupby('Language')['ROI'].mean()
highest_avg_roi_language = mean_roi_by_language.idxmax()
print(f"Language with the highest average ROI: {highest_avg_roi_language}")


Language with the highest average ROI: Spanish


In [6]:
# Gather every distinct genre. Each movie stores its genres as a single
# ", "-separated string, so split first and merge the pieces into one set.
unique_genres = set()
for movie_genres in imdb_df['Genres'].str.split(', '):
    unique_genres.update(movie_genres)
print(f"Unique genres: {unique_genres}")


Unique genres: {'Thriller', 'Comedy', 'Action', 'Drama', 'Horror'}


In [7]:
# Per-movie view of who produced and who directed each title.
credit_columns = ['Movie Title', 'Producers', 'Directors']
producers_directors_df = imdb_df[credit_columns]
print(producers_directors_df)

# Rank producers by their mean ROI (best first) and keep the three leaders.
# Relies on the 'ROI' column already being present on imdb_df.
mean_roi_by_producer = imdb_df.groupby('Producers')['ROI'].mean()
top_3_producers = mean_roi_by_producer.sort_values(ascending=False).head(3)
print(f"Top 3 producers with highest average ROI: {top_3_producers.index.tolist()}")


  Movie Title  Producers  Directors
0      Movie1  Producer1  Director1
1      Movie2  Producer2  Director2
2      Movie3  Producer1  Director1
3      Movie4  Producer3  Director3
Top 3 producers with highest average ROI: ['Producer2', 'Producer1', 'Producer3']


In [8]:
from collections import Counter

# Each movie lists its cast as one ", "-separated string; split it into a list
# of individual actor names per movie.
cast_lists = imdb_df['Actors'].str.split(', ')

# Count how many movies each actor appears in. Rows with a missing 'Actors'
# value are skipped so they cannot break the count.
actor_counts = Counter()
for cast in cast_lists.dropna():
    actor_counts.update(cast)

# most_common(1) returns a single entry; when several actors share the top
# count (in the sample data Actor1, Actor2 and Actor3 each have two movies),
# the one encountered first wins.
most_frequent_actor = actor_counts.most_common(1)[0][0]
print(f"Actor with the most number of movies: {most_frequent_actor}")

# Deep dive into the movies, genres, and profits for this actor.
# Match on exact membership in the split cast list rather than
# str.contains(): the latter does regex substring matching, so "Actor1" would
# also select movies featuring "Actor10", and regex metacharacters in a name
# would be interpreted as a pattern.
features_actor = cast_lists.map(
    lambda cast: isinstance(cast, list) and most_frequent_actor in cast
)
actor_movies = imdb_df[features_actor]
print(f"Movies with {most_frequent_actor}: {actor_movies['Movie Title'].tolist()}")
print(f"Genres: {actor_movies['Genres'].tolist()}")
print(f"Profits: {actor_movies['Profit'].tolist()}")


Actor with the most number of movies: Actor1
Movies with Actor1: ['Movie1', 'Movie3']
Genres: ['Action, Thriller', 'Comedy']
Profits: [1000000, 1500000]
