In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Source data: one row per film with year, box-office figures, budget,
# award counts, up to three genres and up to three directors (see preview).
csv_path = '/Users/aditya/Desktop/Class/MGMT635 DM&A/Finalterm/IMDB_9JEWELS.csv'

# The first column of the CSV is a leftover index written by an earlier
# export, so it is dropped by position right after loading.
df = pd.read_csv(csv_path).pipe(
    lambda frame: frame.drop(columns=[frame.columns[0]])
)

# Preview the first rows
df.head()

Unnamed: 0,Year,Opening Weekend Gross,US & Canada Gross,Worldwide Gross,Budget,awards,oscar wins,oscar nominated,Genre,Genre.1,Genre.2,Director 1,Director 2,Director 3
0,1994,727327.0,28767189.0,28884720.0,25000000.0,21,0,1,Drama,,,Frank Darabont,,
1,2008,158411483.0,534987076.0,1029266000.0,185000000.0,162,2,1,Action,Crime,Drama,Christopher Nolan,,
2,2003,72629713.0,379427292.0,1156035000.0,94000000.0,215,11,1,Action,Adventure,Drama,Peter Jackson,,
3,1994,9311882.0,107928762.0,213928800.0,8000000.0,69,1,1,Crime,Drama,,Quentin Tarantino,,
4,2001,47211490.0,316115420.0,898523700.0,93000000.0,125,4,1,Action,Adventure,Drama,Peter Jackson,,


In [2]:
import pandas as pd

# Clean the frame loaded above: impute missing values, then drop the
# co-director columns.

# Monetary columns: replace missing values with the column mean.
# NOTE(review): mean-imputed grosses take part in any later ranking on
# 'Worldwide Gross' -- confirm that is acceptable for the analysis.
numerical_cols = ['Opening Weekend Gross', 'US & Canada Gross', 'Worldwide Gross', 'Budget']
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())

# Award columns: a missing value is treated as "none", i.e. 0.
award_cols = ['awards', 'oscar wins', 'oscar nominated']
df[award_cols] = df[award_cols].fillna(0)

# Only the first-listed director is kept. errors='ignore' keeps this cell
# re-runnable: `df` is rebound here, so on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(columns=['Director 2', 'Director 3'], errors='ignore')

# Display the cleaned DataFrame
df.head()

Unnamed: 0,Year,Opening Weekend Gross,US & Canada Gross,Worldwide Gross,Budget,awards,oscar wins,oscar nominated,Genre,Genre.1,Genre.2,Director 1
0,1994,727327.0,28767189.0,28884720.0,25000000.0,21,0,1,Drama,,,Frank Darabont
1,2008,158411483.0,534987076.0,1029266000.0,185000000.0,162,2,1,Action,Crime,Drama,Christopher Nolan
2,2003,72629713.0,379427292.0,1156035000.0,94000000.0,215,11,1,Action,Adventure,Drama,Peter Jackson
3,1994,9311882.0,107928762.0,213928800.0,8000000.0,69,1,1,Crime,Drama,,Quentin Tarantino
4,2001,47211490.0,316115420.0,898523700.0,93000000.0,125,4,1,Action,Adventure,Drama,Peter Jackson


In [3]:
# Reshape wide -> long so that each film contributes one row per genre it
# is tagged with; the genre label ends up in a single 'GENRE' column.
genre_cols = ['Genre', 'Genre.1', 'Genre.2']

df_melted = (
    df.melt(
        # Index.difference returns the remaining column names sorted, which
        # is why the id columns come out in alphabetical order.
        id_vars=df.columns.difference(genre_cols),
        value_vars=genre_cols,
        var_name='Genre_Source',
        value_name='GENRE',
    )
    # melt emits a row for every (film, genre slot) pair, including slots
    # that are empty for films with fewer than three genres. Those rows
    # carry no genre, so drop them.
    .dropna(subset=['GENRE'])
    .reset_index(drop=True)
)

# 'Genre_Source' only records which slot the label came from; `df` keeps
# the long frame without it.
# NOTE: `df` is rebound here, so this cell cannot be re-run on its own
# (the original genre columns are gone) -- re-run from the load cell.
df = df_melted.drop(columns=['Genre_Source'])

# Display the resulting DataFrame
df.head()

Unnamed: 0,Budget,Director 1,Opening Weekend Gross,US & Canada Gross,Worldwide Gross,Year,awards,oscar nominated,oscar wins,GENRE
0,25000000.0,Frank Darabont,727327.0,28767189.0,28884720.0,1994,21,1,0,Drama
1,185000000.0,Christopher Nolan,158411483.0,534987076.0,1029266000.0,2008,162,1,2,Action
2,94000000.0,Peter Jackson,72629713.0,379427292.0,1156035000.0,2003,215,1,11,Action
3,8000000.0,Quentin Tarantino,9311882.0,107928762.0,213928800.0,1994,69,1,1,Crime
4,93000000.0,Peter Jackson,47211490.0,316115420.0,898523700.0,2001,125,1,4,Action


In [13]:
import pandas as pd

# `df_melted` is the long frame built by the melt cell: one row per
# film/genre pairing, with the genre label in the 'GENRE' column.

# Number of rows kept per genre. The code and the printed headings both
# read this value so they cannot drift apart.
TOP_N = 3


def top_n_per_genre(frame, by, n=TOP_N):
    """Return the `n` rows with the largest `by` value(s) within each genre.

    Parameters
    ----------
    frame : pandas.DataFrame
        Long-format frame containing a 'GENRE' column and the `by` column(s).
    by : str or list of str
        Column(s) to rank on, in priority order; later columns only break
        ties in earlier ones.
    n : int, default TOP_N
        Maximum number of rows to keep per genre.

    Returns
    -------
    pandas.DataFrame
        All columns of `frame`, genres in ascending order, rows within a
        genre in descending `by` order, with a fresh 0..k-1 index.

    Notes
    -----
    Implemented as sort + ``groupby.head`` instead of
    ``groupby.apply(nlargest)``: ``apply`` operating on the grouping column
    is deprecated in pandas 2.2, and without it 'GENRE' would be lost after
    ``reset_index(drop=True)``.
    NOTE(review): rows tied exactly at the cut-off are resolved by the sort
    order; verify if exact tie-breaking matters for the report.
    """
    by_cols = [by] if isinstance(by, str) else list(by)
    return (
        frame.dropna(subset=['GENRE'])  # rows without a genre belong to no group
        .sort_values(['GENRE', *by_cols], ascending=[True] + [False] * len(by_cols))
        .groupby('GENRE', sort=False)
        .head(n)
        .reset_index(drop=True)
    )


# NOTE(review): each row is a film, not a director, so one director can
# occupy several slots within a genre. Aggregate per director first if
# unique directors are wanted.

# Highest worldwide gross in each genre
top_grossing_directors = top_n_per_genre(df_melted, 'Worldwide Gross')

# Highest recognition in each genre: total awards, ties broken by Oscar wins
top_recognition_directors = top_n_per_genre(df_melted, ['awards', 'oscar wins'])

# Most Oscar wins in each genre
top_respected_directors = top_n_per_genre(df_melted, 'oscar wins')

# Display the results
print(f"Top {TOP_N} Highest Grossing Directors in Each Genre:")
print(top_grossing_directors[['GENRE', 'Director 1', 'Worldwide Gross']])

print(f"\nTop {TOP_N} Directors with Highest Recognition in Each Genre:")
print(top_recognition_directors[['GENRE', 'Director 1', 'awards', 'oscar wins']])

print(f"\nTop {TOP_N} Most Respected Directors in Each Genre:")
print(top_respected_directors[['GENRE', 'Director 1', 'oscar wins']])


Top 5 Highest Grossing Directors in Each Genre:
        GENRE         Director 1  Worldwide Gross
0      Action      James Cameron     2.923706e+09
1      Action      Anthony Russo     2.799439e+09
2      Action      Anthony Russo     2.052415e+09
3   Adventure      James Cameron     2.923706e+09
4   Adventure      Anthony Russo     2.799439e+09
..        ...                ...              ...
63        War         David Lean     1.119221e+08
64        War   Steven Spielberg     9.245574e+07
65    Western  Quentin Tarantino     4.260744e+08
66    Western         Mel Brooks     1.196173e+08
67    Western    Richard L. Bare     7.971239e+07

[68 rows x 3 columns]

Top 5 Directors with Highest Recognition in Each Genre:
        GENRE         Director 1  awards  oscar wins
0      Action      George Miller     245           6
1      Action      Peter Jackson     215          11
2      Action   Denis Villeneuve     173           6
3   Adventure      George Miller     245           6
4   Adv

In [14]:
# Export each per-genre ranking to its own Excel workbook.
# Every entry is (ranking frame, columns to keep, destination file); the
# row index is not written.
_excel_exports = [
    (
        top_grossing_directors,
        ['GENRE', 'Director 1', 'Worldwide Gross'],
        '/Users/aditya/Desktop/Class/MGMT635 DM&A/Finalterm/top_grossing_directors.xlsx',
    ),
    (
        top_recognition_directors,
        ['GENRE', 'Director 1', 'awards', 'oscar wins'],
        '/Users/aditya/Desktop/Class/MGMT635 DM&A/Finalterm/top_recognition_directors.xlsx',
    ),
    (
        top_respected_directors,
        ['GENRE', 'Director 1', 'oscar wins'],
        '/Users/aditya/Desktop/Class/MGMT635 DM&A/Finalterm/top_respected_directors.xlsx',
    ),
]

for _ranking, _cols, _xlsx_path in _excel_exports:
    _ranking[_cols].to_excel(_xlsx_path, index=False)
