# **1. Data Collection**

**1.1- Importing the Data File**

In [1]:
import pandas as pd

# Read the movies CSV from the Kaggle input directory, decoding it as Latin-1.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/movies/movies_data.csv',
    encoding='latin-1',
)

**1.21- Checking Data Types**

In [2]:
# Series mapping each column name to the dtype pandas inferred for it.
data_types = df.dtypes
print(f'{data_types}')

Movie                                   object
Director                                object
Running time                             int64
Actor 1                                 object
Actor 2                                 object
Actor 3                                 object
Genre                                   object
Budget                                   int64
Box Office                               int64
Actors Box Office %                    float64
Director Box Office %                  float64
Earnings                                 int64
Oscar and Golden Globes nominations      int64
Oscar and Golden Globes awards         float64
Release year                             int64
IMDb score                             float64
dtype: object


**1.22- Checking Duplicates**

In [3]:
# Count rows that repeat an earlier row in every column (the first occurrence is not counted).
duplicates = df.duplicated(keep='first').sum()
print(f'Total duplicates: {duplicates}')

Total duplicates: 24


# **2. Data Exploration**

**2.1- Understanding the Structure**

**2.11- View the First Few Rows**

In [4]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Movie,Director,Running time,Actor 1,Actor 2,Actor 3,Genre,Budget,Box Office,Actors Box Office %,Director Box Office %,Earnings,Oscar and Golden Globes nominations,Oscar and Golden Globes awards,Release year,IMDb score
0,13 Hours,Michael Bay,144,Toby Stephens,James Badge Dale,David Costabile,Action,50000000,69400000,50.0,69.23,19400000,0,0.0,2016,7.4
1,16 Blocks,Richard Donner,102,Bruce Willis,David Zayas,Sasha Roiz,Action,52000000,65000000,33.33,50.0,13000000,0,0.0,2006,6.6
2,17 Again,Burr Steers,102,Matthew Perry,Hunter Parrish,Thomas Lennon,Comedy,20000000,139000000,44.44,25.0,119000000,0,0.0,2009,6.4
3,1982,Tommy Oliver,90,Bokeem Woodbine,Ruby Dee,Quinton Aaron,Drama,1000000,2000000,50.0,100.0,1000000,0,0.0,2013,7.1
4,2 Fast 2 Furious,John Singleton,107,Paul Walker,Cole Hauser,Mo Gallini,Action,76000000,236000000,75.0,42.85,160000000,0,0.0,2003,5.9


**2.12- View DataFrame Information**

In [5]:
# Print index range, per-column non-null counts and dtypes.
df.info(verbose=None)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3974 entries, 0 to 3973
Data columns (total 16 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Movie                                3974 non-null   object 
 1   Director                             3974 non-null   object 
 2   Running time                         3974 non-null   int64  
 3   Actor 1                              3974 non-null   object 
 4   Actor 2                              3974 non-null   object 
 5   Actor 3                              3972 non-null   object 
 6   Genre                                3974 non-null   object 
 7   Budget                               3974 non-null   int64  
 8   Box Office                           3974 non-null   int64  
 9   Actors Box Office %                  3974 non-null   float64
 10  Director Box Office %                3974 non-null   float64
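 11  Earnings                             3974 non-null   int64  
 12  Oscar and Golden Globes nominations  3974 non-null   int64  
 13  Oscar and Golden Globes awards       3971 non-null   float64
 14  Release year                         3974 non-null   int64  
 15  IMDb score                           3974 non-null   float64
dtypes: float64(4), int64(6), object(6)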

**2.13- Summary Statistics**

In [6]:
# Summary statistics for the numeric columns, with the default quartiles spelled out.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Running time,Budget,Box Office,Actors Box Office %,Director Box Office %,Earnings,Oscar and Golden Globes nominations,Oscar and Golden Globes awards,Release year,IMDb score
count,3974.0,3974.0,3974.0,3974.0,3974.0,3974.0,3974.0,3971.0,3974.0,3974.0
mean,109.967036,36906390.0,108677000.0,2984678000.0,52.780695,71770600.0,1.106694,0.32133,2002.043785,6.467866
std,22.507658,42704310.0,179835400.0,134468400000.0,35.360199,152657300.0,2.724019,1.209324,12.127027,1.072514
min,20.0,1100.0,50000.0,0.0,0.0,-323100000.0,0.0,0.0,1929.0,1.6
25%,95.0,9000000.0,12000000.0,33.33,25.0,0.0,0.0,0.0,1998.0,5.9
50%,106.0,22000000.0,43000000.0,57.14,50.0,18500000.0,0.0,0.0,2005.0,6.6
75%,120.0,50000000.0,125000000.0,83.33,80.0,81000000.0,1.0,0.0,2010.0,7.2
max,330.0,390000000.0,2923000000.0,6805556000000.0,100.0,2686000000.0,22.0,13.0,2016.0,9.3


**2.14- View Column Names**

In [7]:
# Index object holding the column labels, in file order.
column_names = df.columns
print(f'{column_names}')

Index(['Movie', 'Director', 'Running time', 'Actor 1', 'Actor 2', 'Actor 3',
       'Genre', 'Budget', 'Box Office', 'Actors Box Office %',
       'Director Box Office %', 'Earnings',
       'Oscar and Golden Globes nominations', 'Oscar and Golden Globes awards',
       'Release year', 'IMDb score'],
      dtype='object')


**2.15- Shape of the Data**

In [8]:
# (number of rows, number of columns)
shape = (len(df.index), len(df.columns))
print(f'Shape of the DataFrame: {shape}')

Shape of the DataFrame: (3974, 16)


**2.2- Missing Values**

**2.21- Identifying Missing Values**

In [9]:
# Number of missing cells per column (isna is the canonical name for isnull).
missing_values = df.isna().sum()
print(missing_values)

Movie                                  0
Director                               0
Running time                           0
Actor 1                                0
Actor 2                                0
Actor 3                                2
Genre                                  0
Budget                                 0
Box Office                             0
Actors Box Office %                    0
Director Box Office %                  0
Earnings                               0
Oscar and Golden Globes nominations    0
Oscar and Golden Globes awards         3
Release year                           0
IMDb score                             0
dtype: int64


**2.22- Dropping Rows or Columns with Missing Values**

In [10]:
# Only 5 cells are missing in 3974 rows ('Actor 3': 2, 'Oscar and Golden Globes awards': 3),
# so dropping the affected rows removes the gaps while keeping every column.
df_cleaned = df.dropna()

# Alternative: drop every column that contains a missing value. This discards two whole
# columns, so it is kept under its own name instead of overwriting the row-wise result.
df_complete_columns = df.dropna(axis=1)

# **3. Data Cleaning**

**3.1- Identifying Outliers Using Z-Score**

In [11]:
from scipy import stats

# Calculate column-wise Z-scores for the numeric columns.
# 'Oscar and Golden Globes awards' contains missing values; with scipy's default
# nan_policy='propagate' a single NaN makes that column's mean/std NaN and therefore
# every Z-score in the column NaN, so none of its outliers could ever be flagged.
# nan_policy='omit' ignores NaNs when computing the column statistics.
z_scores = stats.zscore(
    df.select_dtypes(include=['float64', 'int64']),
    nan_policy='omit',
)

# Identify outliers: rows where any numeric column has Z-score > 3 or < -3.
# A NaN Z-score (missing input value) compares False and is not treated as an outlier.
df_outliers = df[(z_scores > 3).any(axis=1) | (z_scores < -3).any(axis=1)]

**3.2- Remove Outliers**

In [12]:
# Remove rows with outliers based on Z-score.
# Expressed as the exact complement of the outlier test (|Z| > 3 in any column):
#   * a NaN Z-score compares False, so it no longer disqualifies a row on its own
#     (the previous `(z < 3).all() & (z > -3).all()` form rejected every row whose
#     Z-scores contained NaN);
#   * rows with |Z| exactly equal to 3 are kept rather than falling in neither set.
df_no_outliers = df[~(abs(z_scores) > 3).any(axis=1)]

# **4. Movies Analytics**

**How Does the Budget of a Movie Influence Its Box Office Performance?**

In [13]:
import plotly.express as px

# Budget on x, box office on y; one marker per row of df, coloured by genre.
fig = px.scatter(
    data_frame=df,
    x='Budget',
    y='Box Office',
    labels={'Box Office': 'Box Office Earnings (in $)', 'Budget': 'Budget (in $)'},
    title='How Does the Budget of a Movie Influence Its Box Office Performance?',
    color='Genre',
)
fig.show()

**Which Directors Have the Highest Average Box Office Percentages?**

In [14]:
# Mean 'Director Box Office %' per director (all directors are kept in this table).
avg_box_office_by_director = df.groupby('Director')['Director Box Office %'].mean().reset_index()

# The question asks for the *highest* averages, so chart only the leading directors
# (sorted descending) instead of one unsorted bar for every director in the data.
top_n = 20
fig = px.bar(avg_box_office_by_director.nlargest(top_n, 'Director Box Office %'),
              x='Director', y='Director Box Office %',
              title='Which Directors Have the Highest Average Box Office Percentages?',
              labels={'Director Box Office %': 'Average Box Office %'},
              color='Director Box Office %',
              color_continuous_scale=px.colors.sequential.Viridis)
fig.show()

**How Does the Number of Oscars and Golden Globes Nominations Correlate with IMDb Scores?**

In [15]:
# Award nominations on x against IMDb score on y, coloured by genre.
fig = px.scatter(
    data_frame=df,
    x='Oscar and Golden Globes nominations',
    y='IMDb score',
    labels={'Oscar and Golden Globes nominations': 'Oscars and Golden Globes Nominations'},
    title='How Does the Number of Oscars and Golden Globes Nominations Correlate with IMDb Scores?',
    color='Genre',
)
fig.show()

**How Do Box Office Earnings Vary Across Different Genres?**

In [16]:
# Calculate the total box office earnings by genre.
# The duplicate check earlier in this notebook found fully duplicated rows in df;
# they are dropped here so the same movie is not counted twice in a genre's total.
total_box_office_by_genre = (
    df.drop_duplicates()
      .groupby('Genre')['Box Office']
      .sum()
      .reset_index()
)

fig = px.bar(total_box_office_by_genre, x='Genre', y='Box Office',
             title='How Do Box Office Earnings Vary Across Different Genres?',
             labels={'Box Office': 'Total Box Office Earnings (in $)'},
             color='Box Office',
             color_continuous_scale=px.colors.sequential.Plasma)
fig.show()

**What Is the Relationship Between Running Time and IMDb Score?**

In [17]:
# Running time (minutes) on x against IMDb score on y, coloured by genre.
fig = px.scatter(
    data_frame=df,
    x='Running time',
    y='IMDb score',
    labels={'IMDb score': 'IMDb Score', 'Running time': 'Running Time (in minutes)'},
    title='What Is the Relationship Between Running Time and IMDb Score?',
    color='Genre',
)
fig.show()

**Which Actors Have the Highest Average Earnings?**

In [18]:
# Calculate average earnings by Actor 1 (all lead actors are kept in this table).
avg_earnings_by_actor1 = df.groupby('Actor 1')['Earnings'].mean().reset_index()

# The question asks for the *highest* averages, so chart only the leading actors
# (sorted descending) instead of one unsorted bar for every actor in the data.
top_n = 20
fig = px.bar(avg_earnings_by_actor1.nlargest(top_n, 'Earnings'),
              x='Actor 1', y='Earnings',
              title='Which Actors Have the Highest Average Earnings?',
              labels={'Earnings': 'Average Earnings (in $)'},
              color='Earnings',
              color_continuous_scale=px.colors.sequential.Magma)
fig.show()

**How Do Director Box Office Percentages Compare by Genre?**

In [19]:
# Mean 'Director Box Office %' for each genre, as a two-column frame.
avg_director_box_office_by_genre = (
    df.groupby('Genre', as_index=False)['Director Box Office %'].mean()
)

fig = px.bar(
    data_frame=avg_director_box_office_by_genre,
    x='Genre',
    y='Director Box Office %',
    color='Director Box Office %',
    color_continuous_scale=px.colors.sequential.Cividis,
    labels={'Director Box Office %': 'Average Director Box Office %'},
    title='How Do Director Box Office Percentages Compare by Genre?',
)
fig.show()

**How Do the Earnings of Movies Change Over the Years?**

In [20]:
# Calculate total earnings by release year.
# Fully duplicated rows (reported by the duplicate check earlier in this notebook)
# are dropped first so a movie is not counted twice in its year's total.
total_earnings_by_year = (
    df.drop_duplicates()
      .groupby('Release year')['Earnings']
      .sum()
      .reset_index()
)

fig = px.scatter(total_earnings_by_year, x='Release year', y='Earnings',
                 title='How Does the Earnings of Movies Change Over the Years?',
                 labels={'Release year': 'Release Year', 'Earnings': 'Total Earnings (in $)'},
                 color='Earnings',
                 color_continuous_scale=px.colors.sequential.Rainbow)
fig.show()

**What Is the Distribution of Oscar and Golden Globes Awards by Genre?**

In [21]:
# Calculate the total awards by genre.
# Fully duplicated rows (reported by the duplicate check earlier in this notebook)
# are dropped first so a movie's awards are not counted twice.
# Missing award counts are skipped by sum().
total_awards_by_genre = (
    df.drop_duplicates()
      .groupby('Genre')['Oscar and Golden Globes awards']
      .sum()
      .reset_index()
)

fig = px.bar(total_awards_by_genre, x='Genre', y='Oscar and Golden Globes awards',
              title='What Is the Distribution of Oscar and Golden Globes Awards by Genre?',
              labels={'Oscar and Golden Globes awards': 'Total Awards'},
              color='Oscar and Golden Globes awards',
              color_continuous_scale=px.colors.sequential.Bluyl)
fig.show()

**How Do Average Running Times Differ Among Movie Genres?**

In [22]:
# Mean running time for each genre, as a two-column frame.
avg_running_time_by_genre = (
    df.groupby('Genre', as_index=False)['Running time'].mean()
)

fig = px.bar(
    data_frame=avg_running_time_by_genre,
    x='Genre',
    y='Running time',
    color='Running time',
    color_continuous_scale=px.colors.sequential.Greens,
    labels={'Running time': 'Average Running Time (in minutes)'},
    title='How Do Average Running Times Differ Among Movie Genres?',
)
fig.show()

**Which Movies Have the Highest Box Office Earnings?**

In [23]:
# Select the top 10 movies by box office earnings.
# df contains fully duplicated rows (see the duplicate check earlier in this notebook).
# A duplicated blockbuster would occupy two of the ten slots, and px.bar would stack
# both rows on the same 'Movie' label, doubling its bar. Drop duplicates first.
top_10_movies = df.drop_duplicates().nlargest(10, 'Box Office')

fig = px.bar(top_10_movies, x='Movie', y='Box Office',
              title='Which Movies Have the Highest Box Office Earnings?',
              labels={'Box Office': 'Box Office Earnings (in $)'},
              color='Box Office',
              color_continuous_scale=px.colors.sequential.Oranges)
fig.show()

**How Does IMDb Score Change with Increasing Budget?**

In [24]:
# Budget on x against IMDb score on y, coloured by genre, with an OLS trendline overlay.
fig = px.scatter(
    data_frame=df,
    x='Budget',
    y='IMDb score',
    color='Genre',
    trendline='ols',
    labels={'IMDb score': 'IMDb Score', 'Budget': 'Budget (in $)'},
    title='How Does IMDb Score Change with Increasing Budget?',
)
fig.show()

**What Is the Average Number of Oscar and Golden Globes Awards by Director?**

In [25]:
# Mean number of awards per director, as a two-column frame.
avg_awards_by_director = (
    df.groupby('Director', as_index=False)['Oscar and Golden Globes awards'].mean()
)

fig = px.bar(
    data_frame=avg_awards_by_director,
    x='Director',
    y='Oscar and Golden Globes awards',
    color='Oscar and Golden Globes awards',
    color_continuous_scale=px.colors.sequential.Purples,
    labels={'Oscar and Golden Globes awards': 'Average Awards'},
    title='What Is the Average Number of Oscar and Golden Globes Awards by Director?',
)
fig.show()

**How Do Earnings Compare Across Different Release Years?**

In [26]:
# Calculate total earnings by release year.
# Fully duplicated rows (reported by the duplicate check earlier in this notebook)
# are dropped first so a movie is not counted twice in its year's total.
total_earnings_by_year = (
    df.drop_duplicates()
      .groupby('Release year')['Earnings']
      .sum()
      .reset_index()
)

fig = px.scatter(total_earnings_by_year, x='Release year', y='Earnings',
                 title='How Do Earnings Compare Across Different Release Years?',
                 labels={'Release year': 'Release Year', 'Earnings': 'Total Earnings (in $)'},
                 color='Earnings',
                 color_continuous_scale=px.colors.sequential.Turbo)
fig.show()

**Which Actors Have Contributed the Most to Box Office Earnings?**

In [27]:
# Calculate total box office earnings by Actor 1.
# Fully duplicated rows (reported by the duplicate check earlier in this notebook)
# are dropped first so a movie is not counted twice in an actor's total.
total_box_office_by_actor1 = (
    df.drop_duplicates()
      .groupby('Actor 1')['Box Office']
      .sum()
      .reset_index()
)

# The question asks who contributed the *most*, so chart only the leading actors
# (sorted descending) instead of one unsorted bar for every actor in the data.
top_n = 20
fig = px.bar(total_box_office_by_actor1.nlargest(top_n, 'Box Office'),
              x='Actor 1', y='Box Office',
              title='Which Actors Have Contributed the Most to Box Office Earnings?',
              labels={'Box Office': 'Total Box Office Earnings (in $)'},
              color='Box Office',
              color_continuous_scale=px.colors.sequential.Sunset)
fig.show()

**How Do Box Office Percentages Compare by Genre?**

In [28]:
# Mean 'Director Box Office %' for each genre, as a two-column frame.
avg_box_office_percent_by_genre = (
    df.groupby('Genre', as_index=False)['Director Box Office %'].mean()
)

fig = px.bar(
    data_frame=avg_box_office_percent_by_genre,
    x='Genre',
    y='Director Box Office %',
    color='Director Box Office %',
    color_continuous_scale=px.colors.sequential.Magma,
    labels={'Director Box Office %': 'Average Box Office %'},
    title='How Do Box Office Percentages Compare by Genre?',
)
fig.show()

**What Is the Distribution of IMDb Scores by Release Year?**

In [29]:
# Mean IMDb score for each release year, as a two-column frame.
avg_imdb_score_by_year = (
    df.groupby('Release year', as_index=False)['IMDb score'].mean()
)

fig = px.scatter(
    data_frame=avg_imdb_score_by_year,
    x='Release year',
    y='IMDb score',
    color='IMDb score',
    color_continuous_scale=px.colors.sequential.Rainbow,
    labels={'IMDb score': 'Average IMDb Score', 'Release year': 'Release Year'},
    title='What Is the Distribution of IMDb Scores by Release Year?',
)
fig.show()

**How Do Oscar and Golden Globes Nominations Affect Earnings?**

In [30]:
# Award nominations on x against earnings on y, one marker per row of df.
# 'Genre' is a categorical (object) column, so Plotly Express colours it with its
# discrete colour sequence; a continuous colour scale only applies to numeric
# colour columns and would be ignored here, so none is passed.
fig = px.scatter(df, x='Oscar and Golden Globes nominations', y='Earnings',
                 title='How Do Oscar and Golden Globes Nominations Affect Earnings?',
                 labels={'Oscar and Golden Globes nominations': 'Oscar and Golden Globes Nominations',
                         'Earnings': 'Earnings (in $)'},
                 color='Genre')
fig.show()

**Which Directors Have the Highest Average Box Office Earnings?**

In [31]:
# Calculate average box office earnings by director (all directors are kept in this table).
avg_box_office_by_director = df.groupby('Director')['Box Office'].mean().reset_index()

# The question asks for the *highest* averages, so chart only the leading directors
# (sorted descending) instead of one unsorted bar for every director in the data.
top_n = 20
fig = px.bar(avg_box_office_by_director.nlargest(top_n, 'Box Office'),
              x='Director', y='Box Office',
              title='Which Directors Have the Highest Average Box Office Earnings?',
              labels={'Box Office': 'Average Box Office Earnings (in $)'},
              color='Box Office',
              color_continuous_scale=px.colors.sequential.Blugrn)
fig.show()

**How Does the Number of Awards Correlate with IMDb Score?**

In [32]:
# Awards won on x against IMDb score on y, one marker per row of df.
# 'Genre' is a categorical (object) column, so Plotly Express colours it with its
# discrete colour sequence; a continuous colour scale only applies to numeric
# colour columns and would be ignored here, so none is passed.
fig = px.scatter(df, x='Oscar and Golden Globes awards', y='IMDb score',
                 title='How Does the Number of Awards Correlate with IMDb Score?',
                 labels={'Oscar and Golden Globes awards': 'Oscar and Golden Globes Awards',
                         'IMDb score': 'IMDb Score'},
                 color='Genre')
fig.show()

**What Is the Trend of Box Office Earnings Over the Years?**

In [33]:
# Calculate total box office earnings by release year.
# Fully duplicated rows (reported by the duplicate check earlier in this notebook)
# are dropped first so a movie is not counted twice in its year's total.
total_box_office_by_year = (
    df.drop_duplicates()
      .groupby('Release year')['Box Office']
      .sum()
      .reset_index()
)

# Single line with a marker at every year.
fig = px.line(total_box_office_by_year, x='Release year', y='Box Office',
               title='What Is the Trend of Box Office Earnings Over the Years?',
               labels={'Release year': 'Release Year', 'Box Office': 'Total Box Office Earnings (in $)'},
               markers=True,
               color_discrete_sequence=['#FF5733'])
fig.show()

**How Does the Budget Affect Box Office Earnings by Genre?**

In [34]:
# Budget on x against box office on y, one marker per row of df.
# 'Genre' is a categorical (object) column, so Plotly Express colours it with its
# discrete colour sequence; a continuous colour scale only applies to numeric
# colour columns and would be ignored here, so none is passed.
fig = px.scatter(df, x='Budget', y='Box Office',
                 title='How Does the Budget Affect Box Office Earnings by Genre?',
                 labels={'Budget': 'Budget (in $)', 'Box Office': 'Box Office Earnings (in $)'},
                 color='Genre')
fig.show()

**What Are the Average Earnings by Actor 1?**

In [35]:
# Mean earnings for each lead actor ('Actor 1'), as a two-column frame.
avg_earnings_by_actor1 = (
    df.groupby('Actor 1', as_index=False)['Earnings'].mean()
)

fig = px.bar(
    data_frame=avg_earnings_by_actor1,
    x='Actor 1',
    y='Earnings',
    color='Earnings',
    color_continuous_scale=px.colors.sequential.Sunset,
    labels={'Earnings': 'Average Earnings (in $)'},
    title='What Are the Average Earnings by Actor 1?',
)
fig.show()

**How Do Director Box Office Percentages Compare with Earnings?**

In [36]:
# 'Director Box Office %' on x against earnings on y, one marker per row of df.
# 'Genre' is a categorical (object) column, so Plotly Express colours it with its
# discrete colour sequence; a continuous colour scale only applies to numeric
# colour columns and would be ignored here, so none is passed.
fig = px.scatter(df, x='Director Box Office %', y='Earnings',
                 title='How Do Director Box Office Percentages Compare with Earnings?',
                 labels={'Director Box Office %': 'Director Box Office %',
                         'Earnings': 'Earnings (in $)'},
                 color='Genre')
fig.show()

**How Do Earnings Vary by Genre?**

In [37]:
# Calculate total earnings by genre.
# Fully duplicated rows (reported by the duplicate check earlier in this notebook)
# are dropped first so a movie is not counted twice in its genre's total.
total_earnings_by_genre = (
    df.drop_duplicates()
      .groupby('Genre')['Earnings']
      .sum()
      .reset_index()
)

fig = px.bar(total_earnings_by_genre, x='Genre', y='Earnings',
              title='How Do Earnings Vary by Genre?',
              labels={'Earnings': 'Total Earnings (in $)'},
              color='Earnings',
              color_continuous_scale=px.colors.sequential.Inferno)
fig.show()

**What Is the Relationship Between Running Time and IMDb Score?**

In [38]:
# Running time (minutes) on x against IMDb score on y, one marker per row of df.
# 'Genre' is a categorical (object) column, so Plotly Express colours it with its
# discrete colour sequence; a continuous colour scale only applies to numeric
# colour columns and would be ignored here, so none is passed.
fig = px.scatter(df, x='Running time', y='IMDb score',
                 title='What Is the Relationship Between Running Time and IMDb Score?',
                 labels={'Running time': 'Running Time (in minutes)',
                         'IMDb score': 'IMDb Score'},
                 color='Genre')
fig.show()

**How Do Box Office Earnings Compare Across Different Directors?**

In [39]:
# Calculate total box office earnings by director.
# Fully duplicated rows (reported by the duplicate check earlier in this notebook)
# are dropped first so a movie is not counted twice in a director's total.
total_box_office_by_director = (
    df.drop_duplicates()
      .groupby('Director')['Box Office']
      .sum()
      .reset_index()
)

fig = px.bar(total_box_office_by_director, x='Director', y='Box Office',
              title='How Do Box Office Earnings Compare Across Different Directors?',
              labels={'Box Office': 'Total Box Office Earnings (in $)'},
              color='Box Office',
              color_continuous_scale=px.colors.sequential.RdBu)
fig.show()

**How Do Actor 3 Earnings Compare Across Different Genres?**

In [40]:
# Calculate total earnings by Actor 3 and Genre.
# Fully duplicated rows (reported by the duplicate check earlier in this notebook)
# are dropped first so a movie is not counted twice. Rows with a missing 'Actor 3'
# are excluded by groupby's default handling of NaN keys.
total_earnings_by_actor3_genre = (
    df.drop_duplicates()
      .groupby(['Actor 3', 'Genre'])['Earnings']
      .sum()
      .reset_index()
)

# One bar segment per (actor, genre) pair, coloured by genre.
fig = px.bar(total_earnings_by_actor3_genre, x='Actor 3', y='Earnings',
              color='Genre',
              title='How Do Actor 3 Earnings Compare Across Different Genres?',
              labels={'Earnings': 'Total Earnings (in $)'},
              color_discrete_sequence=px.colors.qualitative.Set1)
fig.show()

**What Is the Correlation Between Budget and Box Office Earnings?**

In [41]:
# Budget on x against box office on y, one marker per row of df.
# 'Genre' is a categorical (object) column, so Plotly Express colours it with its
# discrete colour sequence; a continuous colour scale only applies to numeric
# colour columns and would be ignored here, so none is passed.
fig = px.scatter(df, x='Budget', y='Box Office',
                 title='What Is the Correlation Between Budget and Box Office Earnings?',
                 labels={'Budget': 'Budget (in $)',
                         'Box Office': 'Box Office Earnings (in $)'},
                 color='Genre')
fig.show()

**What Are the Earnings of Movies by Release Year?**

In [42]:
# Calculate total earnings by release year.
# Fully duplicated rows (reported by the duplicate check earlier in this notebook)
# are dropped first so a movie is not counted twice in its year's total.
total_earnings_by_year = (
    df.drop_duplicates()
      .groupby('Release year')['Earnings']
      .sum()
      .reset_index()
)

fig = px.bar(total_earnings_by_year, x='Release year', y='Earnings',
              title='What Are the Earnings of Movies by Release Year?',
              labels={'Earnings': 'Total Earnings (in $)'},
              color='Earnings')
fig.show()