In [69]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by later cells
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [70]:
cd /content/drive/MyDrive/InfoViz/Project/

/content/drive/MyDrive/InfoViz/Project


In [71]:
import pandas as pd

# Load the movie metadata CSV (relative path, resolved against the current working directory)
df = pd.read_csv('movie_metadata.csv')

# Display the first few rows, then the column labels
print(df.head())
print(df.columns)

   color      director_name  num_critic_for_reviews  duration  \
0  Color      James Cameron                   723.0     178.0   
1  Color     Gore Verbinski                   302.0     169.0   
2  Color         Sam Mendes                   602.0     148.0   
3  Color  Christopher Nolan                   813.0     164.0   
4    NaN        Doug Walker                     NaN       NaN   

   director_facebook_likes  actor_3_facebook_likes      actor_2_name  \
0                      0.0                   855.0  Joel David Moore   
1                    563.0                  1000.0     Orlando Bloom   
2                      0.0                   161.0      Rory Kinnear   
3                  22000.0                 23000.0    Christian Bale   
4                    131.0                     NaN        Rob Walker   

   actor_1_facebook_likes        gross                           genres  ...  \
0                  1000.0  760505847.0  Action|Adventure|Fantasy|Sci-Fi  ...   
1               

# **Design Task 1:**

**Domain Problem:** Users need to understand the global distribution of movie revenues.

**Task:** Show how movies perform in different countries and identify regions with high or
low revenue.

**Design Component:** A map visualization displaying budget and revenue by country helps
users identify regional differences in movie performance.


In [None]:
import pandas as pd
import plotly.express as px

# Load the CSV file
df = pd.read_csv('movie_metadata.csv')

# Keep only the columns the map needs. .copy() yields an independent frame so the
# numeric conversions below do not assign into a slice of the raw data.
df = df[['country', 'gross', 'budget']].copy()

# Convert 'gross' and 'budget' to numeric values; unparseable entries become NaN
df['gross'] = pd.to_numeric(df['gross'], errors='coerce')
df['budget'] = pd.to_numeric(df['budget'], errors='coerce')

# Aggregate by country: sum of gross and budget.
# min_count=1 leaves a total as NaN when a country has no valid value at all,
# instead of reporting a misleading 0. Rows without a country are excluded by groupby.
country_data = (
    df.groupby('country')[['gross', 'budget']]
    .sum(min_count=1)
    .reset_index()
)

# Drop countries with no known revenue so they are neither colored as 0
# nor selected as the "lowest revenue" country.
country_data = country_data.dropna(subset=['gross'])

# Identify the highest and lowest revenue countries
max_revenue_country = country_data.loc[country_data['gross'].idxmax()]
min_revenue_country = country_data.loc[country_data['gross'].idxmin()]

# Create a choropleth map for gross revenue with budget as hover information
fig = px.choropleth(
    country_data,
    locations="country",  # Column with country names
    locationmode="country names",  # Match locations with country names
    color="gross",  # Data for coloring
    color_continuous_scale="RdYlGn",  # Red to Yellow to Green scale
    labels={'gross': 'Total Revenue (Gross)', 'budget': 'Total Budget'},
    # Format both totals with thousands separators in the hover box
    hover_data={'budget': ':,.0f', 'gross': ':,.0f'}
)

# Update layout: bold, centered title.
# Bold is applied with a <b> tag in the title text, which does not depend on the
# plotly version supporting a font 'weight' property.
fig.update_layout(
    title={
        'text': "<b>Global Distribution of Movie Revenue</b>",
        'x': 0.5,  # Center the title
        'xanchor': 'center',
        'yanchor': 'top',
        'font': {'size': 20, 'family': 'Arial', 'color': 'black'}
    },
)

# Add annotations for highest and lowest revenue countries
fig.add_annotation(
    text=f"Highest Revenue: {max_revenue_country['country']} ({max_revenue_country['gross']:,.0f})",
    xref="paper", yref="paper",
    x=0.5, y=0,  # Horizontally centered, at the bottom edge of the plotting area
    showarrow=False,
    font=dict(size=14, color="green")
)

fig.add_annotation(
    text=f"Lowest Revenue: {min_revenue_country['country']} ({min_revenue_country['gross']:,.0f})",
    xref="paper", yref="paper",
    x=0.5, y=-0.05,  # Slightly below the first annotation
    showarrow=False,
    font=dict(size=14, color="red")
)

# Show the map
fig.show()


# **Design Task 2:**

**Domain Problem:** Users need to understand the global distribution of movie revenues by
year.

**Task:** Show how movie performance varies by country and region each year.

**Design Component:** A map visualization with an animated year selector that allows users
to view budget and revenue by country, emphasizing global performance trends across
different years.


In [93]:
import pandas as pd
import plotly.express as px

# Load the CSV file
df = pd.read_csv('movie_metadata.csv')

# Keep only the columns the animated map needs. .copy() yields an independent
# frame so the conversions below do not assign into a slice of the raw data.
df = df[['country', 'gross', 'budget', 'title_year']].copy()

# Convert to numeric values where necessary; unparseable entries become NaN
df['gross'] = pd.to_numeric(df['gross'], errors='coerce')
df['budget'] = pd.to_numeric(df['budget'], errors='coerce')
df['title_year'] = pd.to_numeric(df['title_year'], errors='coerce')

# Rows without a country or a year cannot be placed on the map or the timeline.
# After dropping them the year is stored as an integer so frame names, slider
# labels and titles read "2009" rather than "2009.0".
df = (
    df.dropna(subset=['country', 'title_year'])
    .assign(title_year=lambda frame: frame['title_year'].astype(int))
)

# Aggregate by country and year: sum of gross and budget.
# min_count=1 leaves a total as NaN when a group has no valid value at all,
# instead of reporting a misleading 0.
country_year_data = (
    df.groupby(['country', 'title_year'])[['gross', 'budget']]
    .sum(min_count=1)
    .reset_index()
)

# Drop country/year pairs with no known revenue so they are neither colored as 0
# nor selected as that year's "lowest revenue" country.
country_year_data = country_year_data.dropna(subset=['gross'])

# Sort the data by 'title_year' to ensure the animation goes from low to high year
country_year_data = country_year_data.sort_values(by="title_year")

# Create the choropleth map with animation
fig = px.choropleth(
    country_year_data,
    locations="country",  # Column with country names
    locationmode="country names",  # Match locations with country names
    color="gross",  # Data for coloring
    animation_frame="title_year",  # Animate by year
    color_continuous_scale="RdYlGn",  # Red to Yellow to Green scale
    labels={'gross': 'Total Revenue (Gross)', 'title_year': 'Year', 'budget': 'Total Budget'},
    # Year plus both totals (with thousands separators) in the hover box
    hover_data={'title_year': True, 'budget': ':,.0f', 'gross': ':,.0f'}
)

# Function to create dynamic annotations for each year (minimum and maximum revenue)
def get_year_annotations(year_data, year):
    """Build the highest/lowest revenue annotations for one year's data.

    Args:
        year_data: DataFrame with 'country' and 'gross' columns holding the
            rows of a single year.
        year: Year shown in the annotation text. Whole-number values such as
            2009.0 are rendered as 2009; anything else is rendered as given.

    Returns:
        A list of two annotation dicts (highest revenue first, then lowest),
        or an empty list when no row has a non-missing 'gross'.
    """
    # Rows with missing gross cannot be ranked; without any ranked row there is
    # nothing to annotate (idxmax/idxmin would fail on an empty selection).
    ranked = year_data.dropna(subset=['gross'])
    if ranked.empty:
        return []

    max_revenue = ranked.loc[ranked['gross'].idxmax()]
    min_revenue = ranked.loc[ranked['gross'].idxmin()]

    # Years read from CSV often arrive as floats; show 2009.0 as 2009
    try:
        year_label = int(year) if float(year).is_integer() else year
    except (TypeError, ValueError):
        year_label = year

    # Create annotations for the year, stacked below the map
    annotations = [
        dict(
            xref="paper", yref="paper",
            x=0.5, y=-0.05,
            text=f"Highest Revenue ({year_label}): {max_revenue['country']} ({max_revenue['gross']:,.0f})",
            showarrow=False,
            font=dict(size=14, color="green"),
            visible=True
        ),
        dict(
            xref="paper", yref="paper",
            x=0.5, y=-0.1,
            text=f"Lowest Revenue ({year_label}): {min_revenue['country']} ({min_revenue['gross']:,.0f})",
            showarrow=False,
            font=dict(size=14, color="red"),
            visible=True
        )
    ]
    return annotations

# Add dynamic annotations for each year (minimum and maximum revenue).
# The frames that px.choropleth generated are kept and only a layout is attached
# to each of them. Hand-built frames that set just `locations` and `z` would drop
# the per-frame hover data px produced, so the hover fields would not follow the
# animation.
year_lookup = {str(year): year for year in country_year_data['title_year'].unique()}

frames = list(fig.frames)

for frame in frames:
    # Frame names are the string form of the animation_frame value
    year = year_lookup.get(frame.name)
    if year is None:
        # No matching year in the data; leave this frame untouched
        continue

    # Filter data for the specific year
    year_data = country_year_data[country_year_data['title_year'] == year]

    # Whole-number label so the text reads "2009" even if the column is float
    year_label = int(year)

    frame.layout = dict(
        annotations=get_year_annotations(year_data, year_label),
        title=dict(text=f"Global Distribution of Movie Revenue Year: {year_label}"),  # Update title with year
        hovermode='closest',
        geo=dict(showcoastlines=True, coastlinecolor="Black")
    )

# Add frames to the figure
fig.frames = frames

# Animation controls: Play steps through the frames one per second, starting
# from whichever frame is currently displayed; Pause stops immediately.
play_button = dict(
    label='Play',
    method='animate',
    args=[None, dict(
        frame=dict(duration=1000, redraw=True),
        fromcurrent=True,
    )],
)

pause_button = dict(
    label='Pause',
    method='animate',
    args=[[None], dict(
        frame=dict(duration=0, redraw=False),
        mode='immediate',
        transition=dict(duration=0),
    )],
)

# Button group placement, in paper coordinates
animation_menu = dict(
    type='buttons',
    buttons=[play_button, pause_button],
    direction='left',
    pad=dict(r=10, t=87),
    showactive=False,
    x=0.1,
    xanchor='right',
    y=0,
    yanchor='top',
)

# Title shown before any frame supplies its own title
default_title = dict(
    text="Global Distribution of Movie Revenue Over Time",
    x=0.5,
    xanchor='center',
    yanchor='top',
    font=dict(size=20, family='Arial', color='black'),
)

fig.update_layout(title=default_title, updatemenus=[animation_menu])

# Render the animated map
fig.show()


# **Design Task 3:**

**Domain Problem:** Users want to compare IMDb scores of movies across countries.

**Task:** Allow users to compare movie ratings by country.

**Design Component:** A box plot compares IMDb scores by country, enabling users to
analyze regional rating differences.

In [None]:
import pandas as pd
import plotly.express as px

# Load the CSV file
df = pd.read_csv('movie_metadata.csv')

# Keep only the columns used by the plot. .copy() yields an independent frame so
# the conversion below does not assign into a slice of the raw data.
df = df[['country', 'imdb_score']].copy()

# Ensure 'imdb_score' is numeric; unparseable entries become NaN
df['imdb_score'] = pd.to_numeric(df['imdb_score'], errors='coerce')

# Drop incomplete rows only after the conversion, so values that were coerced
# to NaN are removed together with the originally missing ones.
df = df.dropna(subset=['country', 'imdb_score'])

# Sort the countries alphabetically
sorted_countries = sorted(df['country'].unique())

# Create the box plot comparing IMDb scores by country
# (the title is set once, in update_layout below)
fig = px.box(
    df,
    x="country",  # Country on the x-axis
    y="imdb_score",  # IMDb score on the y-axis
    labels={'imdb_score': 'IMDb Score', 'country': 'Country'},  # Axis labels
    color="country",  # Color by country (optional for better distinction)
    category_orders={"country": sorted_countries},  # Sort countries alphabetically
    notched=True  # Adds a notch to the box plot for better visualization
)

# Update layout for better presentation
fig.update_layout(
    title={
        'text': "IMDb Scores by Country",
        'x': 0.5,  # Center the title
        'xanchor': 'center',
        'yanchor': 'top',
        'font': {'size': 20, 'family': 'Arial', 'color': 'black'}
    },
    xaxis_title="Country",  # X-axis title
    yaxis_title="IMDb Score",  # Y-axis title
    showlegend=False,  # The x-axis already names each country
    xaxis=dict(
        tickangle=270  # Rotate the x-axis labels to be vertical (parallel to the y-axis)
    )
)

# Show the plot
fig.show()


# **Design Task 4:**

**Domain Problem:** Users need to compare IMDb scores across different genres.

**Task:** Allow users to compare ratings based on genre.

**Design Component:** A bar chart displaying average IMDb scores across genres, helping users quickly
assess genre-based rating differences.

In [None]:
import pandas as pd
import plotly.express as px

# Read the raw movie metadata
data = pd.read_csv('movie_metadata.csv')

# One row per (movie, genre): missing genre strings are labelled 'Unknown',
# the pipe-separated string is split into a list and exploded, and rows
# without an IMDb score are discarded.
data = (
    data.assign(genres=data['genres'].fillna('Unknown').str.split('|'))
    .explode('genres')
    .dropna(subset=['imdb_score'])
)

# Mean IMDb score per genre, best-rated genre first
genre_avg_scores = (
    data.groupby('genres')['imdb_score']
    .mean()
    .reset_index()
    .sort_values(by='imdb_score', ascending=False)
)

# Bar chart with the average printed on each bar
fig = px.bar(
    genre_avg_scores,
    x='genres',
    y='imdb_score',
    text='imdb_score',
    title='Average IMDb Scores Across Genres',
    labels={'genres': 'Genre', 'imdb_score': 'Average IMDb Score'},
)

# Two-decimal labels placed above the bars
fig.update_traces(textposition='outside', texttemplate='%{text:.2f}')

# Axis titles, slanted genre names, no legend
fig.update_layout(
    showlegend=False,
    xaxis=dict(title=dict(text='Genre'), tickangle=45),
    yaxis=dict(title=dict(text='Average IMDb Score')),
)

fig.show()


# **Design Task 5:**

**Domain Problem:** Users want to see IMDb score trends over time.

**Task:** Track and visualize changes in IMDb scores over time.

**Design Component:** Line graphs show IMDb score trends, allowing users to identify
patterns in ratings over time.

In [None]:
import pandas as pd
import plotly.express as px

# Read the metadata, keeping only rows that have a score, a release year and genres
data = pd.read_csv('movie_metadata.csv').dropna(subset=['imdb_score', 'title_year', 'genres'])

# Turn the pipe-separated genre string into a list, then emit one row per genre
data = data.assign(genres=data['genres'].str.split('|'))
expanded_data = data.explode('genres')

# Mean IMDb score for every (year, genre) pair, with display-ready column names
trend_data = (
    expanded_data
    .groupby(['title_year', 'genres'])['imdb_score']
    .mean()
    .reset_index()
    .rename(columns={'title_year': 'Year', 'genres': 'Genre', 'imdb_score': 'Average IMDb Score'})
)

# One line per genre across the years
fig = px.line(
    trend_data,
    x='Year',
    y='Average IMDb Score',
    color='Genre',
    template='plotly_white',
    title='IMDb Score Trends Over Time by Genre',
    labels={'Year': 'Year', 'Average IMDb Score': 'Average IMDb Score', 'Genre': 'Genre'},
)

# Add the dropdown for filtering genres.
# Each button sets one visibility flag per trace, so the flags are derived from
# the figure's own traces (computed once). This keeps every flag aligned with the
# trace it toggles, without relying on the DataFrame's unique-value order
# matching the order in which the traces were created.
trace_genres = [trace.name for trace in fig.data]

genre_buttons = [
    {"label": "All Genres", "method": "update", "args": [{"visible": [True] * len(trace_genres)}]},
] + [
    {"label": genre, "method": "update", "args": [{"visible": [name == genre for name in trace_genres]}]}
    for genre in trace_genres
]

fig.update_layout(
    updatemenus=[
        {
            "buttons": genre_buttons,
            "direction": "down",
            "showactive": True,
        }
    ]
)

fig.show()
