In [1]:
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
# Step 1: Connect to the SQLite database
# Open 'movies.sqlite' through a URI with mode=rw so that a missing or
# misplaced file raises sqlite3.OperationalError right here. A plain
# sqlite3.connect("movies.sqlite") would silently create an empty database,
# and the first query would then fail with a confusing "no such table" error.
conn = sqlite3.connect("file:movies.sqlite?mode=rw", uri=True)
# Cursor for ad-hoc SQL; the pandas queries below pass `conn` directly.
cur = conn.cursor()

In [None]:
# Step 2: Fetch the data we'll visualize
# Pull budget, revenue, popularity and release_date for every movie where the
# three numeric fields are all present (non-NULL).
query = (
    "SELECT budget, revenue, popularity, release_date "
    "FROM movies "
    "WHERE budget IS NOT NULL AND revenue IS NOT NULL AND popularity IS NOT NULL;"
)
movie_data = pd.read_sql(sql=query, con=conn)

In [None]:
# Step 3: Create a basic histogram using Matplotlib
# Distribution of movie budgets, drawn through the explicit Axes interface.
budget_fig, budget_ax = plt.subplots(figsize=(10, 6))
budget_ax.hist(movie_data['budget'], bins=50, color='skyblue', edgecolor='black')
budget_ax.set_title('Distribution of Movie Budgets')
budget_ax.set_xlabel('Budget ($)')
budget_ax.set_ylabel('Frequency')
budget_ax.grid(True)
plt.show()

This histogram shows the distribution of movie budgets. It reveals whether most movies have low budgets and whether there are high-budget outliers.

In [None]:
# Step 4: Create a heatmap using Seaborn
# A heatmap to visualize the correlations between budget, revenue, and popularity.
# Only the numeric columns are selected: movie_data also holds release_date,
# and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0
# (older versions dropped them silently).
numeric_columns = ['budget', 'revenue', 'popularity']
corr = movie_data[numeric_columns].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=1)
plt.title('Correlation Heatmap of Movie Features')
plt.show()


The heatmap shows the pairwise correlations between budget, revenue, and popularity. It lets us check whether movies with higher budgets tend to have higher revenue or popularity.

In [None]:
# Step 5: Create an interactive scatter plot using Plotly
# Budget on the x-axis against revenue on the y-axis; hovering over a point
# shows its budget and revenue values.
axis_labels = {'budget': 'Movie Budget ($)', 'revenue': 'Movie Revenue ($)'}
fig = px.scatter(
    data_frame=movie_data,
    x='budget',
    y='revenue',
    hover_data=['budget', 'revenue'],
    labels=axis_labels,
    title='Movie Budget vs. Revenue',
)

fig.show()

This interactive scatter plot lets us explore the relationship between movie budgets and revenues. By hovering over the points, we can see individual data points and their values.


In [None]:
# Step 6: Explore additional insights (optional)

# For example, visualizing the number of movies released each year.
# The year is cast to INTEGER so Matplotlib draws a numeric axis; string years
# would be treated as categories, so years with no releases would leave no gap.
# Rows whose release_date is NULL or cannot be parsed by strftime are excluded;
# otherwise they would form a NULL group that cannot be placed on the axis.
query_years = """
    SELECT CAST(strftime('%Y', release_date) AS INTEGER) AS release_year,
           COUNT(*) AS movie_count
    FROM movies
    WHERE strftime('%Y', release_date) IS NOT NULL
    GROUP BY release_year
    ORDER BY release_year;
"""
movies_per_year = pd.read_sql(query_years, conn)

plt.figure(figsize=(10, 6))
plt.plot(movies_per_year['release_year'], movies_per_year['movie_count'], marker='o', color='green')
plt.title('Number of Movies Released Per Year')
plt.xlabel('Year')
plt.ylabel('Number of Movies')
plt.grid(True)
plt.xticks(rotation=45)
plt.show()


This line plot shows the number of movies released each year. It gives us insight into how movie production has changed over time.


In [None]:
# Close the SQLite connection now that all queries have run.
conn.close()