In [1]:
import pandas as pd

# Load the dataset
file_path = '/kaggle/input/netflix-engagement-report/What_We_Watched_A_Netflix_Engagement_Report_2023Jan-Jun.csv'
netflix_df = pd.read_csv(file_path)

# Display basic information and statistics about the dataset
dataset_info = netflix_df.info()
dataset_describe = netflix_df.describe()

print(dataset_info)
print(dataset_describe)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18220 entries, 0 to 18219
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Title                18214 non-null  object 
 1   Available Globally?  18214 non-null  object 
 2   Release Date         4855 non-null   object 
 3   Hours Viewed         18214 non-null  object 
 4   Unnamed: 4           0 non-null      float64
 5   Unnamed: 5           0 non-null      float64
dtypes: float64(2), object(4)
memory usage: 854.2+ KB
None
       Unnamed: 4  Unnamed: 5
count         0.0         0.0
mean          NaN         NaN
std           NaN         NaN
min           NaN         NaN
25%           NaN         NaN
50%           NaN         NaN
75%           NaN         NaN
max           NaN         NaN


In [2]:
# Check for missing values in each column and handle them
missing_values = netflix_df.isnull().sum()
netflix_df_cleaned = netflix_df.dropna()

print("Missing Values:\n", missing_values)
print("\nDataset After Cleaning:\n", netflix_df_cleaned.head())

Missing Values:
 Title                      6
Available Globally?        6
Release Date           13365
Hours Viewed               6
Unnamed: 4             18220
Unnamed: 5             18220
dtype: int64

Dataset After Cleaning:
 Empty DataFrame
Columns: [Title, Available Globally?, Release Date, Hours Viewed, Unnamed: 4, Unnamed: 5]
Index: []


In [3]:
# Explore unique values in the "Available Globally?" column
unique_values_global = netflix_df['Available Globally?'].value_counts()

print("Unique Values in 'Available Globally?' column:\n", unique_values_global)


Unique Values in 'Available Globally?' column:
 Available Globally?
No     13700
Yes     4514
Name: count, dtype: int64


In [4]:
# Calculate total hours viewed for each title
total_hours_viewed = netflix_df.groupby('Title')['Hours Viewed'].sum()

print("Total Hours Viewed for Each Title:\n", total_hours_viewed)


Total Hours Viewed for Each Title:
 Title
#Alive // #살아있다                               10,700,000
#AnneFrank - Parallel Stories                    800,000
#AtFirstSight // #Sohavégetnemérös               200,000
#FriendButMarried // #TemanTapiMenikah           200,000
#FriendButMarried 2 // #TemanTapiMenikah 2       200,000
                                                 ...    
레드슈즈                                             200,000
비상선언                                          18,600,000
선생 김봉두                                           100,000
침묵                                               400,000
표적                                               200,000
Name: Hours Viewed, Length: 18214, dtype: object


**What is the distribution of hours viewed across all titles in the dataset?**

In [5]:
import plotly.express as px

# Create a histogram for hours viewed using Plotly
fig = px.histogram(netflix_df, x='Hours Viewed', nbins=20, title='Distribution of Hours Viewed Across Titles',
                   labels={'Hours Viewed': 'Hours Viewed', 'count': 'Frequency'})
fig.show()


**What is the proportion of titles available globally compared to those not available globally?**

In [6]:
# Calculate the proportion of titles available globally
global_proportion = netflix_df['Available Globally?'].value_counts(normalize=True)

# Create a pie chart for the proportion of titles available globally using Plotly
fig = px.pie(global_proportion, names=global_proportion.index, title='Proportion of Titles Available Globally',
             labels={'Available Globally?': 'Title Availability'}, color=global_proportion.index,
             color_discrete_map={'Yes': 'lightgreen', 'No': 'lightcoral'})
fig.show()


**Can we visualize the trend of hours viewed over time? Create a line plot to show the trend**

In [7]:
# Create a line plot for the trend of hours viewed over time using Plotly
fig = px.line(netflix_df, x='Release Date', y='Hours Viewed', markers=True, title='Trend of Hours Viewed Over Time',
              labels={'Hours Viewed': 'Hours Viewed', 'Release Date': 'Release Date'})
fig.show()


**How is the distribution of hours viewed for each title? Visualize this using box plots**

In [8]:
# Create box plots for the distribution of hours viewed for each title using Plotly
fig = px.box(netflix_df, x='Title', y='Hours Viewed', title='Distribution of Hours Viewed for Each Title',
             labels={'Hours Viewed': 'Hours Viewed', 'Title': 'Title'}, height=1000)
fig.update_xaxes(tickangle=90)
fig.show()


**What is the overall distribution of hours viewed for titles available globally compared to those not available globally?**

In [9]:
# Create overlapping histograms for hours viewed based on title availability using Plotly
fig = px.histogram(netflix_df, x='Hours Viewed', color='Available Globally?',
                   marginal='rug', title='Distribution of Hours Viewed by Title Availability',
                   labels={'Hours Viewed': 'Hours Viewed', 'Available Globally?': 'Title Availability'},
                   color_discrete_map={'Yes': 'lightgreen', 'No': 'lightcoral'})
fig.show()


**Can we visualize the top 10 titles with the highest total hours viewed?**

In [10]:
# Calculate total hours viewed for each title
total_hours_viewed = netflix_df.groupby('Title')['Hours Viewed'].sum().sort_values(ascending=False).head(10)

# Create a bar chart for the top 10 titles with the highest total hours viewed using Plotly
fig = px.bar(total_hours_viewed, x=total_hours_viewed.index, y='Hours Viewed',
             title='Top 10 Titles with Highest Total Hours Viewed',
             labels={'Hours Viewed': 'Total Hours Viewed', 'index': 'Title'},
             color=total_hours_viewed.index, height=1000, width=1500)
fig.update_xaxes(tickangle=90)
fig.show()


**How does the distribution of hours viewed vary across different release months?**

In [11]:
# Convert 'Release Date' column to datetime
netflix_df['Release Date'] = pd.to_datetime(netflix_df['Release Date'])

# Extract month from the 'Release Date' column
netflix_df['Release Month'] = netflix_df['Release Date'].dt.month_name()

# Create a violin plot for the distribution of hours viewed across release months using Plotly
fig = px.violin(netflix_df, x='Release Month', y='Hours Viewed', title='Distribution of Hours Viewed Across Release Months',
                labels={'Hours Viewed': 'Hours Viewed', 'Release Month': 'Release Month'})
fig.update_xaxes(tickangle=45)
fig.show()



**ow does the distribution of hours viewed vary between titles available globally and those not available globally?**

In [12]:
# Create a box plot for the distribution of hours viewed based on title availability using Plotly
fig = px.box(netflix_df, x='Available Globally?', y='Hours Viewed',
             title='Distribution of Hours Viewed Between Titles Available Globally and Not Available Globally',
             labels={'Hours Viewed': 'Hours Viewed', 'Available Globally?': 'Title Availability'},
             color='Available Globally?', color_discrete_map={'Yes': 'lightgreen', 'No': 'lightcoral'})
fig.show()


**What is the correlation between the release date and the hours viewed?**

In [13]:
# Convert 'Hours Viewed' column to numeric, handling commas
netflix_df['Hours Viewed'] = netflix_df['Hours Viewed'].replace({',': ''}, regex=True).astype(float)

# Calculate the correlation matrix
correlation_matrix = netflix_df[['Release Date', 'Hours Viewed']].corr()

# Create a heatmap for the correlation between release date and hours viewed using Plotly
fig = px.imshow(correlation_matrix, color_continuous_scale='viridis',
                title='Correlation Between Release Date and Hours Viewed')
fig.show()


**Can we visualize the distribution of hours viewed for titles available globally and those not available globally separately?**

In [14]:
# Create separate histograms for hours viewed based on title availability using Plotly
fig = px.histogram(netflix_df, x='Hours Viewed', color='Available Globally?',
                   barmode='overlay', nbins=20,
                   title='Distribution of Hours Viewed for Titles Available Globally and Not Available Globally',
                   labels={'Hours Viewed': 'Hours Viewed', 'Available Globally?': 'Title Availability'},
                   color_discrete_map={'Yes': 'lightgreen', 'No': 'lightcoral'})
fig.show()


**What is the distribution of release months for titles available globally and those not available globally?**

In [15]:
# Create separate bar charts for the distribution of release months based on title availability using Plotly
fig = px.bar(netflix_df, x='Release Month', color='Available Globally?', barmode='group',
             title='Distribution of Release Months for Titles Available Globally and Not Available Globally',
             labels={'Release Month': 'Release Month', 'Available Globally?': 'Title Availability'})
fig.update_xaxes(tickangle=45)
fig.show()


**How does the total hours viewed vary across different release months?**

In [16]:
# Calculate total hours viewed for each release month
total_hours_by_month = netflix_df.groupby('Release Month')['Hours Viewed'].sum().sort_index()

# Create a bar chart for the total hours viewed across different release months using Plotly
fig = px.bar(total_hours_by_month, x=total_hours_by_month.index, y='Hours Viewed',
             title='Total Hours Viewed Across Different Release Months',
             labels={'Hours Viewed': 'Total Hours Viewed', 'index': 'Release Month'})
fig.update_xaxes(tickangle=45)
fig.show()


**Can we visualize the top 5 titles with the highest total hours viewed?**

In [17]:
# Calculate total hours viewed for each title
total_hours_viewed = netflix_df.groupby('Title')['Hours Viewed'].sum().sort_values(ascending=False).head(5)

# Create a horizontal bar chart for the top 5 titles with the highest total hours viewed using Plotly
fig = px.bar(total_hours_viewed, y=total_hours_viewed.index, x='Hours Viewed',
             orientation='h', title='Top 5 Titles with Highest Total Hours Viewed',
             labels={'Hours Viewed': 'Total Hours Viewed', 'index': 'Title'})
fig.show()


**Can we visualize the trend of total hours viewed for titles available globally compared to those not available globally over time?**

In [18]:
# Calculate total hours viewed for each title and title availability
total_hours_by_date_and_global = netflix_df.groupby(['Release Date', 'Available Globally?'])['Hours Viewed'].sum().reset_index()

# Create a line plot for the trend of total hours viewed over time, distinguishing by title availability using Plotly
fig = px.line(total_hours_by_date_and_global, x='Release Date', y='Hours Viewed',
              color='Available Globally?', markers=True,
              title='Trend of Total Hours Viewed Over Time for Titles Available Globally and Not Available Globally',
              labels={'Hours Viewed': 'Total Hours Viewed', 'Release Date': 'Release Date'})
fig.update_xaxes(tickangle=45)
fig.show()


**What is the distribution of release months for the top 10 titles with the highest total hours viewed?**

In [19]:
# Get the top 10 titles with the highest total hours viewed
top10_titles = netflix_df.groupby('Title')['Hours Viewed'].sum().sort_values(ascending=False).head(10).index

# Create a bar chart for the distribution of release months for the top 10 titles using Plotly
fig = px.bar(netflix_df[netflix_df['Title'].isin(top10_titles)], x='Release Month',
             title='Distribution of Release Months for Top 10 Titles with Highest Total Hours Viewed',
             labels={'Release Month': 'Release Month', 'Title': 'Title'},
             color='Title', barmode='group')
fig.update_xaxes(tickangle=45)
fig.show()


**Can we visualize the trend of average hours viewed per title over time?**

**How does the distribution of hours viewed vary across different release months?**

In [20]:
# Create a scatter plot for the distribution of hours viewed across different release months using Plotly
fig = px.scatter(netflix_df, x='Release Month', y='Hours Viewed', color='Release Month',
                 title='Distribution of Hours Viewed Across Different Release Months',
                 labels={'Hours Viewed': 'Hours Viewed', 'Release Month': 'Release Month'})
fig.update_xaxes(tickangle=45)
fig.show()
