In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import bokeh
import altair as alt
from ggplot import ggplot
import folium

# Read the BTS video metadata CSV from the Kaggle input directory.
DATA_PATH = '/kaggle/input/stay-gold-btss-impact-with-yt-videos/bts_videos.csv'
df = pd.read_csv(DATA_PATH)


In [2]:
# Summarise the frame: column names, non-null counts and dtypes.
# NOTE: the captured output shows all four columns loading as `object`
# (text), so durations, view counts and dates need explicit conversion
# before they can be treated as numbers or timestamps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         506 non-null    object
 1   published_at  506 non-null    object
 2   duration      506 non-null    object
 3   view_count    506 non-null    object
dtypes: object(4)
memory usage: 15.9+ KB


In [3]:
# Because every column is text, describe() reports count/unique/top/freq
# rather than numeric statistics. The captured output shows the most common
# 'view_count' value is the placeholder 'No View Count' (3 rows).
df.describe()

Unnamed: 0,title,published_at,duration,view_count
count,506,506,506,506
unique,503,502,353,501
top,BTS (방탄소년단) 'Yet To Come (The Most Beautiful M...,2023-02-09T13:40:00Z,PT16S,No View Count
freq,2,2,19,3


**What is the distribution of video durations in the "Stay Gold: BTS Impact with YT Videos" dataset?**

In [4]:
# Plot the distribution of video durations.
# 'duration' is stored as ISO 8601 text (e.g. 'PT16S'). Binning the raw strings
# would just count distinct labels, so convert to seconds first; unparseable
# values become NaN and are left out of the histogram.
durations_for_hist = df.assign(
    duration=pd.to_timedelta(df['duration'], errors='coerce').dt.total_seconds()
)
fig = px.histogram(durations_for_hist, x='duration', nbins=20, title='Distribution of Video Durations')
fig.update_layout(xaxis_title='Duration (seconds)', yaxis_title='Number of Videos')
fig.show()

**How does the view count vary across different BTS videos?**

In [5]:
# Plot the view count for each video.
# 'view_count' is loaded as text and contains the placeholder 'No View Count';
# convert to numbers (placeholders become NaN) so bar heights are quantitative
# instead of one categorical tick per distinct string.
views_by_video = df.assign(view_count=pd.to_numeric(df['view_count'], errors='coerce'))
fig = px.bar(views_by_video, x='title', y='view_count', title='View Count Variation Across BTS Videos')

# Update layout with height and width
fig.update_layout(
    xaxis_title='Video Title',
    yaxis_title='View Count',
    xaxis=dict(tickangle=90, tickmode='linear'),
    height=1500,  # set the height of the plot
    width=10000    # very wide so that every video title gets its own readable tick
)

fig.show()

**How does the view count correlate with the duration of BTS videos?**

In [6]:
# Scatter view count against video duration.
# Both columns are loaded as text, so convert them on a copy before plotting:
# durations from ISO 8601 (e.g. 'PT16S') to seconds, view counts to numbers.
# Values that cannot be parsed (such as 'No View Count') become NaN and are not drawn.
views_vs_duration = df.assign(
    duration=pd.to_timedelta(df['duration'], errors='coerce').dt.total_seconds(),
    view_count=pd.to_numeric(df['view_count'], errors='coerce'),
)
fig = px.scatter(views_vs_duration, x='duration', y='view_count', title='Correlation Between View Count and Video Duration')
fig.update_layout(xaxis_title='Duration (seconds)', yaxis_title='View Count')
fig.show()

**Can we visualize the distribution of video durations with a box plot?**

In [7]:
# Box plot of video durations.
# Quartiles are only meaningful on numbers, so convert the ISO 8601 duration
# text (e.g. 'PT16S') to seconds; unparseable values become NaN and are ignored.
durations_for_box = df.assign(
    duration=pd.to_timedelta(df['duration'], errors='coerce').dt.total_seconds()
)
fig = px.box(durations_for_box, y='duration', title='Distribution of Video Durations (Box Plot)')
fig.update_layout(yaxis_title='Duration (seconds)')
fig.show()


**What are the top 10 most viewed BTS videos?**

In [8]:
# Identify the top 10 most viewed videos and plot them.
# 'view_count' is text at this point; sorting text is lexicographic ('99' > '1000000'),
# which would pick the wrong videos. Convert to numbers first; rows without a
# count become NaN and sort_values places them last.
views_numeric = df.assign(view_count=pd.to_numeric(df['view_count'], errors='coerce'))
top_10_views = views_numeric.sort_values(by='view_count', ascending=False).head(10)
fig = px.bar(top_10_views, x='title', y='view_count', title='Top 10 Most Viewed BTS Videos')

# Update layout with height and width
fig.update_layout(
    xaxis_title='Video Title',
    yaxis_title='View Count',
    xaxis=dict(tickangle=45, tickmode='linear'),
    height=1000,  # tall figure leaves room for the long, rotated titles
    width=800    # set the width of the plot
)

fig.show()


**How is the engagement level distributed among the videos based on view count?**

In [9]:
# Plot a histogram to show the distribution of view counts.
# Convert the text 'view_count' column to numbers so the values are binned on a
# numeric axis; the 'No View Count' placeholder becomes NaN and is excluded.
views_for_hist = df.assign(view_count=pd.to_numeric(df['view_count'], errors='coerce'))
fig = px.histogram(views_for_hist, x='view_count', nbins=20, title='Distribution of Video View Counts')
fig.update_layout(xaxis_title='View Count', yaxis_title='Number of Videos')
fig.show()


**Is there a correlation between video duration and the number of views for the top 50 videos?**

In [10]:
# Identify the top 50 videos by view count.
# Rank on numeric view counts (text would sort lexicographically) and express
# durations in seconds so both scatter axes are quantitative.
top_50_source = df.assign(
    duration=pd.to_timedelta(df['duration'], errors='coerce').dt.total_seconds(),
    view_count=pd.to_numeric(df['view_count'], errors='coerce'),
)
top_50_views = top_50_source.sort_values(by='view_count', ascending=False).head(50)

# Plot a scatter plot to visualize the correlation between view count and video duration for the top 50 videos
fig = px.scatter(top_50_views, x='duration', y='view_count', title='Correlation Between View Count and Video Duration (Top 50 Videos)')
fig.update_layout(xaxis_title='Duration (seconds)', yaxis_title='View Count')
fig.show()


**How do view counts compare across the 20 most recently published videos, in order of publication date?**

In [11]:
# Identify the 20 most recently published videos.
# Parse timestamps and view counts on a copy so the ordering is chronological
# and the y-axis is numeric; both conversions are no-ops if the columns have
# already been converted.
recent_source = df.assign(
    published_at=pd.to_datetime(df['published_at']),
    view_count=pd.to_numeric(df['view_count'], errors='coerce'),
)
top_20_recent = recent_source.sort_values(by='published_at', ascending=False).head(20)

# Plot a line chart of view count against publication date for those 20 videos
fig = px.line(top_20_recent, x='published_at', y='view_count', title='View Count Evolution Over Time (Top 20 Recently Published Videos)')
fig.update_layout(xaxis_title='Publication Date', yaxis_title='View Count')
fig.show()


**Can we explore the relationship between video duration, view count, and publication date through a 3D scatter plot?**

In [12]:
# Plot a 3D scatter plot to explore the relationship between video duration, view count, and publication date.
# All three columns are loaded as text, so convert each on a copy: durations to
# seconds, view counts to numbers, and publication timestamps to datetimes.
scatter_3d_source = df.assign(
    duration=pd.to_timedelta(df['duration'], errors='coerce').dt.total_seconds(),
    view_count=pd.to_numeric(df['view_count'], errors='coerce'),
    published_at=pd.to_datetime(df['published_at']),
)
fig = px.scatter_3d(scatter_3d_source, x='duration', y='view_count', z='published_at',
                    title='3D Scatter Plot: Duration, View Count, and Publication Date')
fig.update_layout(scene=dict(xaxis_title='Duration (seconds)', yaxis_title='View Count', zaxis_title='Publication Date'))
fig.show()


**How do the top 10 most viewed videos compare in terms of duration?**

In [13]:
# Identify the top 10 most viewed videos and plot a bar chart to compare their durations.
# Rank on numeric view counts (text sorts lexicographically) and convert the
# ISO 8601 duration text to seconds so bar heights are comparable.
duration_rank_source = df.assign(
    duration=pd.to_timedelta(df['duration'], errors='coerce').dt.total_seconds(),
    view_count=pd.to_numeric(df['view_count'], errors='coerce'),
)
top_10_duration = duration_rank_source.sort_values(by='view_count', ascending=False).head(10)
fig = px.bar(top_10_duration, x='title', y='duration', title='Top 10 Most Viewed BTS Videos: Duration Comparison')
fig.update_layout(
    xaxis_title='Video Title',
    yaxis_title='Duration (seconds)',
    xaxis=dict(tickangle=45, tickmode='linear'),
    height=1000,  # tall figure leaves room for the long, rotated titles
    width=800    # Specify the width
)
fig.show()


**How does the average view count vary by the month in which a video was published?**

In [14]:

# Convert 'view_count' to numeric, coerce errors (e.g. 'No View Count') to NaN
df['view_count'] = pd.to_numeric(df['view_count'], errors='coerce')

# Convert 'published_at' to datetime format
df['published_at'] = pd.to_datetime(df['published_at'])

# Extract the month name from the 'published_at' column
df['month'] = df['published_at'].dt.month_name()

# Calculate the average view count for each month (mean() skips NaN)
average_view_count_monthly = df.groupby('month')['view_count'].mean().reset_index()

# groupby sorts month names alphabetically (April, August, ...); force calendar
# order on the axis so the chart reads January through December.
month_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

fig = px.bar(
    average_view_count_monthly,
    x='month',
    y='view_count',
    title='Average View Count Over Different Months',
    category_orders={'month': month_order},
)
fig.update_layout(xaxis_title='Month', yaxis_title='Average View Count')
fig.show()


**Can we visualize the cumulative view count of BTS videos over time?**

In [15]:
# Calculate the cumulative view count over time and plot a line chart
cumulative_view_count = df.sort_values(by='published_at').set_index('published_at')['view_count'].cumsum().reset_index()
fig = px.line(cumulative_view_count, x='published_at', y='view_count', title='Cumulative View Count of BTS Videos Over Time', height=500, width=800)
fig.update_layout(xaxis_title='Publication Date', yaxis_title='Cumulative View Count')
fig.show()


**Explore the distribution of view counts through a histogram, considering a logarithmic scale.**

In [16]:
# Plot a histogram of view counts with a logarithmic count axis.
# log_y=True makes the y-axis (number of videos per bin) logarithmic while the
# x-axis stays linear, so the "(log scale)" note belongs on the y-axis title.
fig = px.histogram(df, x='view_count', nbins=20, title='Distribution of Video View Counts (Log Scale)', height=500, width=800, log_y=True)
fig.update_layout(xaxis_title='View Count', yaxis_title='Number of Videos (log scale)')
fig.show()


**How does the average view count vary by the hour of the day at which a video was published?**

In [17]:
# Hour of day (0-23) at which each video was published
df['hour'] = df['published_at'].dt.hour

# Mean view count per publication hour, as a two-column frame
average_view_count_hourly = df.groupby('hour', as_index=False)['view_count'].mean()

fig = px.bar(
    average_view_count_hourly,
    x='hour',
    y='view_count',
    title='Average View Count Over Different Hours of the Day',
    height=500,
    width=800,
)
fig.update_xaxes(title_text='Hour of the Day')
fig.update_yaxes(title_text='Average View Count')
fig.show()


**Can we explore the relationship between the day of the week and video duration?**

In [18]:
# Convert 'view_count' to numeric, coerce errors to NaN (no-op if already numeric)
df['view_count'] = pd.to_numeric(df['view_count'], errors='coerce')

# Convert 'published_at' to datetime format (no-op if already datetime)
df['published_at'] = pd.to_datetime(df['published_at'])

# Extract the day of the week from the 'published_at' column
df['day_of_week'] = df['published_at'].dt.day_name()

# 'duration' is ISO 8601 text (e.g. 'PT16S'); convert to seconds on a copy so the
# y-axis is numeric and matches its "seconds" label.
duration_by_day = df.assign(
    duration=pd.to_timedelta(df['duration'], errors='coerce').dt.total_seconds()
)

# Show weekdays Monday through Sunday instead of in order of first appearance.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Plot a scatter plot to explore the relationship between the day of the week and video duration
fig = px.scatter(
    duration_by_day,
    x='day_of_week',
    y='duration',
    title='Relationship Between Day of the Week and Video Duration',
    height=500,
    width=800,
    category_orders={'day_of_week': weekday_order},
)
fig.update_layout(xaxis_title='Day of the Week', yaxis_title='Duration (seconds)')
fig.show()


**What is the distribution of view counts for videos published on different days of the week?**

In [19]:
# Plot a box plot to show the distribution of view counts based on the day of the week.
# Without an explicit order the weekday categories appear in the order they
# occur in the data; list them Monday through Sunday so the axis is readable.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
fig = px.box(
    df,
    x='day_of_week',
    y='view_count',
    title='Distribution of View Counts Based on Day of the Week',
    height=500,
    width=800,
    category_orders={'day_of_week': weekday_order},
)
fig.update_layout(xaxis_title='Day of the Week', yaxis_title='View Count')
fig.show()
