In [6]:
# Colab-specific upload step: the saved output below shows Netflix.csv being
# stored under that name, which is the relative path the loading cell reads.
from google.colab import files
# `uploaded` keeps the return value of files.upload(); no visible cell reads it again.
uploaded = files.upload()

Saving Netflix.csv to Netflix.csv


In [52]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# Make the white plotly theme the default for every figure created afterwards.
pio.templates.default = "plotly_white"

# Read the uploaded dataset from the working directory.
csv_path = "Netflix.csv"
netflix_data = pd.read_csv(csv_path)

# Preview the first rows to confirm that the file parsed as expected.
netflix_data.head()

Unnamed: 0,Title,Type,Genre,Release Year,Rating,Runtime (min),Country,Language,Director
0,Stranger Things,TV Show,"Drama, Fantasy",2016,TV-14,50,USA,English,The Duffer Bros
1,The Crown,TV Show,"Drama, Biography",2016,TV-MA,58,UK,English,Peter Morgan
2,The Irishman,Movie,"Crime, Drama",2019,R,209,USA,English,Martin Scorsese
3,The Witcher,TV Show,"Drama, Fantasy",2019,TV-MA,60,USA,English,Lauren Schmidt Hissrich
4,Bird Box,Movie,"Thriller, Horror",2018,R,124,USA,English,Susanne Bier


In [17]:
# Normalise the header: strip stray whitespace so columns can be looked up by exact name.
netflix_data.columns = netflix_data.columns.str.strip()

# Show the cleaned column names so a typo or casing difference is easy to spot.
print(netflix_data.columns)

# Locate the runtime column. Prefer the exact name; otherwise accept the first
# column whose name contains it, ignoring case.
runtime_col = 'Runtime (min)'
if runtime_col not in netflix_data.columns:
    closest_match = next(
        (col for col in netflix_data.columns if runtime_col.lower() in str(col).lower()),
        None,
    )
    if closest_match:
        print(f"Using column '{closest_match}' instead of 'Runtime (min)'")
        # Rename the match so that later cells, which index by 'Runtime (min)',
        # keep working instead of raising KeyError.
        netflix_data = netflix_data.rename(columns={closest_match: runtime_col})
    else:
        print("Could not find a suitable column for 'Runtime (min)'")

if runtime_col in netflix_data.columns:
    # Remove thousands separators, then convert to float. Entries that cannot be
    # parsed become NaN (errors='coerce') rather than aborting the cell, which
    # matches how the runtime column is converted again further down.
    netflix_data[runtime_col] = pd.to_numeric(
        netflix_data[runtime_col].replace(',', '', regex=True),
        errors='coerce',
    ).astype(float)
    # Display the first few rows to confirm the conversion.
    print(netflix_data[['Title', runtime_col]].head())
else:
    # No runtime column available: fall back to a plain preview of the frame.
    print(netflix_data.head())

Index(['Title', 'Type', 'Genre', 'Release Year', 'Rating', 'Runtime (min)',
       'Country', 'Language', 'Director'],
      dtype='object')
             Title  Runtime (min)
0  Stranger Things           50.0
1        The Crown           58.0
2     The Irishman          209.0
3      The Witcher           60.0
4         Bird Box          124.0


In [21]:
# Re-print the column labels (the same check the cleaning cell performs) to
# confirm the exact names that the plotting cells below index by.
print(netflix_data.columns)


Index(['Title', 'Type', 'Genre', 'Release Year', 'Rating', 'Runtime (min)',
       'Country', 'Language', 'Director'],
      dtype='object')


In [42]:
# Count titles per country and keep the ten most frequent countries.
country_count = netflix_data['Country'].value_counts().head(10)

# Bar chart: one bar per country, height = number of titles.
# `go` comes from the notebook's import cell; it is not re-imported here so that
# all dependencies stay declared in one place.
fig = go.Figure(data=[
    go.Bar(
        x=country_count.index,
        y=country_count.values,
        marker_color='lightgreen'
    )
])

fig.update_layout(
    title='Top 10 Countries by Content Count',
    xaxis_title='Country',
    yaxis_title='Content Count',
    xaxis_tickangle=45,  # slant the country labels so they do not overlap
    height=600,
    width=1000
)

fig.show()


In [43]:
# Cross-tabulate titles by rating (rows) and content type (columns).
# fill_value=0 turns rating/type pairs that never occur into real zero counts
# instead of NaN, and the reindex guarantees that both plotted columns exist
# even when the data contains only one content type (avoids a KeyError below).
content_types = ['Movie', 'TV Show']
rating_type_count = (
    netflix_data.groupby(['Rating', 'Type'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=content_types, fill_value=0)
)

# Stacked bar chart: one bar per rating, split into movies and TV shows.
fig = go.Figure(data=[
    go.Bar(
        x=rating_type_count.index,
        y=rating_type_count['Movie'],
        name='Movies',
        marker_color='lightcoral'
    ),
    go.Bar(
        x=rating_type_count.index,
        y=rating_type_count['TV Show'],
        name='TV Shows',
        marker_color='lightblue'
    )
])

fig.update_layout(
    title='Content Distribution by Rating and Type (Movie/TV Show)',
    xaxis_title='Rating',
    yaxis_title='Content Count',
    barmode='stack',
    height=600,
    width=1000
)

fig.show()


In [44]:
# Tally titles per director, then keep the five most frequent directors.
titles_per_director = netflix_data['Director'].value_counts()
director_count = titles_per_director.head(5)

# One bar per director, height = number of titles.
director_bars = go.Bar(
    x=director_count.index,
    y=director_count.values,
    marker_color='lightblue',
)
fig = go.Figure(data=[director_bars])

# Titles, label angle and canvas size, applied in a single update.
director_layout = dict(
    title='Top 5 Directors with the Most Content on Netflix',
    xaxis_title='Director',
    yaxis_title='Content Count',
    xaxis_tickangle=45,
    height=600,
    width=1000,
)
fig.update_layout(**director_layout)

fig.show()


In [45]:
# Make sure the runtime column is numeric; values that cannot be parsed become NaN.
runtime_numeric = pd.to_numeric(netflix_data['Runtime (min)'], errors='coerce')
netflix_data['Runtime (min)'] = runtime_numeric

# Mean runtime for each content type.
runtime_grouped_by_type = netflix_data.groupby('Type')['Runtime (min)']
average_runtime_by_type = runtime_grouped_by_type.mean()

# One bar per content type, height = mean runtime in minutes.
type_runtime_bars = go.Bar(
    x=average_runtime_by_type.index,
    y=average_runtime_by_type.values,
    marker_color='lightgreen',
)
fig = go.Figure(data=[type_runtime_bars])

type_runtime_layout = dict(
    title='Average Runtime by Content Type (Movie/TV Show)',
    xaxis_title='Content Type',
    yaxis_title='Average Runtime (min)',
    height=500,
    width=800,
)
fig.update_layout(**type_runtime_layout)

fig.show()


In [46]:
# Tally titles per language, then keep the ten most frequent languages.
titles_per_language = netflix_data['Language'].value_counts()
language_count = titles_per_language.head(10)

# One bar per language, height = number of titles.
language_bars = go.Bar(
    x=language_count.index,
    y=language_count.values,
    marker_color='lightcoral',
)
fig = go.Figure(data=[language_bars])

language_layout = dict(
    title='Top 10 Languages by Content Count',
    xaxis_title='Language',
    yaxis_title='Content Count',
    xaxis_tickangle=45,
    height=600,
    width=1000,
)
fig.update_layout(**language_layout)

fig.show()


In [47]:
# Tally titles per release year and order the result chronologically.
titles_per_year = netflix_data['Release Year'].value_counts()
release_year_count = titles_per_year.sort_index()

# One bar per year, height = number of titles released that year.
release_year_bars = go.Bar(
    x=release_year_count.index,
    y=release_year_count.values,
    marker_color='skyblue',
)
fig = go.Figure(data=[release_year_bars])

release_year_layout = dict(
    title='Content Releases Over Time by Year',
    xaxis_title='Release Year',
    yaxis_title='Content Count',
    height=500,
    width=800,
)
fig.update_layout(**release_year_layout)

fig.show()


In [48]:
# Cross-tabulate titles by genre (rows) and content type (columns).
# fill_value=0 records genre/type pairs that never occur as zero counts instead
# of NaN, and the reindex guarantees that both plotted columns exist even when
# the data contains only one content type (avoids a KeyError below).
content_types = ['Movie', 'TV Show']
type_genre_count = (
    netflix_data.groupby(['Genre', 'Type'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=content_types, fill_value=0)
)

# Stacked bar chart: one bar per genre value, split into movies and TV shows.
fig = go.Figure(data=[
    go.Bar(
        x=type_genre_count.index,
        y=type_genre_count['Movie'],
        name='Movies',
        marker_color='lightcoral'
    ),
    go.Bar(
        x=type_genre_count.index,
        y=type_genre_count['TV Show'],
        name='TV Shows',
        marker_color='lightblue'
    )
])

fig.update_layout(
    title='Content Types (Movies/TV Shows) by Genre',
    xaxis_title='Genre',
    yaxis_title='Content Count',
    barmode='stack',
    height=600,
    width=1000
)

fig.show()


In [49]:
# Count titles per release year (rows) and rating (columns).
# fill_value=0 matters for the line chart: a year with no titles of a given
# rating has a count of zero, not a missing value. Left as NaN, the gaps would
# split each line into disconnected segments and isolated markers.
rating_year_count = (
    netflix_data.groupby(['Release Year', 'Rating'])
    .size()
    .unstack(fill_value=0)
)

# One line per rating, tracing its yearly title count.
fig = go.Figure()

for rating in rating_year_count.columns:
    fig.add_trace(
        go.Scatter(
            x=rating_year_count.index,
            y=rating_year_count[rating],
            mode='lines+markers',
            name=rating
        )
    )

fig.update_layout(
    title='Content Ratings Over Time',
    xaxis_title='Release Year',
    yaxis_title='Content Count',
    height=600,
    width=1000,
    legend_title='Rating'
)

fig.show()


In [50]:
# Mean runtime for each content rating.
runtime_grouped_by_rating = netflix_data.groupby('Rating')['Runtime (min)']
average_runtime_by_rating = runtime_grouped_by_rating.mean()

# One bar per rating, height = mean runtime in minutes.
rating_runtime_bars = go.Bar(
    x=average_runtime_by_rating.index,
    y=average_runtime_by_rating.values,
    marker_color='orange',
)
fig = go.Figure(data=[rating_runtime_bars])

rating_runtime_layout = dict(
    title='Average Runtime by Content Rating',
    xaxis_title='Rating',
    yaxis_title='Average Runtime (min)',
    height=500,
    width=800,
)
fig.update_layout(**rating_runtime_layout)

fig.show()


In [51]:
# Tally titles per distinct Genre value, then keep the ten most frequent.
# NOTE(review): the preview rows show comma-joined values such as
# "Drama, Fantasy", so each bar is a genre combination, not a single genre.
# Confirm whether the column should be split before counting.
titles_per_genre = netflix_data['Genre'].value_counts()
genre_count = titles_per_genre.head(10)

# One bar per Genre value, height = number of titles.
genre_bars = go.Bar(
    x=genre_count.index,
    y=genre_count.values,
    marker_color='lightblue',
)
fig = go.Figure(data=[genre_bars])

genre_layout = dict(
    title='Top 10 Genres by Content Count',
    xaxis_title='Genre',
    yaxis_title='Content Count',
    xaxis_tickangle=45,
    height=600,
    width=1000,
)
fig.update_layout(**genre_layout)

fig.show()
