In [1]:
# Upload your Netflix dataset in Google Colab
from google.colab import files
import io
import pandas as pd

# Prompt the user to upload the CSV file
uploaded = files.upload()

# Get the uploaded file name
file_name = list(uploaded.keys())[0]

# Read the CSV into a pandas DataFrame
df = pd.read_csv(io.BytesIO(uploaded[file_name]))

# Display the first few rows
print(f"\n✅ Successfully loaded: {file_name}\n")
df.head()


Saving Netflix Dataset.csv to Netflix Dataset.csv

✅ Successfully loaded: Netflix Dataset.csv



Unnamed: 0,Show_Id,Category,Title,Director,Cast,Country,Release_Date,Rating,Duration,Type,Description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,07:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [11]:
# ==============================
# Netflix Dataset Analysis Project
# ==============================

# 📘 Project: Content Trends Analysis of Netflix Catalog (2008–2021)
# 📁 Dataset: Netflix Dataset.csv
# 👩‍💻 Author: Shivani Dubey

# ==============================
# 1️⃣ Install & Import Libraries
# ==============================

# Uncomment the following line if any library is missing
# !pip install plotly pandas matplotlib numpy --quiet

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from datetime import datetime
import os

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 300)

# ==============================
# 2️⃣ Load Dataset
# ==============================

from google.colab import files

print("If your file is not uploaded, use the upload box below ⬇️")
# uploaded = files.upload()   # Uncomment if running manually

DATA_PATH = "Netflix Dataset.csv"  # your uploaded file name

if not os.path.exists(DATA_PATH):
    print(f"❌ File '{DATA_PATH}' not found. Please upload it using the upload box above.")
else:
    df = pd.read_csv(DATA_PATH)
    print("✅ Dataset loaded successfully!")
    print("Shape:", df.shape)
    display(df.head())
    display(df.info())

# ==============================
# 3️⃣ Preprocessing
# ==============================

def parse_duration(x):
    """Convert '90 min' -> 90 or '1 Season' -> 1"""
    try:
        if pd.isna(x):
            return np.nan
        s = str(x).strip()
        if 'min' in s:
            return int(s.replace('min', '').strip())
        if 'Season' in s:
            return int(s.split()[0])
        return np.nan
    except:
        return np.nan

def explode_multivalue_column(df, col, sep=','):
    """Explode multivalued columns like genres or countries"""
    s = df[col].fillna('').astype(str).str.split(sep)
    s = s.apply(lambda lst: [i.strip() for i in lst if i.strip() != ''])
    return df.assign(**{col: s}).explode(col)

def preprocess(df):
    df = df.copy()
    df.columns = [c.strip() for c in df.columns]

    # Convert date_added to datetime
    if 'date_added' in df.columns:
        df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')
        df['added_year'] = df['date_added'].dt.year
    else:
        df['added_year'] = np.nan

    # Convert release_year
    if 'release_year' in df.columns:
        df['release_year'] = pd.to_numeric(df['release_year'], errors='coerce')

    # Parse duration
    if 'duration' in df.columns:
        df['duration_parsed'] = df['duration'].apply(parse_duration)

    df = df.replace({'': np.nan})
    return df

df = preprocess(df)
print("✅ Preprocessing completed.")
display(df.head())

# ==============================
# 4️⃣ Movies vs TV Shows Over Time
# ==============================

print("🎬 Type Distribution:")
display(df['Category'].value_counts())

if 'added_year' in df.columns and 'Category' in df.columns:
    yearly = df.groupby(['added_year', 'Category']).size().reset_index(name='count')
    fig = px.line(yearly, x='added_year', y='count', color='Category',
                  title="Movies vs TV Shows Over the Years", markers=True)
    fig.show()
else:
    print("⚠️ Columns 'added_year' or 'Category' not found. Please ensure 'date_added' and 'Category' exists in dataset.")

# ==============================
# 5️⃣ Genre Analysis
# ==============================

if 'Type' in df.columns:
    df_genre = explode_multivalue_column(df, 'Type', sep=',')
    top_genres = df_genre['Type'].value_counts().head(15)
    print("🎭 Top Genres:")
    display(top_genres)

    # Genre trends
    if 'added_year' in df_genre.columns:
        genre_year = df_genre.groupby(['added_year', 'Type']).size().reset_index(name='count')
        top10 = top_genres.index[:10].tolist()
        genre_year_top10 = genre_year[genre_year['Type'].isin(top10)]
        fig = px.area(genre_year_top10, x='added_year', y='count', color='Type',
                      title='Top 10 Genre Trends Over Time')
        fig.show()
else:
    print("⚠️ Column 'Type' (genres) not found.")

# ==============================
# 6️⃣ Country Analysis
# ==============================

if 'Country' in df.columns:
    df_country = explode_multivalue_column(df, 'Country', sep=',')
    top_countries = df_country['Country'].value_counts().head(20)
    print("🌍 Top Contributing Countries:")
    display(top_countries)

    top_countries_df = pd.DataFrame({'country': top_countries.index, 'count': top_countries.values})
    fig = px.bar(top_countries_df,
                 x='count', y='country', orientation='h', title='Top Countries by Number of Titles')
    fig.update_layout(yaxis={'categoryorder':'total ascending'})
    fig.show()
else:
    print("⚠️ Column 'Country' not found in dataset.")

# ==============================
# 7️⃣ Ratings vs Duration
# ==============================

if 'duration_parsed' in df.columns and 'Rating' in df.columns:
    fig = px.box(df.dropna(subset=['duration_parsed', 'Rating']),
                 x='Rating', y='duration_parsed',
                 title='Duration Distribution by Rating')
    fig.update_layout(xaxis_title='Rating', yaxis_title='Duration (minutes/seasons)')
    fig.show()
else:
    print("⚠️ Columns 'duration_parsed' or 'Rating' missing.")

# ==============================
# 8️⃣ Save Cleaned Dataset
# ==============================

output_file = "cleaned_netflix_dataset.csv"
df.to_csv(output_file, index=False)
print(f"✅ Cleaned dataset saved as '{output_file}'")

try:
    from google.colab import files
    files.download(output_file)
except:
    print("Download feature works only in Colab interactive mode.")

# ==============================
# 9️⃣ Optional: Genre Diversity Analysis
# ==============================

from math import log2

if 'Type' in df.columns and 'added_year' in df.columns:
    df_genre = explode_multivalue_column(df, 'Type')
    pivot = df_genre.groupby(['added_year','Type']).size().unstack(fill_value=0)
    def shannon_entropy(counts):
        total = counts.sum()
        p = counts / total
        return - (p * np.log2(p + 1e-12)).sum()

    diversity = pivot.apply(shannon_entropy, axis=1).reset_index(name='genre_diversity')
    fig = px.line(diversity, x='added_year', y='genre_diversity', title='Genre Diversity Over Years')
    fig.show()

print("🎯 Netflix Dataset Analysis Completed Successfully!")

If your file is not uploaded, use the upload box below ⬇️
✅ Dataset loaded successfully!
Shape: (7789, 11)


Unnamed: 0,Show_Id,Category,Title,Director,Cast,Country,Release_Date,Rating,Duration,Type,Description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, Rodolfo Valente, Vaneza Oliveira, Rafael Lozano, Viviane Porto, Mel Fronckowiak, Sergio Mamberti, Zezé Motta, Celso Frateschi",Brazil,"August 14, 2020",TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi & Fantasy","In a future where the elite inhabit an island paradise far from the crowded slums, you get one chance to join the 3% saved from squalor."
1,s2,Movie,07:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, Azalia Ortiz, Octavio Michel, Carmen Beato",Mexico,"December 23, 2016",TV-MA,93 min,"Dramas, International Movies","After a devastating earthquake hits Mexico City, trapped survivors from all walks of life wait to be rescued while trying desperately to stay alive."
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence Koh, Tommy Kuan, Josh Lai, Mark Lee, Susan Leong, Benjamin Lim",Singapore,"December 20, 2018",R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow soldiers are forced to confront a terrifying secret that's haunting their jungle island training camp."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly, Christopher Plummer, Crispin Glover, Martin Landau, Fred Tatasciore, Alan Oppenheimer, Tom Kane",United States,"November 16, 2017",PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi & Fantasy","In a postapocalyptic world, rag-doll robots hide in fear from dangerous machines out to exterminate them, until a brave newcomer joins the group."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aaron Yoo, Liza Lapira, Jacob Pitts, Laurence Fishburne, Jack McGee, Josh Gad, Sam Golzari, Helen Carey, Jack Gilpin",United States,"January 1, 2020",PG-13,123 min,Dramas,A brilliant group of students become card-counting experts with the intent of swindling millions out of Las Vegas casinos by playing blackjack.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7789 entries, 0 to 7788
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Show_Id       7789 non-null   object
 1   Category      7789 non-null   object
 2   Title         7789 non-null   object
 3   Director      5401 non-null   object
 4   Cast          7071 non-null   object
 5   Country       7282 non-null   object
 6   Release_Date  7779 non-null   object
 7   Rating        7782 non-null   object
 8   Duration      7789 non-null   object
 9   Type          7789 non-null   object
 10  Description   7789 non-null   object
dtypes: object(11)
memory usage: 669.5+ KB


None

✅ Preprocessing completed.


Unnamed: 0,Show_Id,Category,Title,Director,Cast,Country,Release_Date,Rating,Duration,Type,Description,added_year
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, Rodolfo Valente, Vaneza Oliveira, Rafael Lozano, Viviane Porto, Mel Fronckowiak, Sergio Mamberti, Zezé Motta, Celso Frateschi",Brazil,"August 14, 2020",TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi & Fantasy","In a future where the elite inhabit an island paradise far from the crowded slums, you get one chance to join the 3% saved from squalor.",
1,s2,Movie,07:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, Azalia Ortiz, Octavio Michel, Carmen Beato",Mexico,"December 23, 2016",TV-MA,93 min,"Dramas, International Movies","After a devastating earthquake hits Mexico City, trapped survivors from all walks of life wait to be rescued while trying desperately to stay alive.",
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence Koh, Tommy Kuan, Josh Lai, Mark Lee, Susan Leong, Benjamin Lim",Singapore,"December 20, 2018",R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow soldiers are forced to confront a terrifying secret that's haunting their jungle island training camp.",
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly, Christopher Plummer, Crispin Glover, Martin Landau, Fred Tatasciore, Alan Oppenheimer, Tom Kane",United States,"November 16, 2017",PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi & Fantasy","In a postapocalyptic world, rag-doll robots hide in fear from dangerous machines out to exterminate them, until a brave newcomer joins the group.",
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aaron Yoo, Liza Lapira, Jacob Pitts, Laurence Fishburne, Jack McGee, Josh Gad, Sam Golzari, Helen Carey, Jack Gilpin",United States,"January 1, 2020",PG-13,123 min,Dramas,A brilliant group of students become card-counting experts with the intent of swindling millions out of Las Vegas casinos by playing blackjack.,


🎬 Type Distribution:


Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
Movie,5379
TV Show,2410


🎭 Top Genres:


Unnamed: 0_level_0,count
Type,Unnamed: 1_level_1
International Movies,2437
Dramas,2108
Comedies,1472
International TV Shows,1199
Documentaries,786
Action & Adventure,721
TV Dramas,704
Independent Movies,675
Children & Family Movies,532
Romantic Movies,531


🌍 Top Contributing Countries:


Unnamed: 0_level_0,count
Country,Unnamed: 1_level_1
United States,3298
India,990
United Kingdom,723
Canada,412
France,349
Japan,287
Spain,215
South Korea,212
Germany,199
Mexico,154


⚠️ Columns 'duration_parsed' or 'Rating' missing.
✅ Cleaned dataset saved as 'cleaned_netflix_dataset.csv'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

🎯 Netflix Dataset Analysis Completed Successfully!


# Task
Analyze the provided Netflix dataset to identify trends in content additions over time. Load the data from "/tmp/netflix_titles.csv", then process it to extract the month and year of each content addition. Group the data by month and year, counting the number of movies and TV shows added in each period. Finally, visualize the monthly counts of movies and TV shows added to Netflix over time using a time series plot.

## Data loading

### Subtask:
Load the data from "/tmp/netflix_titles.csv" into a dataframe.


**Reasoning**:
The first step is to load the data from the specified CSV file into a pandas DataFrame.



In [12]:
import pandas as pd

df = pd.read_csv('Netflix Dataset.csv')

## Data wrangling

### Subtask:
Add a new column to the dataframe containing the month and year when each title was added to Netflix.


**Reasoning**:
Convert the 'Release_Date' column to datetime objects and then extract the month and year into a new column 'month_added'.



In [5]:
df['Release_Date'] = pd.to_datetime(df['Release_Date'], errors='coerce')
df['month_added'] = df['Release_Date'].dt.strftime('%B %Y')
df.head()

Unnamed: 0,Show_Id,Category,Title,Director,Cast,Country,Release_Date,Rating,Duration,Type,Description,added_year,month_added
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, Rodolfo Valente, Vaneza Oliveira, Rafael Lozano, Viviane Porto, Mel Fronckowiak, Sergio Mamberti, Zezé Motta, Celso Frateschi",Brazil,2020-08-14,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi & Fantasy","In a future where the elite inhabit an island paradise far from the crowded slums, you get one chance to join the 3% saved from squalor.",,August 2020
1,s2,Movie,07:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, Azalia Ortiz, Octavio Michel, Carmen Beato",Mexico,2016-12-23,TV-MA,93 min,"Dramas, International Movies","After a devastating earthquake hits Mexico City, trapped survivors from all walks of life wait to be rescued while trying desperately to stay alive.",,December 2016
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence Koh, Tommy Kuan, Josh Lai, Mark Lee, Susan Leong, Benjamin Lim",Singapore,2018-12-20,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow soldiers are forced to confront a terrifying secret that's haunting their jungle island training camp.",,December 2018
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly, Christopher Plummer, Crispin Glover, Martin Landau, Fred Tatasciore, Alan Oppenheimer, Tom Kane",United States,2017-11-16,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi & Fantasy","In a postapocalyptic world, rag-doll robots hide in fear from dangerous machines out to exterminate them, until a brave newcomer joins the group.",,November 2017
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aaron Yoo, Liza Lapira, Jacob Pitts, Laurence Fishburne, Jack McGee, Josh Gad, Sam Golzari, Helen Carey, Jack Gilpin",United States,2020-01-01,PG-13,123 min,Dramas,A brilliant group of students become card-counting experts with the intent of swindling millions out of Las Vegas casinos by playing blackjack.,,January 2020


## Data analysis

### Subtask:
Group the data by month and year, and count the number of movies and TV shows added in each month.


**Reasoning**:
Filter the DataFrame, group by month and category, count the occurrences, and reset the index.



In [6]:
monthly_counts = df[df['month_added'].notnull()].groupby(['month_added', 'Category']).size().reset_index(name='count')
monthly_counts.head()

Unnamed: 0,month_added,Category,count
0,April 2014,Movie,1
1,April 2014,TV Show,1
2,April 2015,Movie,1
3,April 2015,TV Show,4
4,April 2016,Movie,14


## Data visualization

### Subtask:
Visualize the data as a time series plot showing the monthly counts of movies and TV shows added to Netflix over time.


**Reasoning**:
Create a time series line plot to visualize the monthly content additions by category.



In [7]:
import plotly.express as px

fig = px.line(monthly_counts, x='month_added', y='count', color='Category',
              title="Monthly Content Additions to Netflix by Category")
fig.show()

**Reasoning**:
The plot is generated, but the x-axis is not ordered chronologically. Sort the dataframe by month and year before plotting.



In [8]:
monthly_counts['month_added'] = pd.to_datetime(monthly_counts['month_added'], format='%B %Y')
monthly_counts = monthly_counts.sort_values(by='month_added')

fig = px.line(monthly_counts, x='month_added', y='count', color='Category',
              title="Monthly Content Additions to Netflix by Category")
fig.show()

## Summary:

### Data Analysis Key Findings

*   A new column named `month_added` was successfully created in the DataFrame, containing the month and year each title was added in the format "Month Year".
*   The data was successfully grouped by `month_added` and `Category` (Movie or TV Show), resulting in a count of titles added for each category in each month.
*   The final visualization shows the monthly trend of content additions, clearly differentiating between movies and TV shows over time.

### Insights or Next Steps

*   The time series plot visually confirms trends in content additions, allowing for easy identification of periods with high or low activity for both movies and TV shows.
*   Further analysis could focus on identifying specific months or years with significant spikes or drops in content additions and investigating potential reasons for these trends (e.g., holiday releases, major content deals).
