**Import libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile

**Extract CSV from ZIP**

In [None]:
# Unpack every member of the downloaded archive into the current working
# directory so the CSV can be read by path in the next cell.
with zipfile.ZipFile("netflix_titles.csv.zip", mode="r") as archive:
    archive.extractall(path=None)

**Load dataset**

In [None]:
# Read the extracted CSV into the working DataFrame used by every later cell.
csv_path = "netflix_titles.csv"
df = pd.read_csv(csv_path)

 **Basic Info**

In [None]:
# Report the dataset dimensions, then the per-column dtypes and non-null counts.
print("Shape of dataset:", df.shape)
print("\n--- Column Info ---")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

**Missing Value Analysis**

In [None]:
# Count the missing (NaN/None) entries in each column.
print("\n--- Missing Values ---")
missing_per_column = df.isna().sum()
print(missing_per_column)

**Handle duplicates (if any)**

In [None]:
# Report how many rows are exact copies of an earlier row, then keep only the
# first occurrence of each.
duplicate_count = df.duplicated().sum()
print("\nDuplicate Rows:", duplicate_count)
# Rebind rather than mutate with inplace=True: the resulting rows are the
# same, but the statement stays chainable and does not silently alter an
# object that other references may still point to.
df = df.drop_duplicates()


**Descriptive Statistics**

In [None]:
# Summarise every column: count/unique/top/freq for text columns and
# mean/std/quantiles for numeric ones (include='all' covers both kinds).
print("\n--- Summary Statistics ---")
summary_table = df.describe(include='all')
print(summary_table)

**Distribution of Content Type**

In [None]:
# Compare how many titles are movies versus TV shows.
plt.figure(figsize=(6,4))
# seaborn 0.13 deprecates passing `palette` without `hue`, so the x column is
# also mapped to hue. dodge=False keeps each bar centred on its tick, and the
# legend is removed by hand (it would only repeat the x-axis labels); doing it
# this way also works on seaborn releases that predate the `legend` argument.
ax = sns.countplot(data=df, x='type', hue='type', palette='viridis', dodge=False)
type_legend = ax.get_legend()
if type_legend is not None:
    type_legend.remove()
plt.title("Distribution of Movies vs TV Shows")
plt.show()

**Most Common Ratings**

In [None]:
# Bar chart of the ten most frequent values in the `rating` column.
fig, ax = plt.subplots(figsize=(8, 4))
rating_counts = df['rating'].value_counts().head(10)
rating_counts.plot.bar(ax=ax, color='skyblue')
ax.set_title("Top 10 Ratings on Netflix")
ax.set_xlabel("Rating")
ax.set_ylabel("Count")
plt.show()

**Most Common Countries**

In [None]:
plt.figure(figsize=(10,5))
df['country'].value_counts().head(10).plot(kind='bar', color='coral')
plt.title("Top 10 Countries Producing Netflix Content")
plt.xlabel("Country")
plt.ylabel("Number of Titles")
plt.show()

**Number of Titles Released Per Year**

In [None]:
# Line chart of how many titles share each release year, in chronological order.
fig, ax = plt.subplots(figsize=(10, 5))
titles_per_year = df['release_year'].value_counts().sort_index()
titles_per_year.plot.line(ax=ax, color='green')
ax.set_title("Number of Titles Released per Year")
ax.set_xlabel("Release Year")
ax.set_ylabel("Count")
ax.grid(True)
plt.show()

**Split Duration column (Movies vs TV Shows)**

In [None]:
# Split the catalogue by content type and derive a numeric duration for each:
# minutes for movies, number of seasons for TV shows.
# .copy() gives each subset its own data, so adding a column below does not
# trigger SettingWithCopyWarning or depend on view-vs-copy semantics of `df`.
movie_df = df[df['type'] == 'Movie'].copy()
tv_df = df[df['type'] == 'TV Show'].copy()

# Pull out the leading number instead of deleting unit text: this handles
# "90 min", "1 Season" and "3 Seasons" uniformly, and yields NaN (rather than
# raising) for missing or malformed values.
movie_df['duration_num'] = movie_df['duration'].str.extract(r'(\d+)', expand=False).astype(float)
tv_df['duration_num'] = tv_df['duration'].str.extract(r'(\d+)', expand=False).astype(float)

plt.figure(figsize=(8,5))
sns.histplot(movie_df['duration_num'], bins=30, kde=True, color='orange')
plt.title("Distribution of Movie Durations (in minutes)")
plt.xlabel("Duration (min)")
plt.ylabel("Count")
plt.show()

**Top 10 Genres**

In [None]:
# Each title lists several genres in one ", "-separated string; expand them
# to one row per genre before counting so every genre is tallied on its own.
fig, ax = plt.subplots(figsize=(10, 6))
genre_lists = df['listed_in'].dropna().str.split(', ')
all_genres = genre_lists.explode()
top_genres = all_genres.value_counts().head(10)
top_genres.plot.bar(ax=ax, color='purple')
ax.set_title("Top 10 Most Common Genres on Netflix")
ax.set_xlabel("Genre")
ax.set_ylabel("Count")
plt.show()


 **Extract Year Added**

In [None]:
# Parse the date each title was added and chart the number of additions per year.
# The dtype guard keeps the cell safe to re-run: once the column is already
# datetime there is nothing to strip or parse.
if not pd.api.types.is_datetime64_any_dtype(df['date_added']):
    # NOTE(review): some date strings may carry stray leading/trailing
    # whitespace — confirm against the CSV. With errors='coerce' any value
    # that fails to parse silently becomes NaT, so whitespace is stripped
    # first to avoid losing otherwise valid dates.
    df['date_added'] = pd.to_datetime(df['date_added'].str.strip(), errors='coerce')
df['year_added'] = df['date_added'].dt.year

plt.figure(figsize=(10,5))
# year_added is float whenever NaT rows are present; cast the known years to
# int for plotting so tick labels read "2019" rather than "2019.0". Counts are
# unchanged because value_counts() already ignores missing values.
df['year_added'].dropna().astype(int).value_counts().sort_index().plot(kind='bar', color='teal')
plt.title("Number of Titles Added to Netflix Each Year")
plt.xlabel("Year Added")
plt.ylabel("Count")
plt.show()

**Correlation Heatmap (for numeric columns)**

In [None]:
# Pairwise correlation between the numeric columns only, drawn as an
# annotated heatmap.
fig, ax = plt.subplots(figsize=(6, 4))
numeric_columns = df.select_dtypes(include=np.number)
correlations = numeric_columns.corr()
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()