# Netflix Data Analysis

## 1. Introduction
This notebook analyzes the Netflix dataset to understand the platform's content strategy, including the distribution of movies vs. TV shows, rating trends, and the top content-producing countries.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style.
# Order matters: the seaborn "darkgrid" theme is applied first, then the
# matplotlib "dark_background" style sheet is layered on top of it, so for
# any rcParams both of them define, the dark_background values are the ones
# left in effect for every figure below.
sns.set(style="darkgrid")
plt.style.use("dark_background")

## 2. Load Data

In [None]:
# Location of the raw dataset, relative to the notebook's working directory
DATA_PATH = 'netflix1.csv'

# Read the CSV into the working frame and preview its first rows
df = pd.read_csv(DATA_PATH)
df.head()

## 3. Data Cleaning

In [None]:
# Count the missing entries in each column and print the per-column totals
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Text columns: replace missing entries with an explicit placeholder label
for text_col in ('country', 'director'):
    df[text_col] = df[text_col].fillna('Not Given')

# Rating: missing entries take the most frequent rating in the data
most_common_rating = df['rating'].mode()[0]
df['rating'] = df['rating'].fillna(most_common_rating)

# Number of fully duplicated rows (reported only; no rows are removed here)
df.duplicated().sum()

## 4. Feature Engineering

In [None]:
# Parse date_added into datetimes; values that cannot be parsed become NaT
# instead of raising
added_on = pd.to_datetime(df['date_added'], errors='coerce')
df['date_added'] = added_on

# Derive the calendar year and the full month name from the parsed dates
df['year_added'] = added_on.dt.year
df['month_added'] = added_on.dt.month_name()

## 5. Exploratory Data Analysis (EDA)

In [None]:
# 5.1 Movies vs TV Shows
# Share of each content type, drawn as a pie with percentage labels
type_counts = df['type'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
ax.pie(
    type_counts,
    labels=type_counts.index,
    autopct='%1.1f%%',
    colors=['#e50914', '#221f1f'],
)
ax.set_title('Distribution of Movies vs TV Shows')
plt.show()

In [None]:
# 5.2 Content Added Over Years
# year_added is float whenever any date failed to parse (NaT -> NaN), which
# makes the tick labels read "2019.0". Drop the undated rows (countplot would
# skip them anyway) and cast the year to int so the axis shows whole years.
added_by_year = df.dropna(subset=['year_added']).assign(
    year_added=lambda frame: frame['year_added'].astype(int)
)

plt.figure(figsize=(12, 6))
sns.countplot(x='year_added', data=added_by_year, hue='type', palette='viridis')
plt.title('Content Added Over the Years')
plt.xlabel('Year added')
plt.ylabel('Number of titles')
plt.xticks(rotation=45)
plt.show()

In [None]:
# 5.3 Top 10 Countries
# 'Not Given' is the placeholder written into missing country values during
# cleaning; it is not a country, so leave it out of the ranking (the same way
# the directors chart leaves it out).
known_countries = df.loc[df['country'] != 'Not Given', 'country']
top_countries = known_countries.value_counts().index[:10]

plt.figure(figsize=(12, 6))
# `order` restricts the bars to the ten selected countries, most frequent first
sns.countplot(y='country', data=df, order=top_countries, palette='magma')
plt.title('Top 10 Countries Producing Content')
plt.xlabel('Number of titles')
plt.ylabel('Country')
plt.show()

In [None]:
# 5.4 Rating Distribution
# Bars are ordered from the most to the least frequent rating
rating_order = df['rating'].value_counts().index

fig, ax = plt.subplots(figsize=(14, 7))
sns.countplot(x='rating', data=df, order=rating_order, palette='coolwarm', ax=ax)
ax.set_title('Distribution of Ratings')
plt.show()

## 6. Advanced Analysis

In [None]:
# 6.1 Top Genres
# A title can list several genres in one comma-separated string: split them
# into one column per genre, then stack into a single long Series so that
# each (title, genre) pair is counted once.
genre_table = df['listed_in'].str.split(', ', expand=True)
genres = genre_table.stack().reset_index(level=1, drop=True)
top_genres = genres.value_counts().head(20)

fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(y=top_genres.index, x=top_genres.values, palette='Reds_r', ax=ax)
ax.set_title('Top 20 Genres on Netflix')
plt.show()

In [None]:
# 6.2 Top Directors
# Skip the 'Not Given' placeholder so only named directors are ranked
has_director = df['director'] != 'Not Given'
directors = df.loc[has_director, 'director'].value_counts().head(20)

fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(y=directors.index, x=directors.values, palette='Blues_r', ax=ax)
ax.set_title('Top 20 Directors with Most Content')
plt.show()

In [None]:
# 6.3 Duration Analysis

# Pull the leading number out of the duration text.
# - The pattern is a raw string: '\d' in a normal string literal is an
#   invalid escape sequence and triggers a warning on recent Python versions.
# - expand=False returns a Series (one capture group) rather than a
#   one-column DataFrame, which is the right shape to assign to a column.
df['duration_num'] = df['duration'].str.extract(r'(\d+)', expand=False).astype(float)

# Separate Movies and TV Shows (tv_shows is reused by the seasons chart below)
movies = df[df['type'] == 'Movie']
tv_shows = df[df['type'] == 'TV Show']

# Movie Duration Histogram
plt.figure(figsize=(12, 6))
sns.histplot(movies['duration_num'], bins=30, kde=True, color='red')
plt.title('Distribution of Movie Duration (in minutes)')
plt.xlabel('Duration (minutes)')
plt.show()

In [None]:
# TV Show Seasons Count
# Bars are ordered from the most to the least common duration label
season_order = tv_shows['duration'].value_counts().index

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='duration', data=tv_shows, order=season_order, palette='viridis', ax=ax)
ax.set_title('Distribution of TV Show Seasons')
plt.xticks(rotation=90)
plt.show()

In [None]:
# 6.4 Content Heatmap (Month vs Year)

# Calendar order for the heatmap columns
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']

# Rows without a parsed date carry NaN in year_added; groupby would drop them
# anyway, so remove them up front and cast the year to int so the heatmap
# rows are labelled 2019 rather than 2019.0.
dated_titles = df.dropna(subset=['year_added']).assign(
    year_added=lambda frame: frame['year_added'].astype(int)
)

# Count titles per (year, month). fill_value=0 is needed in both steps:
# unstack fills year/month combinations with no titles, and reindex fills
# months that never occur in the data (otherwise they come back as NaN).
monthly_content = (
    dated_titles.groupby(['year_added', 'month_added'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=month_order, fill_value=0)
)

plt.figure(figsize=(15, 10))
sns.heatmap(monthly_content, cmap='YlGnBu', linewidths=.5)
plt.title('Density of Content Added by Month and Year')
plt.xlabel('Month added')
plt.ylabel('Year added')
plt.show()

## 7. Conclusions
- **Movies vs TV Shows**: Netflix has a significantly higher number of Movies compared to TV Shows.
- **Growth**: The amount of content added has increased significantly over the years, with a peak around 2018-2020.
- **Top Countries**: The United States is the leading producer of Netflix content, followed by India and the United Kingdom.
- **Ratings**: The majority of content is rated TV-MA (Mature Audience) and TV-14.
- **Genres**: International Movies, Dramas, and Comedies are the most common genres in the catalog (measured by number of titles, not by viewership).
- **Directors**: Rajiv Chilaka has the highest number of titles in this dataset (note: this may be skewed by children's content or specific regional datasets).
- **Duration**: Most movies are between 80 and 120 minutes long. Most TV shows have only 1 season, which may indicate high turnover or a limited-series format.