In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Netflix Trends Analyzer\n",
    "Exploratory Data Analysis (EDA) on Netflix Titles Dataset."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "from collections import Counter\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "# Set plot style\n",
    "sns.set(style='whitegrid')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load Data"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Read the titles CSV (path is relative to the working directory), then preview the first rows\n",
    "df = pd.read_csv(filepath_or_buffer='netflix_titles.csv')\n",
    "df.head(n=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Data Cleaning & Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Inspect column dtypes and per-column missing-value counts.\n",
    "# The null counts are printed explicitly: a bare expression in the middle of a cell\n",
    "# is evaluated but never rendered.\n",
    "df.info()\n",
    "print(df.isnull().sum())\n",
    "\n",
    "# Convert 'date_added' to datetime. Surrounding whitespace is stripped first so that\n",
    "# padded date strings (if any) parse with the same inferred format as the rest.\n",
    "# The dtype guard keeps the cell safe to re-run once the column is already datetime.\n",
    "if not pd.api.types.is_datetime64_any_dtype(df['date_added']):\n",
    "    df['date_added'] = pd.to_datetime(df['date_added'].str.strip())\n",
    "\n",
    "# Replace missing 'country', 'director' and 'cast' entries with an explicit placeholder\n",
    "fill_columns = ['country', 'director', 'cast']\n",
    "df[fill_columns] = df[fill_columns].fillna('Unknown')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "# Key EDA Questions\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. How has Netflix's content grown over the years?"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Extract year from 'date_added' and count titles per year\n",
    "df['year_added'] = df['date_added'].dt.year\n",
    "content_per_year = df.groupby('year_added').size()\n",
    "# .dt.year is float when any date is missing (NaT); groupby drops those rows, so the\n",
    "# remaining keys can be cast to int and the bars are labelled 2019 rather than 2019.0\n",
    "content_per_year.index = content_per_year.index.astype(int)\n",
    "fig, ax = plt.subplots(figsize=(10, 5))\n",
    "content_per_year.plot(kind='bar', ax=ax)\n",
    "ax.set_title(\"Netflix Content Growth Over Years\")\n",
    "ax.set_xlabel(\"Year Added\")\n",
    "ax.set_ylabel(\"Number of Titles\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Which countries contribute most content?"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Tally every country named in the comma-separated 'country' field.\n",
    "# Blank fragments (e.g. from a stray comma) and the 'Unknown' fill value are skipped\n",
    "# so that neither can show up in the ranking as if it were a country.\n",
    "country_names = (\n",
    "    name.strip()\n",
    "    for entry in df['country'].dropna()\n",
    "    for name in entry.split(',')\n",
    ")\n",
    "country_counts = Counter(name for name in country_names if name and name != 'Unknown')\n",
    "top_countries = country_counts.most_common(10)\n",
    "countries, counts = zip(*top_countries)\n",
    "fig, ax = plt.subplots(figsize=(10, 5))\n",
    "sns.barplot(x=list(counts), y=list(countries), ax=ax)\n",
    "ax.set_title(\"Top 10 Countries by Content Count\")\n",
    "ax.set_xlabel(\"Number of Titles\")\n",
    "ax.set_ylabel(\"Country\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. What are the top 10 most frequent genres?"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Tally every genre label found in the comma-separated 'listed_in' field\n",
    "genre_counts = Counter(\n",
    "    label.strip()\n",
    "    for listing in df['listed_in'].dropna()\n",
    "    for label in listing.split(',')\n",
    ")\n",
    "top_genres = genre_counts.most_common(10)\n",
    "genres, counts = zip(*top_genres)\n",
    "\n",
    "# Horizontal bar chart: one bar per genre, length = number of titles\n",
    "plt.figure(figsize=(10, 5))\n",
    "genre_axes = sns.barplot(x=list(counts), y=list(genres))\n",
    "genre_axes.set_title(\"Top 10 Genres\")\n",
    "genre_axes.set_xlabel(\"Number of Titles\")\n",
    "genre_axes.set_ylabel(\"Genre\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Who are the most featured actors/directors?"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "def _count_names(entries, placeholder='Unknown'):\n",
    "    # Tally the individual names found in comma-separated strings, ignoring blank\n",
    "    # fragments and the placeholder that was used to fill missing values.\n",
    "    names = (name.strip() for entry in entries for name in entry.split(','))\n",
    "    return Counter(name for name in names if name and name != placeholder)\n",
    "\n",
    "\n",
    "# Top 10 actors. 'Unknown' is the fill value for titles without cast data, so it is\n",
    "# excluded instead of being ranked as if it were a person.\n",
    "actor_counts = _count_names(df['cast'].dropna())\n",
    "top_actors = actor_counts.most_common(10)\n",
    "actors, counts = zip(*top_actors)\n",
    "fig, ax = plt.subplots(figsize=(10, 5))\n",
    "sns.barplot(x=list(counts), y=list(actors), ax=ax)\n",
    "ax.set_title(\"Top 10 Actors\")\n",
    "ax.set_xlabel(\"Number of Titles\")\n",
    "ax.set_ylabel(\"Actor\")\n",
    "plt.show()\n",
    "\n",
    "# Top 10 directors (same rules: names are stripped before the placeholder check)\n",
    "director_counts = _count_names(df['director'].dropna())\n",
    "top_directors = director_counts.most_common(10)\n",
    "directors, counts = zip(*top_directors)\n",
    "fig, ax = plt.subplots(figsize=(10, 5))\n",
    "sns.barplot(x=list(counts), y=list(directors), ax=ax)\n",
    "ax.set_title(\"Top 10 Directors\")\n",
    "ax.set_xlabel(\"Number of Titles\")\n",
    "ax.set_ylabel(\"Director\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. How long is the average content (duration)?"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Separate movies and TV shows. .copy() gives each subset its own data, so the new\n",
    "# columns below are added without a SettingWithCopyWarning.\n",
    "movies = df[df['type'] == 'Movie'].copy()\n",
    "shows = df[df['type'] == 'TV Show'].copy()\n",
    "# The first run of digits in 'duration' is read as minutes for movies;\n",
    "# expand=False makes str.extract return a Series rather than a one-column DataFrame\n",
    "movies['minutes'] = movies['duration'].str.extract(r'(\\d+)', expand=False).astype(float)\n",
    "# ... and as the number of seasons for TV shows\n",
    "shows['seasons'] = shows['duration'].str.extract(r'(\\d+)', expand=False).astype(float)\n",
    "print('Average movie duration (minutes):', movies['minutes'].mean())\n",
    "print('Average number of seasons (TV Shows):', shows['seasons'].mean())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Is there a trend in content addition by month/year?"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Count titles per (year, month) of 'date_added' and pivot months into columns.\n",
    "# Relies on the 'year_added' column created in the yearly-growth cell above.\n",
    "df['month_added'] = df['date_added'].dt.month\n",
    "trend = df.groupby(['year_added', 'month_added']).size().unstack(fill_value=0)\n",
    "# Year/month come out as floats when any date is missing; rows with a missing key are\n",
    "# dropped by groupby, so both axes can be cast to int for clean tick labels\n",
    "trend.index = trend.index.astype(int)\n",
    "trend.columns = trend.columns.astype(int)\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "sns.heatmap(trend, cmap='YlGnBu', ax=ax)\n",
    "ax.set_title(\"Content Added by Year and Month\")\n",
    "ax.set_xlabel(\"Month\")\n",
    "ax.set_ylabel(\"Year\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. What are the most common content ratings (TV-MA, PG, etc.)?"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Ten most frequent values of 'rating' (value_counts sorts by frequency, descending)\n",
    "rating_counts = df['rating'].value_counts().head(10)\n",
    "\n",
    "# Horizontal bar chart: one bar per rating, length = number of titles\n",
    "plt.figure(figsize=(10, 5))\n",
    "rating_axes = sns.barplot(x=rating_counts.values, y=rating_counts.index)\n",
    "rating_axes.set_title(\"Top 10 Content Ratings\")\n",
    "rating_axes.set_xlabel(\"Number of Titles\")\n",
    "rating_axes.set_ylabel(\"Rating\")\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
