# Netflix Analysis — Improved

*Colab-ready. Reads the dataset from the same path as the original notebook: `/content/Netflix Dataset.csv`.*

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Plot defaults: seaborn "whitegrid" style first, then a 10x6 default figure size.
sns.set(style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})


In [None]:
# Read the raw CSV into `df`; the trailing expression echoes its (rows, columns).
DATA_PATH = '/content/Netflix Dataset.csv'
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.shape


In [None]:
# Quick peek: show the first rows (DataFrame.head defaults to five) to eyeball the raw values
df.head()


In [None]:
# Basic info and types: print each column's dtype and non-null count, plus memory usage
df.info()


In [None]:
# Normalize headers to snake_case: trim, lowercase, and turn spaces into underscores.
df.columns = [name.strip().lower().replace(' ', '_') for name in df.columns]
df.columns


In [None]:
# Standardize missing values
# Treat empty and whitespace-only strings (including a lone '\n') as missing.
# The previous exact-match replace only caught '' and '\n', so cells such as
# ' ' or '\t' survived as real values. With regex=True only string cells are
# examined; numeric columns are left untouched.
df = df.replace(r'^\s*$', np.nan, regex=True)
# Rows in which every column is missing carry no information; drop them.
df = df.dropna(how='all')


In [None]:
# Coerce year-like columns to nullable integers; unparseable entries become <NA>.
for col in ('release_year',):
    if col not in df.columns:
        continue
    as_number = pd.to_numeric(df[col], errors='coerce')
    df[col] = as_number.astype('Int64')


In [None]:
# Extract common derived columns
if 'date_added' in df.columns:
    # Trim surrounding whitespace from string values before parsing. Recent
    # pandas versions infer one format for the whole column, and with
    # errors='coerce' a value that deviates only by stray whitespace
    # (e.g. ' August 4, 2017') would silently become NaT.
    # Non-string values (NaN, already-parsed timestamps) pass through as-is.
    raw_dates = df['date_added'].map(
        lambda value: value.strip() if isinstance(value, str) else value
    )
    df['date_added'] = pd.to_datetime(raw_dates, errors='coerce')
    # Year and month the title was added; missing where the date is NaT.
    df['added_year'] = df['date_added'].dt.year
    df['added_month'] = df['date_added'].dt.month


In [None]:
# Split multi-valued text cells (e.g., genres, country) into lists for analysis
def split_to_list(x):
    """Turn a comma-separated cell into a list of trimmed, non-empty parts.

    Missing values (as judged by ``pd.isna``) give an empty list. Any other
    value is converted with ``str`` before being split on commas.
    """
    if pd.isna(x):
        return []
    pieces = (part.strip() for part in str(x).split(','))
    return [piece for piece in pieces if piece]

# Add a '<column>_list' companion for every multi-valued column that is present.
for col in ('country', 'listed_in', 'cast', 'director'):
    if col not in df.columns:
        continue
    df[col + '_list'] = df[col].map(split_to_list)


In [None]:
# Helper: flatten a list-valued column and count its most frequent entries
def top_n_from_list(col_list_name, n=15):
    """Return the ``n`` most common individual entries of a list column.

    Reads the notebook-level ``df``. Each list element becomes its own row
    via ``explode``; rows that end up missing (for example from empty lists)
    are dropped before counting.

    Args:
        col_list_name: Name of a column in ``df`` whose cells are lists.
        n: How many of the most frequent entries to keep (default 15).

    Returns:
        A Series of counts indexed by entry, most frequent first.
    """
    flattened = df[col_list_name].explode()
    frequencies = flattened.dropna().value_counts()
    return frequencies.head(n)

# Top countries
if 'country_list' in df.columns:
    top_countries = top_n_from_list('country_list', 20)
    # A bare expression nested inside an `if` is not echoed by the notebook
    # (only a cell's last top-level expression is), so print it explicitly.
    print(top_countries)


In [None]:
# Top categories/listings
if 'listed_in_list' in df.columns:
    top_genres = top_n_from_list('listed_in_list', 20)
    # A bare expression nested inside an `if` is not echoed by the notebook
    # (only a cell's last top-level expression is), so print it explicitly.
    print(top_genres)


In [None]:
# Share of each content type, drawn as a pie chart with percentage labels
if 'type' in df.columns:
    counts = df['type'].value_counts()
    counts.plot(
        kind='pie',
        title='Content Type Distribution',
        ylabel='',
        autopct='%1.1f%%',
    )
    plt.show()


In [None]:
# Histogram of release years (rows without a year are left out)
if 'release_year' in df.columns:
    known_years = df['release_year'].dropna()
    ax = sns.histplot(known_years, bins=20)
    ax.set_title('Release Year Distribution')
    ax.set_xlabel('Year')
    plt.show()


In [None]:
# Number of titles added per year, as a line chart (only if 'added_year' exists)
if 'added_year' in df.columns:
    trend = df.groupby('added_year').size()
    ax = trend.plot(marker='o')
    ax.set_title('Titles Added by Year')
    ax.set_ylabel('Count')
    ax.set_xlabel('Year Added')
    plt.show()


In [None]:
# Duration parsing (for movies with duration like '90 min')
if 'duration' in df.columns:
    def parse_duration(x):
        """Return the running time in whole minutes, or NaN when unavailable.

        Accepts values such as '90 min' (or a bare number like '90').
        Values expressed in another unit, e.g. '2 Seasons', are not minutes
        and yield NaN, as do missing or otherwise unparseable values.
        """
        tokens = str(x).split()
        if not tokens:
            return np.nan
        # Skip non-minute units so season counts are not mistaken for minutes.
        if len(tokens) > 1 and not tokens[1].lower().startswith('min'):
            return np.nan
        try:
            return int(tokens[0])
        except ValueError:
            # Non-numeric leading token (e.g. the text 'nan' from a missing cell).
            return np.nan

    df['duration_min'] = df['duration'].apply(parse_duration)
    sns.histplot(df['duration_min'].dropna(), bins=20)
    plt.title('Duration (minutes)')
    plt.xlabel('Minutes')
    plt.show()


In [None]:
# Top directors
if 'director_list' in df.columns:
    top_directors = top_n_from_list('director_list', 20)
    # A bare expression nested inside an `if` is not echoed by the notebook
    # (only a cell's last top-level expression is), so print it explicitly.
    print(top_directors)


In [None]:
# Top actors
if 'cast_list' in df.columns:
    top_actors = top_n_from_list('cast_list', 30)
    # A bare expression nested inside an `if` is not echoed by the notebook
    # (only a cell's last top-level expression is), so print it explicitly.
    print(top_actors)


In [None]:
# Correlations for numeric features
# Select every numeric dtype rather than a hard-coded list: the explicit
# ['int64', 'float64', 'Int64'] filter skipped other numeric widths
# (for example the int32 that `.dt.year` can produce).
num_cols = df.select_dtypes(include='number').columns.tolist()
# Keep only columns with more than one distinct non-missing value; a constant
# column has no variance, so its correlations would be undefined.
num_cols = [c for c in num_cols if df[c].nunique() > 1]
# A correlation matrix needs at least two columns to be informative.
if len(num_cols) >= 2:
    corr = df[num_cols].corr()
    sns.heatmap(corr, annot=True, fmt='.2f')
    plt.title('Numeric Correlations')
    plt.show()


In [None]:
# Interactive bar chart (plotly) of the 15 most frequent genres
if 'listed_in_list' in df.columns:
    genre_counts = top_n_from_list('listed_in_list', 15)
    # Two-column frame: the genre label and how often it occurs.
    top_genres = genre_counts.rename_axis('genre').reset_index(name='count')
    fig = px.bar(top_genres, x='genre', y='count', title='Top Genres', text='count')
    fig.update_layout(xaxis_tickangle=-45)
    fig.show()


In [None]:
# Save cleaned snapshot for reproducibility
# `os` is not among the notebook's setup imports, so import it here;
# without it the makedirs call below raises NameError.
import os

# Create the output folder if needed (no error when it already exists).
os.makedirs('data', exist_ok=True)
clean_path = 'data/netflix_cleaned.csv'
# index=False keeps the positional row index out of the file.
df.to_csv(clean_path, index=False)
clean_path
