In [None]:
# 📦 Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Enable inline plots
%matplotlib inline

# 📂 Load the dataset
# The CSV is looked up relative to the notebook's working directory;
# edit `csv_path` if the file lives somewhere else.
csv_path = "Netflix Userbase.csv"
df = pd.read_csv(csv_path)

# Preview the first rows as the cell's output
df.head()


In [None]:
# ✅ Task 1: Data Overview

# Per-column count of missing/null entries
null_counts = df.isnull().sum()
print("Missing values:", null_counts, sep="\n")

# Distinct values observed in each categorical column.
# `categorical_columns` is reused by the count plots in the univariate analysis.
categorical_columns = ['gender', 'subscription_type', 'region', 'device', 'payment_method', 'favorite_genre']
for name in categorical_columns:
    distinct_values = df[name].unique()
    print(f"\nUnique values in '{name}':")
    print(distinct_values)


In [None]:
# ✅ Task 2: Univariate Analysis

numerical_columns = ['age', 'watch_hours', 'monthly_fee', 'churned']

# Distribution plots (one figure per numeric column).
# A KDE overlay only makes sense for continuous data. On a column with at most
# two distinct values (presumably the case for `churned`, which the churn
# analysis averages as a rate -- TODO confirm) it draws a smooth curve between
# the two values that suggests data which does not exist. Such columns are
# drawn as discrete bars without a KDE; all other columns keep the original
# histogram + KDE.
for col in numerical_columns:
    is_two_valued = df[col].nunique() <= 2
    hist_kwargs = {'kde': False, 'discrete': True} if is_two_valued else {'kde': True}

    plt.figure(figsize=(6, 4))
    sns.histplot(df[col], **hist_kwargs)
    plt.title(f'Distribution of {col}')
    plt.show()

# Count plots for categorical variables, bars sorted from most to least frequent
for cat_col in categorical_columns:
    by_frequency = df[cat_col].value_counts().index

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=df, x=cat_col, order=by_frequency, ax=ax)
    ax.set_title(f'Count Plot of {cat_col}')
    # Tilt the category labels so long names do not overlap
    plt.setp(ax.get_xticklabels(), rotation=45)
    plt.show()


In [None]:
# ✅ Task 3: Bivariate Analysis

# Compare average watch_hours and monthly_fee across categorical variables.
# One figure per grouping column, with the two metrics side by side.
metric_panels = [('watch_hours', 'Watch Hours'), ('monthly_fee', 'Monthly Fee')]

for col in ['subscription_type', 'region', 'device']:
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    for ax, (metric, label) in zip(axes, metric_panels):
        # barplot aggregates each group with its default estimator (the mean)
        sns.barplot(data=df, x=col, y=metric, ax=ax)
        ax.set_title(f'Average {label} by {col}')
        plt.setp(ax.get_xticklabels(), rotation=45)

    fig.tight_layout()
    plt.show()

# Avg watch time per day by favorite_genre.
# barplot aggregates with the mean by default, so no `estimator` argument is
# passed. This matches the other bar plots in this notebook and avoids the
# string form 'mean', which seaborn releases that expect a callable estimator
# (before 0.12) reject.
plt.figure(figsize=(8, 4))
sns.barplot(x='favorite_genre', y='avg_watch_time_per_day', data=df)
plt.title('Average Watch Time per Day by Favorite Genre')
plt.xticks(rotation=45)
plt.show()

# Churn analysis: each bar is the mean of `churned` within one category
# (barplot's default estimator), shown for several customer attributes.
churn_dimensions = ['gender', 'region', 'subscription_type', 'payment_method']

for col in churn_dimensions:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.barplot(data=df, x=col, y='churned', ax=ax)
    ax.set_title(f'Churn Rate by {col}')
    plt.setp(ax.get_xticklabels(), rotation=45)
    plt.show()


In [None]:
# ✅ Task 4: Correlation Analysis

# Pairwise correlations (pandas' default method) over the numeric columns only;
# non-numeric columns are excluded before calling .corr().
numeric_df = df.select_dtypes(include=[np.number])
corr = numeric_df.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()


# ✅ Task 5: Insights & Recommendations

Based on the analysis:

1. Customers with lower `watch_hours` appear to have higher churn rates.
2. Premium users generally have higher engagement and lower churn than Basic users.
3. The `region` with the highest average `watch_hours` is a natural target for high-value content.
4. `subscription_type` is strongly associated with both `monthly_fee` and `churned`.
5. `favorite_genre` seems to influence engagement (`avg_watch_time_per_day`).
6. Some payment methods are associated with higher churn – possibly due to billing issues.
7. Devices used for streaming might affect the ease of use and customer satisfaction.

👉 These insights can help Netflix personalize content, adjust pricing strategies, and focus churn prevention efforts.
