# 🎓 Assignment 3: Netflix Data Science Capstone
This notebook includes:
- Advanced EDA
- Feature Engineering
- Model Tuning & Evaluation
- SHAP/LIME Interpretability
- Streamlit + Docker Deployment Plan

## 🔍 Section 1: Advanced EDA & Feature Insights

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw catalogue and normalise the columns used later in the notebook.
df = pd.read_csv("netflix_titles.csv")
# Strip surrounding whitespace before parsing: with errors='coerce', date
# strings that carry stray padding can fail to match the inferred format and
# silently become NaT.
df['date_added'] = pd.to_datetime(df['date_added'].str.strip(), errors='coerce')
df['description'] = df['description'].fillna('')
df['country'] = df['country'].fillna('Unknown')
df['release_year'] = pd.to_numeric(df['release_year'], errors='coerce')

# Correlation heatmap (only works on numeric columns).
# A single column always yields a 1x1 matrix equal to 1.0, so derive a few
# numeric features from the columns prepared above. They live in a local frame
# so that `df` itself gains no extra columns.
numeric_features = pd.DataFrame({
    'release_year': df['release_year'],
    'year_added': df['date_added'].dt.year,
    'description_length': df['description'].str.len(),
})

fig, ax = plt.subplots()
# Fix the colour scale to the full [-1, 1] correlation range so that colours
# are comparable between runs.
sns.heatmap(numeric_features.corr(), annot=True, vmin=-1, vmax=1, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix")
plt.show()


## 🛠️ Section 2: Feature Engineering & Transformation

In [None]:

from sklearn.preprocessing import LabelEncoder

# Keep the fitted encoders in named variables: they hold the label <-> integer
# mapping, which is needed to decode predictions (`inverse_transform`) and to
# encode user input identically at inference time.
#
# Only encode `type` while it still holds the raw labels. Re-running this cell
# would otherwise re-fit the encoder on the integer codes and lose the original
# class names.
if not pd.api.types.is_numeric_dtype(df['type']):
    type_encoder = LabelEncoder()
    df['type'] = type_encoder.fit_transform(df['type'])

# Missing ratings get their own explicit category before encoding.
df['rating'] = df['rating'].fillna('Unknown')
rating_encoder = LabelEncoder()
df['rating_encoded'] = rating_encoder.fit_transform(df['rating'])

# Modelling frame: target (`type`) plus the two predictors; rows with any
# missing value in these columns are dropped.
df_model = df[['type', 'release_year', 'rating_encoded']].dropna()
X = df_model.drop(columns='type')
y = df_model['type']


## 🤖 Section 3: Model Training & Hyperparameter Tuning

In [None]:

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# Hold out 20% of the rows for the final evaluation; stratify so both splits
# keep the same class balance as `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Small grid over forest size and tree depth (None = grow trees fully).
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [5, 10, None]
}
model = RandomForestClassifier(random_state=42)
# 5-fold CV scored with binary F1 (computed for the class labelled 1).
# n_jobs=-1 runs the candidate fits in parallel; results are unchanged because
# the estimator's random_state is fixed.
grid = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid.fit(X_train, y_train)

# Report the outcome of the search so the selected configuration is visible.
print(f"Best params: {grid.best_params_}")
print(f"Best CV F1: {grid.best_score_:.3f}")

# With the default refit=True, best_estimator_ has been refit on all of X_train.
best_model = grid.best_estimator_
joblib.dump(best_model, 'best_model.pkl')

# Final check on the untouched hold-out set.
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))


## 🧠 Section 4: SHAP Model Explainability

In [None]:

import shap

# X_test is used both as the background data for the explainer and as the set
# of rows to explain.
explainer = shap.Explainer(best_model, X_test)
shap_values = explainer(X_test)

# For classifiers the explanation can carry a per-class axis
# (samples x features x classes). summary_plot expects a 2-D
# (samples x features) matrix; a 3-D array may be interpreted as interaction
# values, which goes unnoticed here because the number of features equals the
# number of classes. Plot the contributions towards class index 1
# (best_model.classes_[1]) instead.
if shap_values.values.ndim == 3:
    shap_values_to_plot = shap_values[:, :, 1]
else:
    shap_values_to_plot = shap_values

shap.summary_plot(shap_values_to_plot, X_test)


## 🚀 Section 5: Streamlit + Docker Deployment

In [None]:

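# To use in Streamlit:
# model = joblib.load("best_model.pkl")
# The model expects the features in training order: [release_year, rating_encoded].
# rating_encoded must use the same integer codes as the LabelEncoder fitted on
# `rating` in Section 2, otherwise predictions will be meaningless.
# prediction = model.predict([[year, rating_encoded]])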

# Dockerfile sample:
# ------------------
# FROM python:3.10
# COPY . /app
# WORKDIR /app
# RUN pip install -r requirements.txt
# EXPOSE 8501
# CMD ["streamlit", "run", "streamlit_app.py"]
