# Netflix Data Project — Clean, Analyze, Visualize & Recommend

This notebook is Colab-ready and includes:
- Data loading (with Colab upload fallback)
- Data cleaning
- EDA visualizations
- A simple content-based recommender using each title's first listed genre + title text

Download this `.ipynb`, open in Colab (File → Upload notebook), and run cells sequentially.

---

In [None]:
# Setup: imports
# Standard library
import os
# Data handling
import pandas as pd
import numpy as np
# Plotting (WordCloud renders the title word cloud)
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
# Text vectorization and similarity scores for the recommender
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
# Only reached when every import above succeeded
print('Ready')

## Load dataset
This cell tries `/mnt/data/netflix1.csv` first. If not found (e.g., you're in Colab), it will prompt you to upload the CSV file.

In [None]:
# Preferred location of the dataset; used directly when the file exists there.
csv_path = '/mnt/data/netflix1.csv'
if os.path.exists(csv_path):
    df = pd.read_csv(csv_path)
    print(f'Loaded dataset from {csv_path}')
else:
    # Fallback: ask for an upload through the Colab file widget.
    # Only the import and the upload itself are guarded, so a CSV parsing
    # problem surfaces as its own error instead of a misleading
    # "file not found".
    try:
        from google.colab import files
        uploaded = files.upload()
    except Exception as e:
        # Keep the original failure attached as the cause for debugging.
        raise FileNotFoundError(f'Could not find {csv_path} and upload failed. Error: {e}') from e
    if not uploaded:
        # The upload dialog was closed without choosing a file.
        raise FileNotFoundError(f'Could not find {csv_path} and no file was uploaded.')
    # Use the first uploaded file name.
    fn = next(iter(uploaded))
    df = pd.read_csv(fn)
    print(f'Loaded uploaded file: {fn}')

# Quick safety: show shape
print('Shape:', df.shape)
display(df.head(5))

## Cleaning
- Drop duplicates
- Fill nulls in `director`, `country`, and `rating` with 'Not Given'
- Parse `date_added` and extract year/month/day
- Clean `duration` and extract numeric & unit
- Create `genres_list` and `top_genre`

In [None]:
# Drop duplicates and renumber the rows 0..n-1
before = len(df)
df = df.drop_duplicates().reset_index(drop=True)
after = len(df)
print('Dropped duplicates:', before-after)

# Fill common columns
for c in ['director','country','rating']:
    if c in df.columns:
        df[c] = df[c].fillna('Not Given')

# Date parsing: unparseable or missing dates become NaT (errors='coerce'),
# so the derived year/month/day are NaN for those rows.
if 'date_added' in df.columns:
    df['date_added'] = df['date_added'].astype(str).str.strip()
    df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')
    df['added_year'] = df['date_added'].dt.year
    df['added_month'] = df['date_added'].dt.month
    df['added_day'] = df['date_added'].dt.day

# Duration cleaning
if 'duration' in df.columns:
    # Casting to str turns missing entries into the literal text 'nan';
    # mask them back to missing so 'nan' never shows up as a duration unit.
    duration_missing = df['duration'].isna()
    df['duration'] = df['duration'].astype(str).str.strip().mask(duration_missing)
    # First run of digits as a float (NaN when there is none).
    df['duration_num'] = df['duration'].str.extract(r'(\d+)', expand=False).astype(float)
    # Whatever remains after removing the digits; empty text becomes NaN.
    df['duration_unit'] = df['duration'].str.replace(r'(\d+)\s*', '', regex=True).str.strip().replace('', np.nan)

# Genres
if 'listed_in' in df.columns:
    # Split only real values: a missing entry yields an empty list, so its
    # top_genre falls back to 'Not Given' instead of the text 'nan'.
    # Blank pieces (e.g. from a trailing comma) are dropped.
    df['genres_list'] = df['listed_in'].apply(
        lambda x: [g.strip() for g in str(x).split(',') if g.strip()] if pd.notnull(x) else []
    )
    # The first listed genre serves as the title's primary genre.
    df['top_genre'] = df['genres_list'].apply(lambda x: x[0] if x else 'Not Given')

print('Cleaning done. Columns now:', df.columns.tolist())
display(df.head(3))

## Quick summaries

In [None]:
# Overall size of the frame
print('Shape:', df.shape)

# Ten columns with the most missing entries, largest count first
null_counts = df.isnull().sum()
print('\nMissing values (top 10):')
print(null_counts.sort_values(ascending=False).head(10))

# Each heading is always printed; its table only when the column exists.
# A limit of None shows every value, otherwise only the first `limit` rows.
for heading, column, limit in (
    ('\nType counts:', 'type', None),
    ('\nTop countries:', 'country', 10),
):
    print(heading)
    if column in df.columns:
        counts = df[column].value_counts()
        display(counts if limit is None else counts.head(limit))


## Visualizations
Run the following cell to produce the plots.

In [None]:
%matplotlib inline
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10,5)

# Type distribution
if 'type' in df.columns:
    plt.figure()
    sns.countplot(data=df, x='type', order=df['type'].value_counts().index)
    plt.title('Movies vs TV Shows')
    plt.show()

# Ratings
if 'rating' in df.columns:
    r = df['rating'].astype(str).value_counts()
    plt.figure()
    sns.barplot(x=r.index[:12], y=r.values[:12])
    plt.xticks(rotation=45)
    plt.title('Top Ratings')
    plt.show()

# Top countries
if 'country' in df.columns:
    tc = df['country'].astype(str).value_counts().head(10)
    plt.figure()
    sns.barplot(x=tc.index, y=tc.values)
    plt.xticks(rotation=45)
    plt.title('Top 10 countries by content')
    plt.show()

# Monthly releases (if date parsed)
if 'added_month' in df.columns and 'type' in df.columns:
    mm = df[df['type']=='Movie']['added_month'].value_counts().sort_index()
    ms = df[df['type']=='TV Show']['added_month'].value_counts().sort_index()
    plt.figure()
    plt.plot(mm.index, mm.values, marker='o', label='Movies')
    plt.plot(ms.index, ms.values, marker='o', label='TV Shows')
    plt.xticks(range(1,13))
    plt.legend()
    plt.title('Monthly additions')
    plt.show()

# Word cloud of titles
if 'title' in df.columns:
    titles = df['title'].dropna().astype(str).tolist()
    if titles:
        text = ' '.join(titles)
        wc = WordCloud(width=1000, height=400, background_color='white').generate(text)
        plt.figure(figsize=(12,6))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud of Titles')
        plt.show()


## Simple content-based recommender (genres + title)
We build a combined text field from `top_genre` and `title`, vectorize with TF-IDF, and for a given title return the most similar titles.

In [None]:
# Prepare data for recommender
rec_df = df.copy()

# Build the text to vectorize from the primary genre and the title.
# A missing column contributes empty strings (DataFrame.get with a plain ''
# default would return a str, which has no .fillna), and values are cast to
# str so non-text titles cannot break the concatenation.
text_parts = []
for col in ('top_genre', 'title'):
    if col in rec_df.columns:
        text_parts.append(rec_df[col].fillna('').astype(str))
    else:
        text_parts.append(pd.Series('', index=rec_df.index, dtype=object))
rec_df['rec_text'] = (text_parts[0] + ' ' + text_parts[1]).astype(str)

# TF-IDF
tf = TfidfVectorizer(stop_words='english')
X = tf.fit_transform(rec_df['rec_text'])
# TfidfVectorizer L2-normalizes rows by default, so the plain dot product
# from linear_kernel equals cosine similarity.
# NOTE: cos_sim is a dense rows x rows matrix, so memory grows quadratically
# with the number of rows.
cos_sim = linear_kernel(X, X)
print('Recommender prepared. Rows:', rec_df.shape[0])

# Helper function
def recommend(title, topn=5):
    """Show and return the titles most similar to `title`.

    Similarity scores are read from the precomputed `cos_sim` matrix, which
    is indexed by row position in `rec_df`.

    Args:
        title: Title to look up. It is converted to str, stripped, and
            matched exactly (case-sensitive) against `rec_df['title']`.
        topn: Maximum number of similar titles to return. Values below 1
            give an empty result.

    Returns:
        A DataFrame with the columns title, type and top_genre for the best
        matches, ordered from most to least similar. If the title is not
        found, a hint with example titles is shown and an empty list is
        returned.
    """
    title = str(title).strip()
    is_match = (rec_df['title'].astype(str) == title).to_numpy()
    if not is_match.any():
        print('Title not found. Try a different one.\nExample titles:')
        display(rec_df['title'].head(10))
        return []
    # Use the row *position* of the first match: cos_sim and .iloc are both
    # positional, whereas index labels only coincide with positions while
    # rec_df keeps a default 0..n-1 index.
    pos = int(is_match.argmax())
    scores = cos_sim[pos]
    # sorted() is stable, so equally similar rows keep their original order.
    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    # Leave out the queried row itself, then keep the best `topn`.
    top = [i for i in ranked if i != pos][:max(topn, 0)]
    results = rec_df.iloc[top][['title','type','top_genre']]
    display(results)
    return results

# Example (uncomment and run with a title that exists in your dataset):
# recommend('Midnight Mass', topn=5)


## Save cleaned CSV (optional)
This will save the cleaned dataset to `/mnt/data/netflix_final_cleaned.csv`. Change the path if you want to save elsewhere.

In [None]:
# Destination of the cleaned dataset; change it to save elsewhere.
outp = '/mnt/data/netflix_final_cleaned.csv'
# to_csv does not create missing folders, and /mnt/data may be absent (the
# load cell falls back to an upload in that case), so create it first.
out_dir = os.path.dirname(outp)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df.to_csv(outp, index=False)
print('Saved cleaned CSV to', outp)


## Done
Download the notebook file `netflix_final_notebook.ipynb` and upload it to Google Colab or run locally in Jupyter. If you want additions (more ML, SQL cleaning, or interactive plots) tell me which and I'll add them.