<a href="https://colab.research.google.com/github/sriharshitha12/python-project_1261/blob/main/netflix%20final%20code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# === Netflix Content-Based Recommender + Movie/TV Show Type Classifier (accuracy figures refer to the classifier) ===

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# === Load Dataset (handles both .csv and .csv.csv) ===
# Try the double-extension file name first and fall back to the plain one.
try:
    df = pd.read_csv("netflix_titles.csv.csv")
except FileNotFoundError:
    df = pd.read_csv("netflix_titles.csv")

# === Data Cleaning ===
# 1. Drop rows missing any of the text fields that make up `content`.
# 2. Replace every remaining NaN with an empty string.
# 3. Renumber the rows 0..n-1. Without this, dropped rows leave gaps in the
#    index, and any later code that treats an index label as a row position
#    (matrix row lookups, .iloc) would read the wrong row or run off the end.
df = (
    df.dropna(subset=['title', 'description', 'listed_in'])
      .fillna('')
      .reset_index(drop=True)
)

# Combine key metadata fields into one text document per title.
# NOTE(review): `listed_in` is part of this text, and the genre strings shown
# in this notebook's output include values such as "Crime TV Shows" and
# "TV Dramas". That probably reveals the Movie/TV Show label to any classifier
# trained on `content` -- confirm before reading much into its accuracy.
df['content'] = df['title'] + " " + df['listed_in'] + " " + df['description']

# === Encode Target (Movie/TV Show) ===
# Convert the string labels in `type` to integer class codes. The fitted
# encoder is kept in `le`.
le = LabelEncoder().fit(df['type'])
df['type_encoded'] = le.transform(df['type'])

# === TF-IDF Vectorization ===
# Vocabulary capped at 8000 terms with English stop words removed. The
# vectorizer is fitted on every row of df['content'].
y = df['type_encoded']
tfidf = TfidfVectorizer(max_features=8000, stop_words='english')
X = tfidf.fit_transform(df['content'])

# === Train/Test Split ===
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions alike in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# === Build Ensemble Model ===
# Four base classifiers; the stochastic ones are seeded.
nb = MultinomialNB()
lr = LogisticRegression(max_iter=400)
gb = GradientBoostingClassifier(learning_rate=0.1, n_estimators=150, random_state=42)
rf = RandomForestClassifier(max_depth=25, n_estimators=200, random_state=42)

# Soft voting: the ensemble decides from the base models' predicted class
# probabilities rather than from their hard votes.
ensemble = VotingClassifier(
    voting='soft',
    estimators=[('rf', rf), ('gb', gb), ('lr', lr), ('nb', nb)],
)

# === Train ===
ensemble.fit(X_train, y_train)

# === Evaluate ===
# Accuracy and class-weighted precision on the held-out rows.
y_pred = ensemble.predict(X_test)
prec = precision_score(y_test, y_pred, average='weighted')
acc = accuracy_score(y_test, y_pred)

print(
    f"\n‚úÖ Model Accuracy: {acc*100:.2f}%",
    f"‚úÖ Model Precision: {prec*100:.2f}%",
    sep="\n",
)

# === Content Similarity for Recommendations ===
# Dense n x n matrix of pairwise cosine similarities between the TF-IDF rows
# of X. Row/column i corresponds to the i-th row (by position) of df.
cosine_sim = cosine_similarity(X, X)

# Map each title to its row *position*. Positions (not index labels) are what
# both the similarity matrix and DataFrame.iloc expect, so this stays correct
# even when df.index is not a clean 0..n-1 range.
indices = pd.Series(np.arange(len(df)), index=df['title'])

# Series.drop_duplicates() removes duplicate *values*; the values here are
# unique row numbers, so it would never remove anything and repeated titles
# would remain, making `indices[title]` return a Series instead of a scalar.
# De-duplicate on the title index instead, keeping the first row per title.
indices = indices[~indices.index.duplicated(keep='first')]

def recommend(title, num=5):
    """Recommend the titles whose content is most similar to ``title``.

    Reads the module-level ``df`` (title metadata) and ``cosine_sim``
    (square similarity matrix whose row i belongs to the i-th row of ``df``).

    Parameters
    ----------
    title : str
        Exact, case-sensitive title to look up in ``df['title']``. If the
        title occurs more than once, the first occurrence is used.
    num : int, default 5
        Maximum number of recommendations. Values below 1 give an empty
        result; values above the number of other rows are capped.

    Returns
    -------
    pandas.DataFrame or str
        The ``title``, ``listed_in`` and ``description`` columns of the most
        similar rows, most similar first (ties keep dataset order). If the
        title is not present, a message string is returned instead.
    """
    # Resolve the title to a row *position* straight from df. The similarity
    # matrix and .iloc are both positional, so an index label must not be
    # used here, and a duplicated title must not yield several rows.
    matches = np.flatnonzero((df['title'] == title).to_numpy())
    if matches.size == 0:
        return f"‚ùå '{title}' not found in dataset. Try another title."
    pos = int(matches[0])

    # Work on a copy so the shared matrix is never modified.
    scores = np.array(cosine_sim[pos], dtype=float)
    # Exclude the query row explicitly instead of assuming it sorts first:
    # another row with identical content also scores 1.0 and may precede it.
    scores[pos] = -np.inf

    limit = max(0, min(int(num), scores.size - 1))
    # Stable sort on the negated scores: descending similarity, with ties
    # kept in ascending row order.
    top = np.argsort(-scores, kind='stable')[:limit]
    return df[['title', 'listed_in', 'description']].iloc[top]

# === Example Output ===
# Demo call: print a heading, then the recommendations for one sample title.
# recommend() returns either a DataFrame or a "not found" message string;
# print() renders both as plain text.
print("\nüé¨ Recommended titles similar to 'Breaking Bad':\n")
print(recommend("Breaking Bad"))



‚úÖ Model Accuracy: 99.60%
‚úÖ Model Precision: 99.60%

üé¨ Recommended titles similar to 'Breaking Bad':

                                          title  \
2606                            Extracurricular   
4118                                Iron Ladies   
4143                                     Sparta   
5352  Have You Ever Fallen in Love, Miss Jiang?   
2931                           Better Call Saul   

                                              listed_in  \
2606  Crime TV Shows, International TV Shows, Korean...   
4118  International TV Shows, Romantic TV Shows, TV ...   
4143  Crime TV Shows, International TV Shows, TV Dramas   
5352  Crime TV Shows, International TV Shows, TV Dramas   
2931             Crime TV Shows, TV Comedies, TV Dramas   

                                            description  
2606  A model high school student who's steeped in a...  
4118  Three 30-something women in high-level jobs at...  
4143  While investigating the mysterious death of a ...