In [30]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from  sklearn.preprocessing import StandardScaler


In [31]:

# Load the dataset
import pandas as pd
df = pd.read_csv('netflix_titles.csv')

In [32]:
df.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,25-Sep-21,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,24-Sep-21,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."


In [33]:
# convert date_added to pandas datetime type
df['date_added'] = pd.to_datetime(df['date_added'],format="mixed")

In [34]:
# fill null values in director, cast and country columns to be set to 'unknown', and for duration to be 'Not added'
df["director"]=df["director"].fillna("Unknown")
df["cast"]=df["cast"].fillna("Unknown")
df["country"]=df["country"].fillna("Unknown")
df['duration'].replace(np.nan,'Not Added', inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['duration'].replace(np.nan,'Not Added', inplace= True)


In [35]:


# fill the rest of null values to previous and next values
df = df.ffill().bfill()


In [36]:
# drop rows have 'duration' == 'Not Added'
idx = df[df.duration == 'Not Added'].index
df.drop(idx, inplace= True)
df.reset_index(drop= True, inplace= True)

In [37]:
## Feature Engineering

In [38]:
# Create features year_added, month_added & month_name
df['year_added'] = df['date_added'].dt.year.astype(int)
df['month_added'] = df['date_added'].dt.month
df['month_name'] = df['date_added'].dt.month_name()

In [39]:
# Create features year_added, month_added & month_name
df['year_added'] = df['date_added'].dt.year.astype(int)
df['month_added'] = df['date_added'].dt.month
df['month_name'] = df['date_added'].dt.month_name()



In [40]:
# create 'season_count' col for TV Shows & 'duration' col for movies
df['season_count'] = df.duration.apply(lambda x: x.split(' ')[0] if 'Season' in x else np.nan)
df['duration'] = df.duration.apply(lambda x: x.split(' ')[0] if 'Season' not in x else np.nan)

In [41]:
# convert ['duration', 'release_year', 'season_count'] to numeric format
cols = ['duration', 'release_year', 'season_count']
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

In [42]:

# Separate features and target variable
X = df.drop(columns=['type'])  # Features
y = df['type']  # Target variable

# Apply oversampling to address class imbalance
oversampler = RandomOverSampler()
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Convert back to DataFrame if needed
df_resampled = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['type'])], axis=1)

# Encode categorical variables
label_encoder = LabelEncoder()
X_resampled_encoded = X_resampled.apply(label_encoder.fit_transform)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled_encoded, y_resampled, test_size=0.2, random_state=42)

# Initializing classifiers
rf_classifier = RandomForestClassifier()
logistic_classifier = LogisticRegression()

rf_classifier.fit(X_train, y_train)
logistic_classifier.fit(X_train, y_train)

# Making predictions
rf_pred = rf_classifier.predict(X_test)
logistic_pred = logistic_classifier.predict(X_test)

# Evaluating RandomForestClassifier
rf_accuracy = accuracy_score(y_test, rf_pred)

# Evaluating Logistic Regression
logistic_accuracy = accuracy_score(y_test, logistic_pred)

# Print the accuracies
print("RandomForestClassifier Accuracy:", rf_accuracy)
print("Logistic Regression Accuracy:", logistic_accuracy)





RandomForestClassifier Accuracy: 1.0
Logistic Regression Accuracy: 0.9881729200652528


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [43]:
df_resampled.head(2)

Unnamed: 0,show_id,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added,month_added,month_name,season_count,type
0,s1,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,2021-09-25,2020,PG-13,90.0,Documentaries,"As her father nears the end of his life, filmm...",2021,9,September,,Movie
1,s2,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2021,9,September,2.0,TV Show


In [44]:


# Load your dataset (replace 'your_dataset.csv' with the actual dataset)
df = pd.read_csv('netflix_titles.csv')

# Separate features and target variable
X = df.drop(columns=['type'])  # Features
y = df['type']  # Target variable

# Apply oversampling to address class imbalance
oversampler = RandomOverSampler()
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Convert back to DataFrame if needed
df_resampled = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['type'])], axis=1)

# Encode categorical variables
label_encoder = LabelEncoder()
X_resampled_encoded = X_resampled.apply(label_encoder.fit_transform)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled_encoded, y_resampled, test_size=0.2, random_state=42)

# Initializing classifiers
rf_classifier = RandomForestClassifier()
logistic_classifier = LogisticRegression()

# Fit the models on the training set
rf_classifier.fit(X_train, y_train)
logistic_classifier.fit(X_train, y_train)

# Making predictions on both the training and test sets for both classifiers
# Random Forest
rf_train_pred = rf_classifier.predict(X_train)
rf_test_pred = rf_classifier.predict(X_test)

# Logistic Regression
logistic_train_pred = logistic_classifier.predict(X_train)
logistic_test_pred = logistic_classifier.predict(X_test)

# Evaluating RandomForestClassifier (train and test accuracy)
rf_train_accuracy = accuracy_score(y_train, rf_train_pred)
rf_test_accuracy = accuracy_score(y_test, rf_test_pred)

# Evaluating Logistic Regression (train and test accuracy)
logistic_train_accuracy = accuracy_score(y_train, logistic_train_pred)
logistic_test_accuracy = accuracy_score(y_test, logistic_test_pred)

# Print the train and test accuracies
print("RandomForestClassifier Train Accuracy:", rf_train_accuracy)
print("RandomForestClassifier Test Accuracy:", rf_test_accuracy)

print("Logistic Regression Train Accuracy:", logistic_train_accuracy)
print("Logistic Regression Test Accuracy:", logistic_test_accuracy)




RandomForestClassifier Train Accuracy: 1.0
RandomForestClassifier Test Accuracy: 0.9942927028128822
Logistic Regression Train Accuracy: 0.8456519522887145
Logistic Regression Test Accuracy: 0.849164288626172


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [45]:
# Train and test data with K-Nearest Neighbors Classifier





In [46]:


# Separate features and target variable
X = df.drop(columns=['type'])  # Features
y = df['type']  # Target variable

# Apply oversampling to address class imbalance
oversampler = RandomOverSampler()
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Convert back to DataFrame if needed
df_resampled = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['type'])], axis=1)

# Encode categorical variables
label_encoder = LabelEncoder()
X_resampled_encoded = X_resampled.apply(label_encoder.fit_transform)

# Feature scaling (KNN benefits from feature scaling)
scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled_encoded)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled_scaled, y_resampled, test_size=0.2, random_state=42)

# Initialize K-Nearest Neighbors classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)

# Train the KNN model
knn_classifier.fit(X_train, y_train)

# Make predictions with KNN
y_train_pred = knn_classifier.predict(X_train)  # Predictions on training set
y_test_pred = knn_classifier.predict(X_test)  # Predictions on test set

# Calculate the training and test accuracies
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Print the train and test accuracies
print(f"K-Nearest Neighbors Classifier Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"K-Nearest Neighbors Classifier Test Accuracy: {test_accuracy * 100:.2f}%")




K-Nearest Neighbors Classifier Train Accuracy: 93.78%
K-Nearest Neighbors Classifier Test Accuracy: 90.38%


In [47]:


# Step 1: Preprocess the data
# Fill missing values with an empty string
df['director'] = df['director'].fillna('')
df['cast'] = df['cast'].fillna('')
df['listed_in'] = df['listed_in'].fillna('')
df['description'] = df['description'].fillna('')

# Handle the duration column
def process_duration(duration):
    if 'min' in duration:
        return int(duration.replace(' min', ''))  # Extract minutes
    elif 'Season' in duration:
        return int(duration.replace(' Seasons', '').replace(' Season', '')) * 60  # Convert seasons to hours (as a proxy)
    else:
        return 0

df['duration'] = df['duration'].fillna('0').apply(process_duration)

# Step 2: Combine features including description for similarity computation
df['combined_features'] = df['type'] + ' ' + df['director'] + ' ' + df['cast'] + ' ' + df['listed_in'] + ' ' + df['description']

# Step 3: Vectorize the text data using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

# Step 4: Compute cosine similarity between all movies/shows
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Step 5: Build a function that recommends shows or movies based on similarity score
def get_recommendations(title, cosine_sim=cosine_sim):
    # Try to get the index of the movie that matches the title
    try:
        idx = df[df['title'].str.contains(title, case=False)].index[0]
    except IndexError:
        return "Sorry, the title you entered was not found in the dataset."

    # Get the pairwise similarity scores of all shows/movies with that title
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the shows/movies based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar shows/movies
    sim_scores = sim_scores[1:11]

    # Get the show/movie indices
    show_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar shows/movies
    return df['title'].iloc[show_indices]

# Step 6: Test the recommendation function with error handling
print(get_recommendations('Sankofa'))


8353                    The Hunt
5044           When We First Met
7037               I Am Jane Doe
3094                     The App
8238           The Carter Effect
7850    Reggie Yates Outside Man
4056                      Losers
7904           Running for Grace
6519          Comedy Bang! Bang!
227                  Really Love
Name: title, dtype: object


In [48]:




# Combining text features for content-based similarity
df['combined_features'] = df['type'] + ' ' + df['director'] + ' ' + df['cast'] + ' ' + df['listed_in'] + ' ' + df['description']

# Step 1: Content-Based Model (TF-IDF + Cosine Similarity)
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_features'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# KNN Preparation
# Step 2: KNN Model (for classification)

# Separate features and target variable for KNN
X = df.drop(columns=['type'])  # Features
y = df['type']  # Target variable

# Apply oversampling to address class imbalance
oversampler = RandomOverSampler()
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Encode categorical variables for KNN
label_encoder = LabelEncoder()
X_resampled_encoded = X_resampled.apply(label_encoder.fit_transform)

# Feature scaling (KNN benefits from feature scaling)
scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled_encoded)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled_scaled, y_resampled, test_size=0.2, random_state=42)

# Initialize K-Nearest Neighbors classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)

# Train the KNN model
knn_classifier.fit(X_train, y_train)

# Function to get recommendations from content-based model
def get_content_recommendations(title, cosine_sim=cosine_sim):
    try:
        idx = df[df['title'].str.contains(title, case=False)].index[0]
    except IndexError:
        return []

    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    show_indices = [i[0] for i in sim_scores]

    return df['title'].iloc[show_indices]

# Voting Classifier: Combine KNN and Content-Based Model
def voting_classifier(title, cosine_sim=cosine_sim, knn_model=knn_classifier, weight_knn=0.5, weight_content=0.5):
    # Step 1: Get content-based recommendations
    content_based_recommendations = get_content_recommendations(title)

    # Step 2: Get KNN predictions
    # Note: We use KNN to predict the "type" (TV Show/Movie) based on numeric and categorical features
    # For simplicity, we simulate a scenario where we want to predict the type of the given title.
    try:
        idx = df[df['title'].str.contains(title, case=False)].index[0]
        knn_pred = knn_model.predict([X_resampled_scaled[idx]])[0]  # KNN predicts the type
    except IndexError:
        return "Title not found in the dataset."

    # Step 3: Combine content-based and KNN results
    if len(content_based_recommendations) > 0:
        return {
            "KNN_Prediction": knn_pred,
            "Content-Based_Recommendations": content_based_recommendations,
        }
    else:
        return "No content-based recommendations found."

# Test the combined voting classifier with a specific title
result = voting_classifier('Sankofa')
print(result)




{'KNN_Prediction': 'Movie', 'Content-Based_Recommendations': 8353                    The Hunt
5044           When We First Met
7037               I Am Jane Doe
3094                     The App
8238           The Carter Effect
7850    Reggie Yates Outside Man
4056                      Losers
7904           Running for Grace
6519          Comedy Bang! Bang!
227                  Really Love
Name: title, dtype: object}


In [49]:
# Test the recommendation system with another movie or TV show
result = voting_classifier('Kota Factory')
print(result)


{'KNN_Prediction': 'TV Show', 'Content-Based_Recommendations': 8775         Yeh Meri Family
3466            Girls Hostel
2353           Chaman Bahaar
2472                  Betaal
4265    Single Ladies Senior
266     The Creative Indians
366                  Glow Up
751           Guru Aur Bhole
1038          Dancing Angels
3464       Engineering Girls
Name: title, dtype: object}
