In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [2]:
df = pd.read_excel('netflix_titles.xlsx')
#print(df)

In [3]:
# Checking if any descriptions are null (empty)
null_description_values = df['description'].isnull().sum()
print("null description values:", null_description_values)

null description values: 3


In [4]:
# Removing those null values 
dropped_description = df['description'].dropna()
null_description_values2 = dropped_description.isnull().sum()
print("null description values:", null_description_values2)

null description values: 0


In [5]:
# Convert float to string
df['description'] = df['description'].astype(str)
df['description']

0       Before planning an awesome wedding for his gra...
1       Jandino Asporaat riffs on the challenges of ra...
2       With the help of three human allies, the Autob...
3       When a prison ship crash unleashes hundreds of...
4       When nerdy high schooler Dani finally attracts...
                              ...                        
6231    This parody of first-person shooter games, mil...
6232    Marc Maron stars as Marc Maron, who interviews...
6233    Nursery rhymes and original music for children...
6234    Set during the Russian Revolution, this comic ...
6235    This hit sitcom follows the merry misadventure...
Name: description, Length: 6236, dtype: object

## 1. Preprocessing:
- Lowercase: Convert all text to lowercase to ensure uniformity.
- Remove Punctuation: Strip out any punctuation marks.
- Tokenization: Split the description into individual words (tokens).
- Stopword Removal: Remove common words that don’t carry much meaning (e.g., “the”, “is”, “in”).
- Stemming/Lemmatization: Reduce words to their root form (e.g., “running” becomes “run”).

In [6]:
# Preprocessing function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenization
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    ps = PorterStemmer()  # Initialize stemmer
    tokens = [ps.stem(word) for word in tokens]  # Stemming
    return " ".join(tokens)

In [7]:
# Apply preprocessing
df['processed_description'] = df['description'].apply(preprocess_text)

## 2. Converting Text to Numerical Representation:
- Use sklearn to compute the TF-IDF
- Inverse Document Frequency (IDF): Measures how unique a word is across all documents in the dataset.

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizer
tfidf = TfidfVectorizer()

# Fit and transform the processed descriptions
tfidf_matrix = tfidf.fit_transform(df['processed_description'])

# Convert the matrix to a DataFrame for easier viewing (optional)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names())


## 3. Computing Cosine Similarity:
- Cosine similarity measures the angle between two vectors in a multi-dimensional space, which tells us how similar they are. 
- A cosine similarity of 1 means the vectors are identical, and a value of 0 means they are completely different. 
- The output will show a similarity matrix, where each entry represents the similarity score between two movies or TV shows based on their descriptions.

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Convert the similarity matrix into a DataFrame for better readability
similarity_df = pd.DataFrame(cosine_sim, index=df['title'], columns=df['title'])

# View similarity scores between titles
print(similarity_df)

title                                        Norm of the North: King Sized Adventure  \
title                                                                                  
Norm of the North: King Sized Adventure                                          1.0   
Jandino: Whatever it Takes                                                       0.0   
Transformers Prime                                                               0.0   
Transformers: Robots in Disguise                                                 0.0   
#realityhigh                                                                     0.0   
...                                                                              ...   
Red vs. Blue                                                                     0.0   
Maron                                                                            0.0   
Little Baby Bum: Nursery Rhyme Friends                                           0.0   
A Young Doctor's Notebook and Ot

## 4. Recommend Based on Similarity:

- Now that you have the similarity scores, you can recommend the most similar titles for any given title. 
    - For example, if you want to recommend similar shows to “Movie X”, you can sort the similarity scores for “Movie X” and pick the top results.

In [10]:
def get_recommendations(title, similarity_matrix, df):
    # Get the index of the title
    idx = df[df['title'] == title].index[0]
    # Get similarity scores for that title
    sim_scores = list(enumerate(similarity_matrix[idx]))
    # Sort by similarity score
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the indices of the top 5 most similar titles (excluding the first one, which is itself)
    sim_scores = sim_scores[1:6]
    # Get the recommended titles
    movie_indices = [i[0] for i in sim_scores]
    return df['title'].iloc[movie_indices]

# Get recommendations for 'Transformers Prime'
recommendations = get_recommendations('Friends', cosine_sim, df)
print(recommendations)


20      Manhattan Romance
5845    Trailer Park Boys
6147         Workin' Moms
5989             Episodes
1524           Warehoused
Name: title, dtype: object
