### Import Libraries

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaModel

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Import Data

In [2]:
df=pd.read_csv('netflix_titles.csv')

In [3]:
df.dropna(subset = ['description'], inplace=True)
df.dropna()
df = df[['show_id', 'title', 'cast', 'description','listed_in', 'director']]

### CORPUS Generation

In [4]:
def generate_corpus(title,cast,description,listed_in,director):
    return title + " " + cast + " " + description+ " " + listed_in+ " " + director

In [5]:
corpus = []
for i in range(len(df)):
    title_str = str(df.iloc[i]['title'])
    cast_str = str(df.iloc[i]['cast'])
    description_str = str(df.iloc[i]['description'])
    listed_in_str = str(df.iloc[i]['listed_in'])
    director_str = str(df.iloc[i]['director'])
    
    corpus.append(generate_corpus(title_str, cast_str, description_str,listed_in_str,director_str))

In [6]:
# df.drop(columns=['id','genres', 'overview', 'tagline'], inplace=True)
df['corpus'] = corpus

In [7]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['corpus'])

In [8]:
cos_mat = linear_kernel(tfidf_matrix, tfidf_matrix)
cos_mat.shape

(8807, 8807)

### Recommendation using Cosine Similarity

In [9]:
def get_recommendations(movie, n):
    index = df[df['title']== movie].index[0]
    similar_movies = sorted(list(enumerate(cos_mat[index])), reverse=True, key=lambda x: x[1]) 
    recomm = []
    for i in similar_movies[1:n+1]:
        recomm.append(df.iloc[i[0]].title)
    return recomm

In [10]:
get_recommendations("Ganglands", 5)

['Earth and Blood',
 "The Eagle of El-Se'eed",
 'Warrior',
 'All Hail King Julien: Exiled',
 'Elite Short Stories: Carla Samuel']

### Recommendation using Roberta

In [11]:
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaModel

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

def get_recommendations(movie, n):
    if movie not in df['title'].values:
        print(f"Movie '{movie}' not found in the DataFrame.")
        return []

    index = df[df['title'] == movie].index[0]
    inputs = tokenizer(df.iloc[index]['corpus'], return_tensors='pt', max_length=512, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    movie_embedding = outputs.last_hidden_state[:, 0, :].squeeze(0).numpy()
    cos_similarities = []
    for i in range(len(df)):
        inputs_i = tokenizer(df.iloc[i]['corpus'], return_tensors='pt', max_length=512, truncation=True)
        with torch.no_grad():
            outputs_i = model(**inputs_i)
        movie_embedding_i = outputs_i.last_hidden_state[:, 0, :].squeeze(0).numpy()
        similarity = np.dot(movie_embedding, movie_embedding_i) / (np.linalg.norm(movie_embedding) * np.linalg.norm(movie_embedding_i))
        cos_similarities.append(similarity)

    similar_movies = sorted(list(enumerate(cos_similarities)), reverse=True, key=lambda x: x[1])

    recommendations = []
    for i in similar_movies[1:n+1]:
        recommendations.append(df.iloc[i[0]]['title'])
    
    return recommendations

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
recommendations = get_recommendations('Ganglands', n=5)
print(recommendations)

['Dealer', 'Gang of the Caribbean', 'Carbon', 'The Stronghold', 'The Misadventures of Hedi and Cokeman']
