In [25]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec, FastText
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
data = pd.read_csv("movie.csv")
data.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [27]:
movies = data[['Title', 'Plot']]

In [28]:
# BOW (Bag-of-Words)
count_vectorizer = CountVectorizer()
bow_matrix = count_vectorizer.fit_transform(movies['Plot'])

In [29]:
# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['Plot'])

In [30]:
# Word2Vec
sentences = [plot.split() for plot in movies['Plot']]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

In [31]:
# FastText
fasttext_model = FastText(sentences, vector_size=100, window=5, min_count=1, workers=4)

In [33]:
def content_based_recommender(plot_description, method='bow', top_n=5):
    if method == 'bow':
        plot_vector = count_vectorizer.transform([plot_description])
        similarity_matrix = cosine_similarity(plot_vector, bow_matrix)
    elif method == 'tfidf':
        plot_vector = tfidf_vectorizer.transform([plot_description])
        similarity_matrix = cosine_similarity(plot_vector, tfidf_matrix)
    elif method == 'fasttext':
        plot_vector = sum(fasttext_model.wv[word] for word in plot_description.split() if word in fasttext_model.wv)
        similarity_matrix = cosine_similarity([plot_vector], [fasttext_model.wv[word] for word in movies['Plot']])
    else:
        print("Invalid method.")
        return
    
    top_indices = similarity_matrix.argsort()[0][-top_n:][::-1]
    recommended_movies = movies.iloc[top_indices]['Title']
    return recommended_movies.tolist()

In [39]:
plot_description = "A young boy embarks on a magical adventure to save his family."
recommended_movies_bow = content_based_recommender(plot_description, method='bow', top_n=5)
recommended_movies_tfidf = content_based_recommender(plot_description, method='tfidf', top_n=5)
recommended_movies_fasttext = content_based_recommender(plot_description, method='fasttext', top_n=5)
print("Recommended Movies (BOW):", recommended_movies_bow)
print("Recommended Movies (tfidf):", recommended_movies_tfidf)
print("Recommended Movies (fasttext):", recommended_movies_fasttext)

Recommended Movies (BOW): ['Annaiyum Pithavum', 'Mitti Wajaan Maardi', 'Nalla Kaalam Porandaachu', 'The Homecoming', 'Honeysuckle Rose']
Recommended Movies (tfidf): ['The Mirror Boy', 'The Idol', 'Puella Magi Madoka Magica (Part 2)', 'Puella Magi Madoka Magica (Part 1)', 'Yona Yona Penguin']
Recommended Movies (fasttext): ['The Summer Is Gone', 'High Jinks in Society', 'Gone with the Bullets', 'Ekti Tarar Khonje', 'Runway']
