In [2]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob

In [3]:
# Read the movie metadata workbook into a DataFrame.
# NOTE: file_path is a placeholder; replace it with the real location of the
# Excel file before running this cell.
file_path = 'add the path of movie data file'
data = pd.read_excel(io=file_path)

In [4]:
# Limit the dataset to the first MAX_ROWS rows.
# .copy() detaches the result from the frame it was sliced from: head() alone
# returns a slice, and the column assignments made in later cells would then
# trigger pandas' SettingWithCopyWarning.
MAX_ROWS = 10000
data = data.head(MAX_ROWS).copy()

# Display the first few rows of the dataset
print(data.head())

       id            title  vote_average  vote_count    status  \
0   27205        Inception         8.364       34495  Released   
1  157336     Interstellar         8.417       32571  Released   
2     155  The Dark Knight         8.512       30619  Released   
3   19995           Avatar         7.573       29815  Released   
4   24428     The Avengers         7.710       29166  Released   

          release_date     revenue  runtime  adult  \
0  2010-07-15 00:00:00   825532764      148  False   
1  2014-11-05 00:00:00   701729206      169  False   
2  2008-07-16 00:00:00  1004558444      152  False   
3  2009-12-15 00:00:00  2923706026      162  False   
4  2012-04-25 00:00:00  1518815515      143  False   

                      backdrop_path  ...   original_title  \
0  /8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg  ...        Inception   
1  /pbrkL804c8yAv3zBZR4QPEafpAR.jpg  ...     Interstellar   
2  /nMKdUUepR0i5zn0y1T4CsSB5chy.jpg  ...  The Dark Knight   
3  /vL5LR6WdxWPjLPFRLe133jXWsh5.jp

In [5]:
# Convert 'genres' and 'keywords' columns to strings.
# Missing values are replaced with '' first: astype(str) on its own turns NaN
# into the literal text 'nan', which CountVectorizer would tokenize as a real
# word and count as a feature shared by every movie with missing metadata.
data['genres'] = data['genres'].fillna('').astype(str)
data['keywords'] = data['keywords'].fillna('').astype(str)

# Combine 'genres' and 'keywords' into a single string
data['combined_features'] = data['genres'] + ' ' + data['keywords']

In [6]:
# Vectorize the combined features using CountVectorizer
count_vectorizer = CountVectorizer(stop_words='english')
count_matrix = count_vectorizer.fit_transform(data['combined_features'])

# Compute cosine similarity based on the count matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)

# Map each movie title to its row *position* in cosine_sim.
# Positions (not index labels) are stored because the lookup result is used to
# index cosine_sim and with .iloc. Duplicates are removed on the index (the
# titles): Series.drop_duplicates() compares the values, which are all unique
# here, so it would leave repeated titles in place and indices[title] would
# return a Series instead of a single integer.
indices = pd.Series(np.arange(len(data)), index=data['original_title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [7]:
def get_content_based_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Value to look up in ``data['original_title']``. If several rows carry
        the same title, the first one is used.
    cosine_sim : 2-D array
        Pairwise similarity matrix whose row order is expected to match the
        row order of ``data``. Defaults to the matrix bound to the global
        ``cosine_sim`` at the moment this cell was executed.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        ``original_title`` values of the most similar rows, most similar
        first; the query row itself is never included.

    Raises
    ------
    KeyError
        If no row of ``data`` has the requested title.
    """
    titles = data['original_title']

    # Resolve the title to a row position directly from `data` instead of the
    # global `indices` Series: later cells rebind `indices` to a mapping built
    # for a different frame, which would silently point at the wrong rows.
    matches = np.flatnonzero((titles == title).to_numpy())
    if matches.size == 0:
        raise KeyError(f"Movie title not found: {title!r}")
    idx = int(matches[0])

    # Rank every row by similarity to the query row, highest first. A stable
    # sort keeps tied rows in their original order.
    scores = np.asarray(cosine_sim[idx])
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query row explicitly rather than assuming it sorts first: rows
    # with identical features tie at the top score and may precede it.
    ranked = ranked[ranked != idx][:top_n]

    return titles.iloc[ranked]

In [8]:
# Show the content-based (genres + keywords) recommendations for 'Avatar'
print(get_content_based_recommendations(title='Avatar'))

6462                      Cosmic Sin
4863                         Soldier
1205              Planet of the Apes
7449                        Stowaway
3206                 Mission to Mars
9368         Approaching the Unknown
150                            Alien
128     Rogue One: A Star Wars Story
4206                       ìŠ¹ë¦¬í˜¸
96           The Empire Strikes Back
Name: original_title, dtype: object


In [9]:
# Keep only rows that have both a title and an overview.
# reset_index(drop=True) renumbers the surviving rows 0..n-1 so that index
# labels equal row positions. Without it, dropna() leaves gaps in the labels
# while the TF-IDF / cosine_sim matrices below are addressed by position, so a
# label taken from the index would select the wrong row (or fall out of range).
data_cleaned = (
    data[['original_title', 'overview']]
    .dropna()
    .reset_index(drop=True)
)

# Initialize the TF-IDF Vectorizer and fit-transform the 'overview' column
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data_cleaned['overview'])

# Compute cosine similarity based on the TF-IDF matrix
# (this rebinds the global name cosine_sim used by the earlier cells)
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Map each movie title to its row position in cosine_sim.
# Duplicates are removed on the index (the titles): Series.drop_duplicates()
# compares the values, which are all unique, so repeated titles would survive
# and indices[title] would return a Series instead of a single integer.
indices = pd.Series(np.arange(len(data_cleaned)), index=data_cleaned['original_title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [10]:
def get_recommendations_based_on_overview(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies whose overview is most similar to ``title``'s.

    Parameters
    ----------
    title : str
        Value to look up in ``data_cleaned['original_title']``. If several
        rows carry the same title, the first one is used.
    cosine_sim : 2-D array
        Pairwise similarity matrix whose row order is expected to match the
        row order of ``data_cleaned``. Defaults to the matrix bound to the
        global ``cosine_sim`` at the moment this cell was executed.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        ``original_title`` values of the most similar rows, most similar
        first; the query row itself is never included.

    Raises
    ------
    KeyError
        If no row of ``data_cleaned`` has the requested title.
    """
    titles = data_cleaned['original_title']

    # Resolve the title to a row position directly from `data_cleaned`, so the
    # lookup does not depend on how (or when) the global `indices` Series was
    # built; other cells rebind that name.
    matches = np.flatnonzero((titles == title).to_numpy())
    if matches.size == 0:
        raise KeyError(f"Movie title not found: {title!r}")
    idx = int(matches[0])

    # Rank every row by similarity to the query row, highest first. A stable
    # sort keeps tied rows in their original order.
    scores = np.asarray(cosine_sim[idx])
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query row explicitly rather than assuming it sorts first: rows
    # with identical overviews tie at the top score and may precede it.
    ranked = ranked[ranked != idx][:top_n]

    return titles.iloc[ranked]

In [18]:
# Show the overview-based (TF-IDF) recommendations for 'Inception'
print(get_recommendations_based_on_overview(title='Inception'))

7506                             ì‹ ì„¸ê³„
9162              Inception: The Cobol Job
6566                                 House
6385                                Cypher
3712                       The Wrong Missy
389     Mission: Impossible - Rogue Nation
738                   Central Intelligence
7449                              Stowaway
1546                 A History of Violence
8139                              Primeval
Name: original_title, dtype: object


In [13]:
# Function to clean text (lowercase and strip whitespace)
def clean_text(text):
    """Return ``text`` with surrounding whitespace removed and lowercased.

    Parameters
    ----------
    text : object
        Usually a ``str``. ``None`` and float NaN (how pandas represents a
        missing cell) are treated as empty text; any other non-string value
        (e.g. a numeric spreadsheet cell) is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text, or ``''`` for missing input.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    return str(text).strip().lower()

# Normalize every overview in place by running it through clean_text
data_cleaned['overview'] = data_cleaned['overview'].map(clean_text)

# Function to analyze sentiment using TextBlob
def analyze_sentiment(text):
    """Return the TextBlob sentiment polarity score computed for ``text``."""
    return TextBlob(text).sentiment.polarity

In [14]:
# Analyze sentiment of movie overviews
data_cleaned['sentiment'] = data_cleaned['overview'].apply(analyze_sentiment)

# Re-fit the TF-IDF Vectorizer on the cleaned 'overview' column
tfidf_matrix = tfidf_vectorizer.fit_transform(data_cleaned['overview'])

# Combine TF-IDF features with sentiment scores.
# The sentiment column is appended as one extra sparse column so the feature
# matrix stays sparse; tfidf_matrix.toarray() would allocate a dense
# rows x vocabulary array just to add a single column.
sentiment_scores = data_cleaned['sentiment'].to_numpy(dtype=float).reshape(-1, 1)
combined_features = sparse.hstack(
    [tfidf_matrix, sparse.csr_matrix(sentiment_scores)],
    format='csr',
)

# Compute cosine similarity based on the combined features
# (this rebinds the global name cosine_sim used by the earlier cells)
cosine_sim = cosine_similarity(combined_features, combined_features)

# Map each movie title to its row position in cosine_sim.
# Positions are used instead of data_cleaned.index because the lookup result
# indexes cosine_sim and .iloc. Duplicates are removed on the index (the
# titles): Series.drop_duplicates() compares the values, which are all unique,
# so repeated titles would survive and indices[title] would return a Series.
indices = pd.Series(np.arange(len(data_cleaned)), index=data_cleaned['original_title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [15]:
def get_recommendations_based_on_nlp(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title`` on the NLP features.

    Parameters
    ----------
    title : str
        Value to look up in ``data_cleaned['original_title']``. If several
        rows carry the same title, the first one is used.
    cosine_sim : 2-D array
        Pairwise similarity matrix whose row order is expected to match the
        row order of ``data_cleaned``. Defaults to the matrix bound to the
        global ``cosine_sim`` at the moment this cell was executed.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        ``original_title`` values of the most similar rows, most similar
        first; the query row itself is never included.

    Raises
    ------
    KeyError
        If no row of ``data_cleaned`` has the requested title.
    """
    titles = data_cleaned['original_title']

    # Resolve the title to a row position directly from `data_cleaned`, so the
    # lookup does not depend on how (or when) the global `indices` Series was
    # built; other cells rebind that name.
    matches = np.flatnonzero((titles == title).to_numpy())
    if matches.size == 0:
        raise KeyError(f"Movie title not found: {title!r}")
    idx = int(matches[0])

    # Rank every row by similarity to the query row, highest first. A stable
    # sort keeps tied rows in their original order.
    scores = np.asarray(cosine_sim[idx])
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query row explicitly rather than assuming it sorts first: rows
    # with identical features tie at the top score and may precede it.
    ranked = ranked[ranked != idx][:top_n]

    return titles.iloc[ranked]

In [19]:
# Show the recommendations for 'Inception' computed from the NLP features
# (TF-IDF of the overview plus its sentiment score)
print(get_recommendations_based_on_nlp(title='Inception'))

7506                             ì‹ ì„¸ê³„
9162              Inception: The Cobol Job
6566                                 House
6385                                Cypher
3712                       The Wrong Missy
389     Mission: Impossible - Rogue Nation
738                   Central Intelligence
1546                 A History of Violence
8139                              Primeval
5055                             12 Rounds
Name: original_title, dtype: object
