In [2]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\23324\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
# Import dataset
movies_df = pd.read_csv('./tmdb_5000_movies.csv')
#print(movies_df.head())
columns = ['original_title', 'overview'] 
relevant_columns_df = movies_df[columns]
print(relevant_columns_df)

                                original_title  \
0                                       Avatar   
1     Pirates of the Caribbean: At World's End   
2                                      Spectre   
3                        The Dark Knight Rises   
4                                  John Carter   
...                                        ...   
4798                               El Mariachi   
4799                                 Newlyweds   
4800                 Signed, Sealed, Delivered   
4801                          Shanghai Calling   
4802                         My Date with Drew   

                                               overview  
0     In the 22nd century, a paraplegic Marine is di...  
1     Captain Barbossa, long believed to be dead, ha...  
2     A cryptic message from Bond’s past sends him o...  
3     Following the death of District Attorney Harve...  
4     John Carter is a war-weary, former military ca...  
...                                                

In [4]:

stop_words = set(stopwords.words('english'))
sia = SentimentIntensityAnalyzer()
# Preprocessing step
def preprocess_overview(overview):
    if pd.notnull(overview):  # Check if overview is not NaN
        tokens = word_tokenize(overview.lower())
        tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
        sentiment_score = sia.polarity_scores(overview)['compound']
        
        positive_tokens = [token for token in tokens if sia.polarity_scores(token)['compound'] >= 0.5]
        # Keep original overview for positive tokens
        if positive_tokens:
            return overview  
    # Return empty string for movies without positive tokens (rows will be dropped later)
    return ''  

In [5]:
# Preprocess the 'overview' column
relevant_columns_df['processed_overview'] = relevant_columns_df['overview'].apply(preprocess_overview)

# Drop rows with empty 'processed_overview'
relevant_columns_df = relevant_columns_df[relevant_columns_df['processed_overview'] != '']

# Print processed data
print(relevant_columns_df[['original_title', 'processed_overview']])



                          original_title  \
5                           Spider-Man 3   
6                                Tangled   
7                Avengers: Age of Ultron   
9     Batman v Superman: Dawn of Justice   
10                      Superman Returns   
...                                  ...   
4786                    Breaking Upwards   
4788                      Pink Flamingos   
4794        Sanctuary: Quite a Conundrum   
4800           Signed, Sealed, Delivered   
4801                    Shanghai Calling   

                                     processed_overview  
5     The seemingly invincible Spider-Man goes up ag...  
6     When the kingdom's most wanted-and most charmi...  
7     When Tony Stark tries to jumpstart a dormant p...  
9     Fearing the actions of a god-like Super Hero l...  
10    Superman returns to discover his 5-year absenc...  
...                                                 ...  
4786  'Breaking Upwards' explores a young, real-life...  
4788  N

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_columns_df['processed_overview'] = relevant_columns_df['overview'].apply(preprocess_overview)


In [6]:
overviews = relevant_columns_df['processed_overview']

# Create TF-IDF vectors
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(overviews)

# Compute cosine similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

[[1.         0.02824542 0.04317569 ... 0.01938023 0.0459021  0.02267121]
 [0.02824542 1.         0.04954739 ... 0.04587378 0.0504558  0.03384842]
 [0.04317569 0.04954739 1.         ... 0.02725042 0.04624032 0.03559308]
 ...
 [0.01938023 0.04587378 0.02725042 ... 1.         0.02159189 0.02704411]
 [0.0459021  0.0504558  0.04624032 ... 0.02159189 1.         0.04748167]
 [0.02267121 0.03384842 0.03559308 ... 0.02704411 0.04748167 1.        ]]


In [7]:
indices = pd.Series(relevant_columns_df.index, index=relevant_columns_df['original_title']).drop_duplicates()
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

     # Get the similarity scores of all movies with that movie
    sim_scores = cosine_sim[idx]

    # Get the indices of the top 10 most similar movies
    movie_indices = np.argsort(sim_scores)[::-1][1:11]

    # Return the top 10 most similar movies
    return relevant_columns_df['original_title'].iloc[movie_indices]

In [8]:
get_recommendations('Superman Returns')

2129                                     The Black Hole
1294                                           Serenity
4044                                         Go for It!
17          Pirates of the Caribbean: On Stranger Tides
3103                     La femme de chambre du Titanic
310                             In the Heart of the Sea
1875                                          Elizabeth
171     Master and Commander: The Far Side of the World
3497                                       The Greatest
2446                                            My Girl
Name: original_title, dtype: object