In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import files

# Load the dataset. Only the first 1000 rows are used, so let the parser stop
# there instead of reading the whole file and discarding the rest. Capping the
# row count also bounds the pairwise similarity matrix at 1000 x 1000.
movies_df = pd.read_csv('/content/action.csv', nrows=1000)

# Check the dataset structure
print("Dataset Columns:", movies_df.columns)
# print("First 5 rows of the dataset:", movies_df.head())

# Text columns that feed the similarity model. Missing values are replaced
# with empty strings so the concatenation below never yields NaN.
text_feature_columns = ['description', 'genre', 'director', 'star']
for column in text_feature_columns:
    movies_df[column] = movies_df[column].fillna('')

# Combine relevant features into a single space-separated string per movie.
# Column-wise concatenation produces the same strings as a row-wise apply
# without calling a Python lambda once per row.
movies_df['combined_features'] = (
    movies_df['description'] + ' '
    + movies_df['genre'] + ' '
    + movies_df['director'] + ' '
    + movies_df['star']
)

# Vectorize the combined text with TF-IDF, ignoring English stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_df['combined_features'])

# Pairwise cosine similarity between every pair of movies; row i holds the
# similarity of movie i (by row position) to every other movie.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

def get_recommendations(movie_title, cosine_sim=cosine_sim):
    """Return the names of the 10 movies most similar to ``movie_title``.

    Args:
        movie_title: Exact value to look up in ``movies_df['movie_name']``.
            If several rows share the title, the first one is used.
        cosine_sim: Square similarity matrix whose row/column order matches
            the row order of ``movies_df``. Defaults to the module-level
            matrix as it existed when this function was defined.

    Returns:
        A ``pandas.Series`` of up to 10 movie names ordered from most to
        least similar, or an empty list (after printing a message) when the
        title is not present in the dataset.
    """
    # Work with row *positions* rather than index labels: ``cosine_sim`` and
    # ``.iloc`` are both positional, so this stays correct even if
    # ``movies_df`` does not carry a default 0..n-1 index.
    match_positions = (movies_df['movie_name'] == movie_title).to_numpy().nonzero()[0]
    if len(match_positions) == 0:
        print(f"'{movie_title}' not found in the dataset.")
        return []
    idx = int(match_positions[0])

    # Pair every other movie's position with its similarity to the query.
    # The query row is excluded explicitly instead of assuming it sorts
    # first: ties (duplicate entries, or an all-zero feature vector) could
    # otherwise drop a different movie and recommend the query itself.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Stable sort, highest similarity first
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the 10 best matches and map their positions back to names
    movie_indices = [position for position, _ in sim_scores[:10]]
    return movies_df['movie_name'].iloc[movie_indices]

# Demo: look up recommendations for one title and show them under a header.
# Swap in any title that exists in the dataset's 'movie_name' column.
input_movie = "Avengers: Endgame"
recommendations = get_recommendations(input_movie)
print(
    "Movies similar to '{}':".format(input_movie),
    recommendations,
    sep="\n",
)


Dataset Columns: Index(['movie_id', 'movie_name', 'year', 'certificate', 'runtime', 'genre',
       'rating', 'description', 'director', 'director_id', 'star', 'star_id',
       'votes', 'gross(in $)'],
      dtype='object')
Movies similar to 'Avengers: Endgame':
99                  Avengers: Infinity War
281             Captain America: Civil War
385    Captain America: The Winter Soldier
53                            The Gray Man
259                Avengers: Age of Ultron
393              Spider-Man: Far from Home
184                           The Avengers
642                   Thor: The Dark World
222                         Thor: Ragnarok
918            Men in Black: International
Name: movie_name, dtype: object
