In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the movie catalogue from a CSV file given by a relative path.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like
# a previously saved index, and '-' used as a placeholder for missing values
# in several columns — confirm both against the data source.
movies_df = pd.read_csv('indian_movies.csv')

# Preview the first few rows to check the column names and value formats
movies_df.head()


Unnamed: 0,ID,Movie Name,Year,Timing(min),Rating(10),Votes,Genre,Language
0,tt0398974,Dr. Shaitan,1960,-,-,-,-,hindi
1,tt1702558,Nadir Khan,1968,-,-,-,-,urdu
2,tt0493437,Apna Sapna Money Money,2006,134 min,5.3,1892,"Comedy, Musical, Romance",hindi
3,tt0273405,Aag Aur Sholay,1987,-,2.2,20,-,urdu
4,tt0049595,Parivar,1956,-,7.4,21,"Comedy, Drama, Family",hindi


In [5]:
# Build one text field per movie for similarity assessment by joining the
# 'Movie Name' and 'Genre' columns with a single space.
# Missing values are replaced with '' first: str + NaN yields NaN, and a NaN
# document makes CountVectorizer raise when it is fitted on this column.
movies_df['combined'] = (
    movies_df['Movie Name'].fillna('') + ' ' + movies_df['Genre'].fillna('')
)


In [7]:
# Initialize the Count Vectorizer (bag-of-words token counts), dropping
# scikit-learn's built-in English stop words.
# NOTE(review): many titles are transliterated Hindi/Urdu, so the English stop
# list only affects English-titled movies — confirm that is the intent.
vectorizer = CountVectorizer(stop_words='english')

# Learn the vocabulary from the combined text and encode each movie as a row
# of token counts
combined_matrix = vectorizer.fit_transform(movies_df['combined'])


In [9]:
# Calculate the cosine similarity between every pair of rows in combined_matrix.
# NOTE(review): with default arguments this presumably returns a dense
# (n_movies x n_movies) array; the printed results in this notebook show row
# labels above 48,000, so this may need many gigabytes of memory — confirm
# that is acceptable before running on the full dataset.
cosine_similarities = cosine_similarity(combined_matrix)


In [10]:
# Function to get the top N similar movies
def get_similar_movies(movie_title, cosine_sim, movies_df, top_n=10):
    """Return the rows of the ``top_n`` movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact value to look up in the 'Movie Name' column. If several rows
        share the title, the first one (in row order) is used as the query.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose row/column order matches the row
        order of ``movies_df``.
    movies_df : pandas.DataFrame
        Movie table containing a 'Movie Name' column.
    top_n : int, default 10
        Maximum number of similar movies to return; values below 1 give an
        empty result.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies_df``, ordered from most to least
        similar. The query row itself is never included.

    Raises
    ------
    IndexError
        If no row has 'Movie Name' equal to ``movie_title``.
    """
    # Locate the query by row *position*, not index label: both cosine_sim and
    # .iloc below are positional, so a label would be wrong for any DataFrame
    # whose index is not the default 0..n-1 range.
    matches = (movies_df['Movie Name'] == movie_title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"Movie '{movie_title}' not found in the dataset.")
    idx = int(matches[0])

    # Pair every other movie's position with its similarity to the query.
    # The query is excluded explicitly rather than by dropping the first sorted
    # entry: another movie with an identical score can sort ahead of it.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Most similar first; the sort is stable, so ties keep their row order
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the positions of the top_n best matches
    movie_indices = [position for position, _ in sim_scores[:max(top_n, 0)]]

    return movies_df.iloc[movie_indices]


In [17]:
# Look up the most similar movies for each query title; a title that is not
# in the dataset is reported and left out of `results`
queries = ['Dilwale Dulhania Le Jayenge', 'Chennai Express', 'Kabir Singh']  # Use movie names that exist in your dataset
results = {}

for title in queries:
    try:
        recommendations = get_similar_movies(title, cosine_similarities, movies_df)
    except IndexError:
        print(f"Movie '{title}' not found in the dataset.")
    else:
        results[title] = recommendations


In [19]:
# Print each query's recommendations, showing only the title and genre columns
shown_columns = ['Movie Name', 'Genre']
for title, recommendations in results.items():
    print(f"\nTop 10 movies similar to '{title}':")
    print(recommendations[shown_columns])



Top 10 movies similar to 'Dilwale Dulhania Le Jayenge':
                         Movie Name                               Genre
46003  Dilwale Dulhaniya Le Jayenge                 Romance            
29123         Dulhan Hum Le Jayenge  Comedy, Drama, Romance            
40829                       Dilwale  Action, Drama, Romance            
2749                     Once Again          Drama, Romance            
8023                     Be with Me          Drama, Romance            
22297                   Le Halua Le  Comedy, Drama, Romance            
26648                       Someone          Drama, Romance            
48293                    Do Dilwale                 Romance            
37188         Dulhan Hum Le Jayenge                   Drama            
48617         Badrinath Ki Dulhania  Comedy, Drama, Romance            

Top 10 movies similar to 'Chennai Express':
            Movie Name                                  Genre
23843  Chennai Express  Action, Adventure, C