# installing packages 

## Required packages are listed in requirements.txt file

In [126]:
!pip install -r requirements.txt



## importing libraries

In [128]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

## load a dataset

In [172]:
def load_data(file_path):
    """Loads dataset from a CSV file and handles missing values."""
    df = pd.read_csv(file_path)
    return data_processing(df)

## data preprocessing

In [170]:
def data_processing(df):
    df = df.dropna()
    df = df.dropDuplicates()
    
    return df

## Extract Genre from Query

In [132]:
def extract_genre(user_query, dataset):
    """Extracts a relevant genre keyword from the user query."""
    all_genres = set(dataset['genre'].str.split(',').explode().str.strip().unique())
    detected_genre = [genre for genre in all_genres if genre.lower() in user_query.lower()]
    return detected_genre[0] if detected_genre else None



## Boost Space-Related Queries

In [134]:

def enhance_query(user_query):
    """Boosts space-related words in the query for better recommendations."""
    if "space" in user_query.lower():
        user_query += " galaxy universe planets interstellar sci-fi alien cosmos astronaut"
    return user_query

## Building  TF-IDF Matrix

In [136]:
def build_tfidf_matrix(dataset, user_query):
    """Builds the TF-IDF vectorizer and transforms the dataset and user query."""
    dataset['combined_text'] = dataset['genre'] + ' ' + dataset['overview']
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(dataset['combined_text'].tolist() + [user_query])
    return vectorizer, tfidf_matrix

## Computing Similarity

In [138]:
def compute_similarity(tfidf_matrix):
    """Computes cosine similarity between the user query and dataset and normalizes it."""
    similarity_scores = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()
    scaler = MinMaxScaler()
    similarity_scores = scaler.fit_transform(similarity_scores.reshape(-1, 1)).flatten()
    return similarity_scores

## Get Top N Recommendations

In [140]:
def get_top_recommendations(dataset, similarity_scores, detected_genre, top_n=5):
    """Gets the top N recommended movies sorted by similarity, popularity, and rating."""

    # Add similarity scores to dataset
    dataset = dataset.copy()
    dataset['similarity'] = similarity_scores

    # Prioritize detected genre but do not eliminate other matches
    if detected_genre:
        dataset['genre_match'] = dataset['genre'].apply(lambda g: 1 if detected_genre in g else 0)
    else:
        dataset['genre_match'] = 0

    # Sort based on similarity, genre match, popularity, and rating
    recommendations = dataset.sort_values(
        by=[ 'similarity'], 
        ascending=[ False]
    )

    return recommendations.head(top_n)

## Recommend Movies

In [155]:
def recommend_movies(file_path, user_query, n):
    """Main function to generate movie recommendations."""
    dataset = load_data(file_path)

    if not user_query.strip():
        print("Error: Query cannot be empty.")
        return

    # Enhance query for better keyword matches
    user_query = enhance_query(user_query)

    # Extract genre from user query (soft filter, not strict)
    detected_genre = extract_genre(user_query, dataset)

    if detected_genre:
        print("\n")
    else:
        print("\nNo specific genre detected. Searching based on full text similarity.")

    # Build TF-IDF Matrix
    vectorizer, tfidf_matrix = build_tfidf_matrix(dataset, user_query)

    # Compute Similarity
    similarity_scores = compute_similarity(tfidf_matrix)

    # Get Recommendations
    recommendations = get_top_recommendations(dataset, similarity_scores, detected_genre, n)

    # Display recommendations
    print("\nTop recommended movies:")
    print(recommendations[['title','similarity']].to_string(index=False))


##  Main

In [174]:
if __name__ == "__main__":
    file_path = "movie.csv"  # Make sure the dataset file is in the same directory
    user_query = input("Enter your movie preference: ").strip()
    
    recommend_movies(file_path, user_query, n=5)


Enter your movie preference:  i like action movie set in space


AttributeError: 'DataFrame' object has no attribute 'dropDuplicates'