In [9]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

movies_path = 'movies.csv'
df_movies = pd.read_csv(movies_path)

def preprocess_text(text):
    """Preprocess text by converting to lowercase and stripping whitespace."""
    return text.lower().strip()

def recommend_movies(user_query, dataset, top_n=5):
    """Recommend top N similar movies based on content similarity."""
    dataset['processed_genres'] = dataset['genres'].fillna('').apply(preprocess_text)
    
    # Using TF-IDF Vectorization
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(dataset['processed_genres'])
    
    # Transforming user query to TF-IDF vector
    query_vector = vectorizer.transform([preprocess_text(user_query)])
    
    # Compute the cosine similarity
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()
    
    # Get top N recommendations
    top_indices = np.argsort(similarity_scores)[-top_n:][::-1]
    recommendations = dataset.iloc[top_indices]
    
    return recommendations[['title', 'genres']]

if __name__ == "__main__":
    user_input = input("Enter your movie preferences: ")
    recommendations = recommend_movies(user_input, df_movies)
    
    print(recommendations)

    # Salary Expectation
    salary_expectation = "$23-35 per hour"
    print(f"\nSalary Expectation: {salary_expectation}")


Enter your movie preferences: salary expectation
                                    title                        genres
9741  Andrew Dice Clay: Dice Rules (1991)                        Comedy
3244                   Cats & Dogs (2001)               Children|Comedy
3251           Beach Blanket Bingo (1965)                Comedy|Musical
3250                 Another Woman (1988)                         Drama
3249                         Alice (1990)  Comedy|Drama|Fantasy|Romance

Salary Expectation: $23-35 per hour
