<a href="https://colab.research.google.com/github/rubin222/Movie-ticket-booking-with-content-based-filtering/blob/main/main_recommendation_ml_part.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# CSV inputs to merge; each path is tried in order by the loader.
file_paths = [
    "/content/IMDB-Movie-Dataset(2023-1951).csv",
    "/content/Top_10000_Movies.csv",
    "/content/tmdb_5000_movies.csv",
]

# Standard column name -> header spellings accepted for it.
# The aliases are listed in priority order: when a file contains several of
# them, only the first one present is renamed to the standard name.
column_mapping = {
    "title": [
        "title",
        "original_title",
        "movie_name",
    ],
    "genres": [
        "genres",
        "genre",
        "movie_genre",
    ],
    "overview": [
        "overview",
    ],
}

# Read every CSV, normalise its header, and keep the frame only when all
# standard columns (the keys of column_mapping) are present after renaming.
dfs = []
for file_path in file_paths:
    try:
        frame = pd.read_csv(file_path, encoding='utf-8', on_bad_lines='skip', engine='python')
        # Headers are compared in lower case with surrounding whitespace removed
        frame.columns = frame.columns.str.lower().str.strip()

        print(f"\n📂 File: {file_path}")
        print("Original Columns:", frame.columns.tolist())

        # For each standard name, rename the first alias found in this file
        renames = {}
        for standard_name, aliases in column_mapping.items():
            found = next((alias for alias in aliases if alias in frame.columns), None)
            if found is not None:
                renames[found] = standard_name
        frame = frame.rename(columns=renames)

        print("Renamed Columns:", frame.columns.tolist())

        # Drop files that still lack any of the standard columns
        missing = [required for required in column_mapping if required not in frame.columns]
        if missing:
            print(f"⚠ Skipping {file_path} due to missing necessary columns after renaming.")
            continue
        dfs.append(frame)
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining files
        print(f"❌ Error loading {file_path}: {e}")

# Nothing usable was loaded: stop here rather than fail later on an empty concat
if not dfs:
    raise ValueError("None of the uploaded files contain the required movie details. Check the column names above.")

# Stack all accepted frames under a fresh 0..n-1 index
df = pd.concat(dfs, ignore_index=True)
print("\n✅ Successfully merged datasets!\n")

# Data preprocessing
# Deletion table for every ASCII punctuation character, built once.
# A translate table is used instead of f"[{string.punctuation}]" because,
# inside a regex character class, the "\]" pair in string.punctuation is read
# as an escaped "]", which silently leaves backslashes in the text.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Normalise a value into lower-case text without punctuation or digits.

    Args:
        text: Any value; it is converted with ``str()`` first, so ``None`` or
            numbers are accepted.

    Returns:
        The lower-cased string with all ``string.punctuation`` characters
        (including the backslash) and all digit runs removed. Characters are
        deleted, not replaced by spaces, so existing whitespace is preserved
        as-is.
    """
    lowered = str(text).lower()
    without_punctuation = lowered.translate(_PUNCTUATION_TABLE)
    return re.sub(r"\d+", "", without_punctuation)

# Fill missing text fields with a placeholder before they are concatenated
for text_column in ('title', 'genres', 'overview'):
    df[text_column] = df[text_column].fillna("Unknown")

# One cleaned text blob per row: title, genres and overview joined by spaces
raw_features = df['title'] + " " + df['genres'] + " " + df['overview']
df['combined_features'] = raw_features.apply(clean_text)

# Convert text to TF-IDF feature vectors, ignoring English stop words
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined_features'])

# Pairwise cosine similarity between every pair of rows
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Movie recommendation function
def recommend_movies(movie_title, num_recommendations=5):
    """Return the titles most similar to ``movie_title``.

    Similarity is read from the module-level ``cosine_sim`` matrix, whose rows
    are aligned positionally with the rows of the module-level ``df``.

    Args:
        movie_title: Title to look up; must match a value in ``df['title']``
            exactly.
        num_recommendations: Maximum number of titles to return.

    Returns:
        A list of up to ``num_recommendations`` distinct titles ordered from
        most to least similar, never containing ``movie_title`` itself.
        If the title is not present in the dataset, a human-readable error
        string is returned instead (kept for backward compatibility).
    """
    title_matches = (df['title'] == movie_title).to_numpy()
    if not title_matches.any():
        return f"Movie '{movie_title}' not found in dataset. Please enter a valid movie title."
    if num_recommendations <= 0:
        return []

    # Use the row *position* of the first match: cosine_sim is indexed by
    # position, which only equals the index label when the index is 0..n-1.
    query_pos = int(title_matches.argmax())
    ranked = sorted(enumerate(cosine_sim[query_pos]), key=lambda pair: pair[1], reverse=True)

    # The merged datasets can contain the same title several times. Blindly
    # dropping the top-ranked row would then return the queried movie (or its
    # duplicates) as "recommendations", so filter by title instead.
    titles = df['title'].tolist()
    seen_titles = {movie_title}
    recommendations = []
    for position, _score in ranked:
        candidate = titles[position]
        if candidate in seen_titles:
            continue
        seen_titles.add(candidate)
        recommendations.append(candidate)
        if len(recommendations) == num_recommendations:
            break

    return recommendations

# Example usage
# The lookup in recommend_movies is an exact string match, so surrounding
# whitespace typed at the prompt is stripped to avoid a spurious "not found".
movie_name = input("Enter a movie title: ").strip()
recommended_movies = recommend_movies(movie_name)
print("Recommended Movies:", recommended_movies)


📂 File: /content/IMDB-Movie-Dataset(2023-1951).csv
Original Columns: ['unnamed: 0', 'movie_id', 'movie_name', 'year', 'genre', 'overview', 'director', 'cast']
Renamed Columns: ['unnamed: 0', 'movie_id', 'title', 'year', 'genres', 'overview', 'director', 'cast']

📂 File: /content/Top_10000_Movies.csv
Original Columns: ['unnamed: 0', 'id', 'original_language', 'original_title', 'popularity', 'release_date', 'vote_average', 'vote_count', 'genre', 'overview', 'revenue', 'runtime', 'tagline']
Renamed Columns: ['unnamed: 0', 'id', 'original_language', 'title', 'popularity', 'release_date', 'vote_average', 'vote_count', 'genres', 'overview', 'revenue', 'runtime', 'tagline']

📂 File: /content/tmdb_5000_movies.csv
Original Columns: ['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language', 'original_title', 'overview', 'popularity', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'vote_average'