In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import ast
import time
import json
import os


In [2]:
# Load the IMDb dataset
file_path = '/kaggle/input/full-imdb-dataset/data.csv'
df = pd.read_csv(file_path)

# Display basic information about the dataset
print(f"Dataset Shape: {df.shape}")
print("\nColumns in the dataset:")
print(df.columns)

# Show a sample of the dataset to understand its structure
df.head()


Dataset Shape: (1035798, 7)

Columns in the dataset:
Index(['id', 'title', 'type', 'genres', 'averageRating', 'numVotes',
       'releaseYear'],
      dtype='object')


Unnamed: 0,id,title,type,genres,averageRating,numVotes,releaseYear
0,tt0000009,Miss Jerry,movie,Romance,5.4,215.0,1894.0
1,tt0000147,The Corbett-Fitzsimmons Fight,movie,"Documentary, News, Sport",5.2,542.0,1897.0
2,tt0000502,Bohemios,movie,,4.4,18.0,1905.0
3,tt0000574,The Story of the Kelly Gang,movie,"Action, Adventure, Biography",6.0,948.0,1906.0
4,tt0000591,The Prodigal Son,movie,Drama,5.7,29.0,1907.0


In [3]:
df['releaseYear'] = pd.to_numeric(df['releaseYear'], errors='coerce')

# Filter out movies released before 1980
df_filtered = df[df['releaseYear'] >= 1980]

# Display the shape of the filtered dataset
print(f"Dataset shape after filtering: {df_filtered.shape}")

# Display the first few rows of the filtered dataset
print(df_filtered.head())

Dataset shape after filtering: (717077, 7)
              id                        title   type  \
5500   tt0011801             Tötet nicht mehr  movie   
8526   tt0015414       La tierra de los toros  movie   
8779   tt0015724                Dama de noche  movie   
24375  tt0035423               Kate & Leopold  movie   
25338  tt0036606  Another Time, Another Place  movie   

                         genres  averageRating  numVotes  releaseYear  
5500              Action, Crime            NaN       NaN       2019.0  
8526                        NaN            5.4      17.0       2000.0  
8779    Drama, Mystery, Romance            6.3      31.0       1993.0  
24375  Comedy, Fantasy, Romance            6.4   90823.0       2001.0  
25338                Drama, War            6.4     366.0       1983.0  


In [4]:
df_filtered = df_filtered.drop(columns=['id'])

# Display the shape of the filtered dataset
print(f"Dataset shape after filtering: {df_filtered.shape}")

# Display the first few rows of the filtered dataset
print(df_filtered.head())

Dataset shape after filtering: (717077, 6)
                             title   type                    genres  \
5500              Tötet nicht mehr  movie             Action, Crime   
8526        La tierra de los toros  movie                       NaN   
8779                 Dama de noche  movie   Drama, Mystery, Romance   
24375               Kate & Leopold  movie  Comedy, Fantasy, Romance   
25338  Another Time, Another Place  movie                Drama, War   

       averageRating  numVotes  releaseYear  
5500             NaN       NaN       2019.0  
8526             5.4      17.0       2000.0  
8779             6.3      31.0       1993.0  
24375            6.4   90823.0       2001.0  
25338            6.4     366.0       1983.0  


In [5]:
from sklearn.preprocessing import MultiLabelBinarizer

# Ensure 'genres' column is properly processed (handle missing or malformed data)
df_filtered['genres'] = df_filtered['genres'].fillna('')  # Fill missing genres with an empty string

# Split genres by comma and apply MultiLabelBinarizer
genres_split = df_filtered['genres'].str.split(',')
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(genres_split)

# Create a DataFrame for the encoded genres
genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)

# Merge the genre columns back into the filtered dataset
df_processed = pd.concat([df_filtered, genre_df], axis=1)

# Display the updated DataFrame
print(df_processed.head())


                             title   type                    genres  \
5500              Tötet nicht mehr  movie             Action, Crime   
8526        La tierra de los toros  movie                             
8779                 Dama de noche  movie   Drama, Mystery, Romance   
24375               Kate & Leopold  movie  Comedy, Fantasy, Romance   
25338  Another Time, Another Place  movie                Drama, War   

       averageRating  numVotes  releaseYear        Adult   Adventure  \
5500             NaN       NaN       2019.0  0.0     0.0         0.0   
8526             5.4      17.0       2000.0  0.0     0.0         0.0   
8779             6.3      31.0       1993.0  0.0     0.0         0.0   
24375            6.4   90823.0       2001.0  0.0     0.0         0.0   
25338            6.4     366.0       1983.0  0.0     0.0         0.0   

        Animation  ...  News  Reality-TV  Romance  Sci-Fi  Short  Sport  \
5500          0.0  ...   0.0         0.0      0.0     0.0    0.0 

In [6]:
import random

# Function to filter by user-selected criteria or return random movies or most popular
def filter_movies(df, rating_range=None, type_filter=None, most_popular=False, num_suggestions=5, 
                  selected_genres=None, selected_years=None, selection_mode='default'):
    # Check for selection mode and process accordingly
    if selection_mode == 'random':
        # Randomly select movies based on applied filters
        if not any([rating_range, type_filter, selected_genres, selected_years]):
            # If no filters are applied, return purely random movies
            df = df.sample(n=num_suggestions, random_state=random.randint(1, 1000))  # Randomly select 'num_suggestions' movies
        else:
            # If filters are applied, use random selection from the filtered dataset
            if rating_range:
                rating_min = int(rating_range[0])
                df = df[df['averageRating'] >= rating_min]
            if type_filter:
                df = df[df['type'] == type_filter]
            if selected_genres:
                for genre in selected_genres:
                    df = df[df['genres'].str.contains(genre, case=False, na=False)]
            if selected_years:
                df = df[df['releaseYear'].isin(selected_years)]
            df = df.sample(n=num_suggestions, random_state=random.randint(1, 1000))  # Randomly select 'num_suggestions' from filtered results
    
    elif selection_mode == 'most_popular':
        # Filter by Most Popular (Top-voted based on numVotes)
        if not any([rating_range, type_filter, selected_genres, selected_years]):
            df = df.sort_values(by='numVotes', ascending=False).head(num_suggestions)
        else:
            # Apply the user's filters and sort by popularity
            if rating_range:
                rating_min = int(rating_range[0])
                df = df[df['averageRating'] >= rating_min]
            if type_filter:
                df = df[df['type'] == type_filter]
            if selected_genres:
                for genre in selected_genres:
                    df = df[df['genres'].str.contains(genre, case=False, na=False)]
            if selected_years:
                df = df[df['releaseYear'].isin(selected_years)]
            df = df.sort_values(by='numVotes', ascending=False).head(num_suggestions)
    
    else:  # Default behavior (no sorting by popularity)
        # Apply filters and return results without popularity-based sorting
        if rating_range:
            rating_min = int(rating_range[0])
            df = df[df['averageRating'] >= rating_min]
        
        if type_filter:
            df = df[df['type'] == type_filter]
        
        if selected_genres:
            for genre in selected_genres:
                df = df[df['genres'].str.contains(genre, case=False, na=False)]
        
        if selected_years:
            df = df[df['releaseYear'].isin(selected_years)]
        
        df = df.head(num_suggestions)
    
    return df

# Example of using the filter function
rating_range = "7+"  # User selects 7+ rating
type_filter = "movie"  # User selects 'movie'
most_popular = True  # User selects 'most popular'
num_suggestions = 5  # User selects top 5 suggestions
selected_genres = ['Action', 'Comedy']  # User selects multiple genres
selected_years = [2010, 2015, 2020]  # User selects multiple years
selection_mode = 'random'  # User selects random mode (can be 'random', 'most_popular', or 'default')

# Apply the filter with random mode
df_filtered_suggestions = filter_movies(df_processed, rating_range, type_filter, most_popular, num_suggestions, 
                                         selected_genres, selected_years, selection_mode)

# Display the filtered suggestions
print(df_filtered_suggestions[['title', 'averageRating', 'numVotes', 'genres', 'releaseYear']])


                                 title  averageRating  numVotes  \
502807       Aama Bhitare Kichhi Achhi            7.2      21.0   
802697                        Safe Bet            8.0      10.0   
799748               Bajrangi Bhaijaan            8.1  101701.0   
857750  Cast me! Die Show ihres Lebens            7.6      54.0   
500864                   The Mummerman            8.6      35.0   

                           genres  releaseYear  
502807      Action, Comedy, Drama       2010.0  
802697             Action, Comedy       2015.0  
799748  Action, Adventure, Comedy       2015.0  
857750             Action, Comedy       2015.0  
500864      Action, Comedy, Crime       2010.0  


In [7]:
pip freeze > requirements.txt


Note: you may need to restart the kernel to use updated packages.
