In [None]:
# from tqdm.notebook import tqdm
# import pandas as pd
# 
# def load_data_with_progress(filename, chunk_size=100000):
#     """Loads a CSV file in chunks with a progress bar."""
#     chunks = []
#     with tqdm(total=100, desc=f"Reading {filename}") as pbar:
#         for chunk in pd.read_csv(filename, sep='\t', compression='gzip', chunksize=chunk_size, low_memory=False):
#             chunks.append(chunk)
#             pbar.update(1)
#     return pd.concat(chunks, ignore_index=True)
# 
# def merge_data_with_progress(dataframes, on_column):
#     """Merges multiple DataFrames with a progress bar."""
#     with tqdm(total=len(dataframes) - 1, desc="Merging DataFrames") as pbar:
#         merged_data = dataframes[0]
#         for df in dataframes[1:]:
#             merged_data = pd.merge(merged_data, df, on=on_column)
#             pbar.update(1)
#     return merged_data
# 
# # Load the IMDB data files
# title_basics = load_data_with_progress('imdb/title.basics.tsv.gz')
# title_crew = load_data_with_progress('imdb/title.crew.tsv.gz')
# name_basics = load_data_with_progress('imdb/name.basics.tsv.gz') # If needed
# title_ratings = pd.read_csv('imdb/title.ratings.tsv.gz', sep='\t', compression='gzip', low_memory=False)
# 
# title_akas = load_data_with_progress('imdb/title.akas.tsv.gz')
# title_akas.rename(columns={'titleId': 'tconst'}, inplace=True)
# 
# us_movies = title_akas[title_akas['region'] == 'US']
# # Merge the loaded data into a single DataFrame
# dataframes_to_merge = [title_basics, title_ratings, title_crew, name_basics, us_movies]
# merged_data = merge_data_with_progress(dataframes_to_merge, 'tconst')
# 
# dataframes_to_merge_titles = [title_basics, title_ratings, title_crew]
# merged_titles = merge_data_with_progress(dataframes_to_merge_titles, 'tconst')
# directors = merged_titles['directors'].str.split(',', expand=True).stack().reset_index(level=1, drop=True).rename('nconst')
# writers = merged_titles['writers'].str.split(',', expand=True).stack().reset_index(level=1, drop=True).rename('nconst')
# director_names = directors.reset_index().merge(name_basics, on='nconst')
# writer_names = writers.reset_index().merge(name_basics, on='nconst')
# merged_data = pd.merge(merged_titles, us_movies, on='tconst', how='inner')

from tqdm.notebook import tqdm
import pandas as pd

def load_data_with_progress(filename, chunk_size=100000):
    """Read a gzip-compressed, tab-separated file in chunks with a progress bar.

    Args:
        filename: Path of the file; passed straight to ``pd.read_csv``.
        chunk_size: Number of rows read per chunk.

    Returns:
        One DataFrame holding every chunk, concatenated with a fresh index.
    """
    chunk_reader = pd.read_csv(
        filename,
        sep='\t',
        compression='gzip',
        chunksize=chunk_size,
        low_memory=False,
    )
    # The number of chunks is not known before the file has been read, so the
    # bar is given no total and shows a running chunk count instead of a
    # percentage measured against an arbitrary 100.
    chunks = list(tqdm(chunk_reader, desc=f"Reading {filename}", unit="chunk"))
    return pd.concat(chunks, ignore_index=True)

def merge_data_with_progress(dataframes, on_column):
    """Merge DataFrames left to right on one column, ticking a progress bar.

    Args:
        dataframes: Sequence of DataFrames; the first is the starting point
            and each following frame is merged into the running result.
        on_column: Column name handed to ``pd.merge`` as ``on``.

    Returns:
        The result of the final merge (the first frame itself when the
        sequence has a single element).
    """
    merge_count = len(dataframes) - 1
    progress = tqdm(total=merge_count, desc="Merging DataFrames")
    with progress:
        result = dataframes[0]
        for right_frame in dataframes[1:]:
            # Only `on` is passed, so pd.merge's default join type applies.
            result = pd.merge(result, right_frame, on=on_column)
            progress.update(1)
    return result

# Load the IMDB data files. name_basics is loaded here but is not part of the
# merge below.
title_basics = load_data_with_progress('imdb/title.basics.tsv.gz')
title_crew = load_data_with_progress('imdb/title.crew.tsv.gz')
name_basics = load_data_with_progress('imdb/name.basics.tsv.gz')
title_ratings = pd.read_csv('imdb/title.ratings.tsv.gz', sep='\t', compression='gzip', low_memory=False)

# Merge the title-related DataFrames
dataframes_to_merge_titles = [title_basics, title_ratings, title_crew]
merged_titles = merge_data_with_progress(dataframes_to_merge_titles, 'tconst')

# Build the working frame from the merge result, replacing the '\N'
# placeholder with a missing value. Without this assignment `merged_data`
# is undefined on a fresh kernel.
# NOTE(review): the commented-out draft of this cell also joined US-region
# rows from title.akas; that join is not reproduced here - confirm whether
# it is still wanted.
merged_data = merged_titles.replace('\\N', pd.NA)

# Convert columns to numeric; values that cannot be parsed become NaN
merged_data = merged_data.assign(
    startYear=pd.to_numeric(merged_data['startYear'], errors='coerce'),
    endYear=pd.to_numeric(merged_data['endYear'], errors='coerce'),
    runtimeMinutes=pd.to_numeric(merged_data['runtimeMinutes'], errors='coerce'),
)

# Filter adult content. The flag is compared numerically so the filter works
# whether read_csv parsed the column as integers or as strings; comparing an
# integer column with '0' would match no rows at all.
merged_data = merged_data[pd.to_numeric(merged_data['isAdult'], errors='coerce') == 0]

# Drop rows with a missing startYear
merged_data = merged_data.dropna(subset=['startYear'])

# Fill the remaining gaps. New columns are assigned rather than filled in
# place, because column-level fillna(inplace=True) on a filtered frame is
# chained assignment.
# Note: the 'Unknown' placeholder turns endYear into a mixed numeric/string column.
merged_data = merged_data.assign(
    endYear=merged_data['endYear'].fillna('Unknown'),
    runtimeMinutes=merged_data['runtimeMinutes'].fillna(merged_data['runtimeMinutes'].median()),
    genres=merged_data['genres'].fillna('Unknown'),
    directors=merged_data['directors'].fillna('Unknown'),
    writers=merged_data['writers'].fillna('Unknown'),
)

from sklearn.preprocessing import StandardScaler
# Imported here because KMeans is used in this step; without it the step
# raises NameError on a fresh kernel.
from sklearn.cluster import KMeans

# Select the numerical features
numerical_features = merged_data[['averageRating', 'numVotes']]  # Add more features as needed

# One-hot encode the raw 'genres' string.
# NOTE(review): every distinct 'genres' value becomes its own column; if the
# values are comma-joined lists this encodes genre combinations rather than
# single genres - confirm that this is intended.
encoded_genres = pd.get_dummies(merged_data['genres'])

# Concatenate numerical and encoded categorical features
X = pd.concat([numerical_features, encoded_genres], axis=1)

# Scale the features so each column has comparable weight in the distance metric
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit K-means on the scaled features
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_scaled)

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Make sure 'runtimeMinutes' is numeric; values that cannot be parsed become NaN.
# assign() builds a new frame, which avoids writing into a filtered slice.
merged_data = merged_data.assign(
    runtimeMinutes=pd.to_numeric(merged_data['runtimeMinutes'], errors='coerce')
)

# Select the numerical features; missing values are replaced by 0
numerical_features = merged_data[['averageRating', 'numVotes', 'runtimeMinutes']].fillna(0)

# Split the comma-separated 'genres' string into one 0/1 indicator column per
# genre. The result is already fully encoded, so no further get_dummies pass
# is applied to it.
genres_split = merged_data['genres'].str.get_dummies(sep=',')
encoded_genres = genres_split

# Concatenate numerical and encoded categorical features
X = pd.concat([numerical_features, encoded_genres], axis=1)

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform K-means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_scaled)

# Attach the cluster label of every row
merged_data = merged_data.assign(cluster=kmeans.labels_)

# Analyze clusters - print the mean of each numeric feature per cluster.
# The loop bound is read from the fitted model so it cannot drift from n_clusters.
for i in range(kmeans.n_clusters):
    print(merged_data.loc[merged_data['cluster'] == i, ['averageRating', 'numVotes', 'runtimeMinutes']].mean())

# Example recommendation function
def recommend_movie(movie_title, top_n=5, data=None):
    """Recommend the best-rated titles from the same cluster as a given title.

    Args:
        movie_title: Exact ``primaryTitle`` value to look up.
        top_n: Maximum number of recommendations to return.
        data: DataFrame to search. It must provide the columns
            ``primaryTitle``, ``genres``, ``averageRating``, ``numVotes`` and
            ``cluster``. Defaults to the notebook-level ``merged_data``.

    Returns:
        A list of dicts with the keys ``title``, ``genre``, ``rating`` and
        ``votes``, ordered by rating and then votes, both descending. When no
        row carries the title, a message is printed and the list is empty.
    """
    frame = merged_data if data is None else data
    is_seed = frame['primaryTitle'] == movie_title

    if not is_seed.any():
        print(f"No movie found with title {movie_title}")
        return []

    # When several rows share the title, the first one decides the cluster.
    cluster = frame.loc[is_seed, 'cluster'].iloc[0]

    # Rows carrying the looked-up title are left out so that a title is never
    # recommended for itself.
    candidates = frame[(frame['cluster'] == cluster) & ~is_seed]
    top_recommendations = candidates.sort_values(
        by=['averageRating', 'numVotes'], ascending=False
    ).head(top_n)

    # Rename the selected columns to the output keys and emit one dict per row.
    return (
        top_recommendations[['primaryTitle', 'genres', 'averageRating', 'numVotes']]
        .rename(columns={
            'primaryTitle': 'title',
            'genres': 'genre',
            'averageRating': 'rating',
            'numVotes': 'votes',
        })
        .to_dict('records')
    )

# Recommendation function based on director
def recommend_by_director(director_name, top_n=5, data=None):
    """Return the best-rated titles credited to a given director.

    The ``directors`` column is treated as a comma-separated list and
    ``director_name`` has to equal one whole entry of that list (ignoring
    case and surrounding whitespace). A plain substring search would also
    return rows whose entries merely contain the query, e.g. ``nm1`` inside
    ``nm12``.

    Args:
        director_name: Director value as stored in the ``directors`` column.
            NOTE(review): presumably an identifier such as ``nm12197316``
            rather than a person's name - confirm against the data.
        top_n: Maximum number of titles to return.
        data: DataFrame to search. It must provide the columns ``directors``,
            ``primaryTitle``, ``genres``, ``averageRating`` and ``numVotes``.
            Defaults to the notebook-level ``merged_data``.

    Returns:
        A list of dicts with the keys ``title``, ``genre``, ``rating`` and
        ``votes``, ordered by rating and then votes, both descending. When
        nothing matches, a message is printed and the list is empty.
    """
    import re

    frame = merged_data if data is None else data
    wanted = str(director_name).strip()

    # Rows whose director is the 'Unknown' placeholder never match.
    known_directors = frame[frame['directors'] != 'Unknown']

    # Match only a complete list entry: bounded by the string start/end or a comma.
    whole_entry = rf"(?:^|,)\s*{re.escape(wanted)}\s*(?:,|$)"
    filtered_movies = known_directors[
        known_directors['directors'].astype(str).str.contains(
            whole_entry, case=False, regex=True, na=False
        )
    ]

    if filtered_movies.empty:
        print(f"No movies found directed by {director_name}")
        return []

    top_recommendations = filtered_movies.sort_values(
        by=['averageRating', 'numVotes'], ascending=False
    ).head(top_n)

    # Rename the selected columns to the output keys and emit one dict per row.
    return (
        top_recommendations[['primaryTitle', 'genres', 'averageRating', 'numVotes']]
        .rename(columns={
            'primaryTitle': 'title',
            'genres': 'genre',
            'averageRating': 'rating',
            'numVotes': 'votes',
        })
        .to_dict('records')
    )


# Recommendation function based on genre
def recommend_by_genre(genre, top_n=5, data=None):
    """Return the best-rated titles that list a given genre.

    The ``genres`` column is treated as a comma-separated list and ``genre``
    has to equal one whole entry of that list (ignoring case and surrounding
    whitespace). A plain substring search would also return titles whose
    genre merely contains the query, e.g. ``Music`` inside ``Musical``.

    Args:
        genre: Genre to look for.
        top_n: Maximum number of titles to return.
        data: DataFrame to search. It must provide the columns ``genres``,
            ``primaryTitle``, ``averageRating`` and ``numVotes``. Defaults to
            the notebook-level ``merged_data``.

    Returns:
        A list of dicts with the keys ``title``, ``genre``, ``rating`` and
        ``votes``, ordered by rating and then votes, both descending. When
        nothing matches, a message is printed and the list is empty.
    """
    import re

    frame = merged_data if data is None else data
    wanted = str(genre).strip()

    # Match only a complete list entry: bounded by the string start/end or a comma.
    whole_entry = rf"(?:^|,)\s*{re.escape(wanted)}\s*(?:,|$)"
    filtered_movies = frame[
        frame['genres'].str.contains(whole_entry, case=False, regex=True, na=False)
    ]

    if filtered_movies.empty:
        print(f"No movies found in genre {genre}")
        return []

    top_recommendations = filtered_movies.sort_values(
        by=['averageRating', 'numVotes'], ascending=False
    ).head(top_n)

    # Rename the selected columns to the output keys and emit one dict per row.
    return (
        top_recommendations[['primaryTitle', 'genres', 'averageRating', 'numVotes']]
        .rename(columns={
            'primaryTitle': 'title',
            'genres': 'genre',
            'averageRating': 'rating',
            'numVotes': 'votes',
        })
        .to_dict('records')
    )





## Preprocessing and Merging

In [None]:
import matplotlib.pyplot as plt

# Histogram of the averageRating column, split into 20 bins.
fig, ax = plt.subplots()
merged_data['averageRating'].hist(bins=20, ax=ax)
ax.set_title('Distribution of Average Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# One point per row: average rating on the x axis, vote count on the y axis.
fig, ax = plt.subplots()
ax.scatter(merged_data['averageRating'], merged_data['numVotes'])
ax.set_title('Scatter plot of Average Ratings vs Number of Votes')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Number of Votes')
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Keep the int64 / float64 columns only; other dtypes are left out of the matrix.
numerical_data = merged_data.select_dtypes(include=['int64', 'float64'])

# Pairwise correlation between those columns
correlation_matrix = numerical_data.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots()
sns.heatmap(correlation_matrix, annot=True, ax=ax)
plt.show()


In [None]:
# Count individual genres. 'genres' is handled as a comma-separated list
# elsewhere in this notebook, so each title is counted once for every genre it
# lists; counting the raw strings would chart genre combinations instead.
genre_counts = merged_data['genres'].str.split(',').explode().value_counts()

fig, ax = plt.subplots()
genre_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Genres')
ax.set_xlabel('Genre')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# Numeric inputs, taken as they are
numerical_features = merged_data.loc[:, ['averageRating', 'numVotes']]

# One indicator column per distinct value of the raw 'genres' string
encoded_genres = pd.get_dummies(merged_data['genres'])

# Feature matrix: numeric columns followed by the genre indicators
X = pd.concat([numerical_features, encoded_genres], axis='columns')

# Standardize every feature column
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# K-means with 5 clusters and a fixed seed, fitted on the scaled features
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_scaled)


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Coerce 'runtimeMinutes' to numbers; anything unparseable turns into NaN
merged_data['runtimeMinutes'] = merged_data['runtimeMinutes'].pipe(pd.to_numeric, errors='coerce')

# Numeric inputs, with missing values replaced by 0
numerical_features = merged_data.loc[:, ['averageRating', 'numVotes', 'runtimeMinutes']].fillna(0)

# One 0/1 column per individual genre in the comma-separated 'genres' string
genres_split = merged_data['genres'].str.get_dummies(sep=',')
# NOTE(review): genres_split already consists of indicator columns, so this
# second encoding pass looks like a pass-through - confirm before removing.
encoded_genres = pd.get_dummies(genres_split)

# Feature matrix: numeric columns followed by the genre indicators
X = pd.concat([numerical_features, encoded_genres], axis='columns')

# Standardize every feature column
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# K-means with 5 clusters and a fixed seed, fitted on the scaled features
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_scaled)


In [None]:
# Show the recommendations for one genre; the returned list is the cell output.
recommend_by_genre(genre="Action")

# Other calls to try:
# recommend_movie("Blue Shadow Virus")
# recommend_by_director("nm12197316")
#
# Most frequent entries of the 'directors' column (top 50):
# top_directors = merged_data['directors'].str.split(', ', expand=True).stack().value_counts().head(50)
# print(top_directors)
# merged_data.columns

In [None]:
all_movie_titles = merged_data['primaryTitle'].unique().tolist()

# Print a bounded preview instead of every title, so the cell output stays
# readable; the complete list remains available in all_movie_titles.
TITLE_PREVIEW_LIMIT = 50
print(f"{len(all_movie_titles)} unique titles; showing up to {TITLE_PREVIEW_LIMIT}:")
for title in all_movie_titles[:TITLE_PREVIEW_LIMIT]:
    print(title)