In [5]:
import pandas as pd

# Load the data
movies = pd.read_csv('../Datasets/movie.csv')
links = pd.read_csv('../Datasets/link.csv')

# Merge the data based on movieId
merged_data = pd.merge(movies, links, on='movieId')

# Drop the unnecessary columns
merged_data = merged_data.drop(['title'], axis=1)

# Rename the columns
merged_data = merged_data.rename(columns={'movieId': 'movie_id', 'imdbId': 'imdb_id', 'tmdbId': 'tmdb_id'})

# Save the preprocessed data
merged_data.to_csv('preprocessed_data.csv', index=False)

In [6]:
merged_data

Unnamed: 0,movie_id,genres,imdb_id,tmdb_id
0,1,Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Adventure|Children|Fantasy,113497,8844.0
2,3,Comedy|Romance,113228,15602.0
3,4,Comedy|Drama|Romance,114885,31357.0
4,5,Comedy,113041,11862.0
...,...,...,...,...
27273,131254,Comedy,466713,4436.0
27274,131256,Comedy,277703,9274.0
27275,131258,Adventure,3485166,285213.0
27276,131260,(no genres listed),249110,32099.0


In [3]:
import pandas as pd
import re

# Load the movie.csv and link.csv files into pandas dataframes
movie_df = pd.read_csv('../Datasets/movie.csv')
link_df = pd.read_csv('../Datasets/link.csv')

# Merge the two dataframes based on the movieId column
merged_df = pd.merge(movie_df, link_df, on='movieId')

# Define a regular expression pattern to remove unwanted characters from the title column
pattern = re.compile('[^a-zA-Z0-9 ]')

# Define a function to clean and preprocess the movie titles
def clean_title(title):
    # Remove unwanted characters from the title
    cleaned_title = pattern.sub('', title)
    # Replace multiple spaces with a single space
    cleaned_title = re.sub(' +', ' ', cleaned_title)
    # Capitalize the first letter of each word in the title
    cleaned_title = ' '.join(word.capitalize() for word in cleaned_title.split())
    return cleaned_title

# Apply the clean_title function to the title column of the merged dataframe
merged_df['title'] = merged_df['title'].apply(clean_title)

# Save the preprocessed dataframe as a new CSV file
merged_df.to_csv('preprocessed_movie_data.csv', index=False)

In [4]:
merged_df

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story 1995,Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Jumanji 1995,Adventure|Children|Fantasy,113497,8844.0
2,3,Grumpier Old Men 1995,Comedy|Romance,113228,15602.0
3,4,Waiting To Exhale 1995,Comedy|Drama|Romance,114885,31357.0
4,5,Father Of The Bride Part Ii 1995,Comedy,113041,11862.0
...,...,...,...,...,...
27273,131254,Kein Bund Frs Leben 2007,Comedy,466713,4436.0
27274,131256,Feuer Eis Dosenbier 2002,Comedy,277703,9274.0
27275,131258,The Pirates 2014,Adventure,3485166,285213.0
27276,131260,Rentun Ruusu 2001,(no genres listed),249110,32099.0


In [None]:
import pandas as pd
import requests

# Load the preprocessed data
merged_data = pd.read_csv('preprocessed_data.csv')

# Set up the API key and parameters
api_key = 'c06cede91056be933f6b25e34556347f'

# Define a function to fetch movie details from the TMDB API
def fetch_movie_details(tmdb_id):
    # Make the API request
    response = requests.get(f'https://api.themoviedb.org/3/movie/{tmdb_id}?api_key={api_key}')
    # Parse the JSON response
    movie_data = response.json()
    # Extract the relevant fields
    title = movie_data.get('original_title', '')
    rating = movie_data.get('vote_average', 0)
    user_score = movie_data.get('popularity', 0)
    overview = movie_data.get('overview', '')
    release_date = movie_data.get('release_date', '')
    poster_path = movie_data.get('poster_path', '')
    backdrop_path = movie_data.get('backdrop_path', '')
    runtime = movie_data.get('runtime', 0)
    tagline = movie_data.get('tagline', '')
    # Return the movie details as a dictionary
    return {
        'title': title,
        'rating': rating,
        'user_score': user_score,
        'overview': overview,
        'release_date': release_date,
        'poster_path': poster_path,
        'backdrop_path': backdrop_path,
        'runtime': runtime,
        'tagline': tagline,
    }

# Define a list to store the fetched movie details
movie_details = []

# Loop through the tmdb_id column in the merged_data dataframe
for tmdb_id in merged_data['tmdb_id']:
    # Fetch the movie details for the current tmdb_id
    movie_data = fetch_movie_details(tmdb_id)
    # Append the fetched movie details to the movie_details list
    movie_details.append(movie_data)

# Convert the movie_details list to a dataframe
movie_df = pd.DataFrame(movie_details)

# Merge the movie_df dataframe with the merged_data dataframe
merged_data = pd.merge(merged_data, movie_df, on='tmdb_id')