In [49]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# load dataset

In [144]:
# Read both TMDB 5000 CSV exports from ../data: movie metadata first, credits second.
movie_df, credits_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/tmdb_5000_movies.csv', '../data/tmdb_5000_credits.csv')
)

In [145]:
# Join every movie with its credits row: the key is `id` on the movie side
# and `movie_id` on the credits side.
df = movie_df.merge(credits_df, how='inner', left_on='id', right_on='movie_id')

# data exploration

In [146]:
# Quick structural overview of the merged frame.
shape = df.shape          # (rows, columns)
data_types = df.dtypes    # dtype of every column

# Summary statistics as produced by DataFrame.describe().
descriptive_stats = df.describe()

# Per-column unique counts (df.nunique()) are currently disabled.

# Number of null entries in each column; displayed as this cell's output.
missing_values = df.isna().sum()
missing_values

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title_x                    0
vote_average               0
vote_count                 0
movie_id                   0
title_y                    0
cast                       0
crew                       0
dtype: int64

In [147]:
# Display the shape, dtypes, null counts and descriptive statistics together
# as one tuple (unique counts are not included).
(
    shape,
    data_types,
    missing_values,
    descriptive_stats,
)

((4803, 24),
 budget                    int64
 genres                   object
 homepage                 object
 id                        int64
 keywords                 object
 original_language        object
 original_title           object
 overview                 object
 popularity              float64
 production_companies     object
 production_countries     object
 release_date             object
 revenue                   int64
 runtime                 float64
 spoken_languages         object
 status                   object
 tagline                  object
 title_x                  object
 vote_average            float64
 vote_count                int64
 movie_id                  int64
 title_y                  object
 cast                     object
 crew                     object
 dtype: object,
 budget                     0
 genres                     0
 homepage                3091
 id                         0
 keywords                   0
 original_language          0

In [149]:
# List the column labels of the merged frame.
df.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title_x', 'vote_average',
       'vote_count', 'movie_id', 'title_y', 'cast', 'crew'],
      dtype='object')

# Data Cleaning

In [150]:
# Discard the columns listed below, then remove fully duplicated rows.
unused_columns = ['original_title', 'tagline', 'spoken_languages', 'status', 'movie_id']
df = df.drop(columns=unused_columns).drop_duplicates()


Dealing with JSON Fields

In [None]:
def _names_from_json(raw):
    """Return the 'name' of every object in a JSON-encoded list.

    Parameters
    ----------
    raw : str or list
        A JSON string encoding a list of objects that each have a 'name' key,
        or a list produced by a previous run of this cell.

    Returns
    -------
    list
        The 'name' values in their original order. A value that is already a
        list is returned unchanged, so this cell can be re-run after the
        columns have been parsed without json.loads raising a TypeError.
    """
    if isinstance(raw, list):
        return raw
    return [entry['name'] for entry in json.loads(raw)]


# Columns whose cells are parsed as JSON lists of objects carrying a 'name' key.
JSON_NAME_COLUMNS = ['genres', 'keywords', 'production_companies', 'production_countries']

# Replace each JSON string with the plain list of names it contains.
for column in JSON_NAME_COLUMNS:
    df[column] = df[column].apply(_names_from_json)

Standardize text data, parse dates, and normalize popularity

In [155]:
# Parse the release dates into pandas datetimes.
df['release_date'] = pd.to_datetime(df['release_date'])

# Lowercase every title.
df['title_x'] = df['title_x'].str.lower()

# Min-max scale popularity: subtract the minimum, then divide by (max - min).
popularity_min = df['popularity'].min()
popularity_span = df['popularity'].max() - popularity_min
df['popularity'] = (df['popularity'] - popularity_min) / popularity_span

Handling missing values

In [156]:
# NOTE(review): this import would normally live in the notebook's import cell.
from sklearn.impute import SimpleImputer

# Median imputer, used for the numeric `runtime` column below.
imputer_med = SimpleImputer(strategy='median')

# `release_date` is filled with pandas rather than SimpleImputer: the imputer's
# median strategy is meant for numeric data, while this column has already been
# converted with pd.to_datetime. Filling with the Series median keeps the
# column's datetime dtype.
df['release_date'] = df['release_date'].fillna(df['release_date'].median())

# fit_transform returns a 2-D (n_samples, 1) array; flatten it to one value per row.
df['runtime'] = imputer_med.fit_transform(df[['runtime']]).ravel()

Replacing missing overviews

In [157]:
# Hand-written overview texts, keyed by the value of the `id` column.
manual_overviews = {
    370980: 'Following the rise of father Jorge Mario Bergoglio from his early life as a teacher in a Jesuit High School in Argentina, to archbishop and cardinal of Buenos Aires, until he was elected Pope of the Roman Catholic Church.',
    459488: 'An exploration of how singer and actor Frank Sinatra became one of the biggest stars of the 20th century while remaining, in his heart, a normal person.',
}
for movie_id, overview_text in manual_overviews.items():
    df.loc[df['id'] == movie_id, 'overview'] = overview_text

# Exclude the row whose id is 292539 from the frame.
keep_rows = df['id'] != 292539
df = df[keep_rows]

In [159]:
def has_http(value):
    """Return 1 when `value` is a string containing 'http', otherwise 0."""
    return int(isinstance(value, str) and 'http' in value)

# Run has_http over every 'homepage' value and store the resulting 1/0 flag
# in a new 'has_homepage' column.
df.loc[:, 'has_homepage'] = df['homepage'].apply(has_http)