## Data Understanding

In [None]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Data Sources


[**IMDb**](https://www.imdb.com) - 'Data/im.db' - (8 tables)

SQL database containing movie info and cast & crew details

In [None]:
# IMDb - open the SQLite database and list the tables recorded in sqlite_master
conn = sqlite3.connect('../Data/im.db')
pd.read_sql(
    sql="""                        
SELECT * FROM sqlite_master
WHERE type='table' 
""",
    con=conn,
)

[**The Numbers**](https://www.the-numbers.com) - 'Data/tn.movie_budgets.csv.gz' - (5782 rows x 6 cols)

production budget, domestic/worldwide gross revenues

In [None]:
# The Numbers - column names, dtypes and non-null counts
(
    pd.read_csv('../Data/tn.movie_budgets.csv.gz')
    .info()
)

[**Box Office Mojo**](https://www.boxofficemojo.com) - 'Data/bom.movie_gross.csv.gz' - (3387 rows x 5 cols)

additional info on studio, gross revenue


In [None]:
# Box Office Mojo - column names, dtypes and non-null counts
(
    pd.read_csv('../Data/bom.movie_gross.csv.gz')
    .info()
)


[**The Movie DB**](https://www.themoviedb.org) - 'Data/tmdb.movies.csv.gz' - (26517 rows x 10 cols)

additional info on genre, language, votes/popularity


In [None]:
# The Movie DB - column names, dtypes and non-null counts
(
    pd.read_csv('../Data/tmdb.movies.csv.gz')
    .info()
)

[**Rotten Tomatoes**](https://www.rottentomatoes.com) - 'Data/rt.movie_info.tsv.gz' - (1560 rows x 12 cols)

synopsis, rating, runtime, etc.

In [None]:
# Rotten Tomatoes (movie info) - tab-separated file; show columns, dtypes and non-null counts
(
    pd.read_csv('../Data/rt.movie_info.tsv.gz', sep='\t')
    .info()
)


[**Rotten Tomatoes**](https://www.rottentomatoes.com) - 'Data/rt.reviews.tsv.gz' - (54432 rows x 8 cols)

additional info on reviews, ratings



In [None]:
# Rotten Tomatoes (reviews) - tab-separated file read as latin-1; show columns, dtypes and non-null counts
(
    pd.read_csv(
        '../Data/rt.reviews.tsv.gz',
        sep='\t',
        encoding='latin-1',
    )
    .info()
)

### Data Cleaning

In [None]:
# IMDb - overview of the 'movie_basics' table
# 146,144 entries
pd.read_sql(sql="SELECT * FROM movie_basics", con=conn).info()

In [None]:
# IMDb - overview of the 'movie_ratings' table
# 73,856 entries
pd.read_sql(sql="SELECT * FROM movie_ratings", con=conn).info()

In [None]:
# IMDb - build one dataframe from 'movie_basics' joined to 'movie_ratings' on movie_id
# (inner join: only movies present in both tables are kept)
# Ratings reference:
# https://help.imdb.com/article/imdb/track-movies-tv/ratings-faq/G67Y87TFYYP6TWAV?ref_=helpms_helpart_inline#

imdb_df = pd.read_sql(
    sql="""
SELECT primary_title, original_title, runtime_minutes, genres, start_year, averagerating, numvotes
FROM movie_basics 
JOIN movie_ratings
USING (movie_id)
""",
    con=conn,
)

In [None]:
# # IMDb (alternative v2, disabled) - create dataframe from 'movie_basics' only, without the ratings join

# imdb_df = pd.read_sql("""
# SELECT primary_title, original_title, runtime_minutes, genres, start_year
# FROM movie_basics 
# """, conn)

In [None]:
# The Numbers - glossary of terms: https://www.the-numbers.com/glossary
# 5,782 entries
(
    pd.read_csv('../Data/tn.movie_budgets.csv.gz')
    .info()
)

In [None]:
# The Numbers - load the budget data, parse 'release_date' as datetime
# (unparseable values become NaT) and derive a 'year' column from it
roi_df = (
    pd.read_csv('../Data/tn.movie_budgets.csv.gz')
    .assign(release_date=lambda frame: pd.to_datetime(frame['release_date'], errors='coerce'))
    .assign(year=lambda frame: frame['release_date'].dt.year)
)

# Define function to convert monetary columns to numeric
def convert_monetary_columns(df, columns):
    """Convert currency-formatted columns of ``df`` to numeric values.

    Dollar signs and comma separators are stripped from each listed column
    before parsing with ``pd.to_numeric``; anything that still cannot be
    parsed becomes NaN. Columns that are already numeric are passed straight
    to ``pd.to_numeric``, so calling the function again on converted data is
    safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to convert. It is modified in place.
    columns : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for column in columns:
        values = df[column]
        # Numeric columns have no string content to strip, and using the
        # .str accessor on them would raise AttributeError.
        if not pd.api.types.is_numeric_dtype(values):
            # Raw string: '\$' in a plain literal is an invalid escape sequence.
            values = values.astype(str).str.replace(r'[\$,]', '', regex=True)
        df[column] = pd.to_numeric(values, errors='coerce')
    return df
# Columns holding currency strings; convert each of them to numeric
monetary_columns = ['production_budget', 'domestic_gross', 'worldwide_gross']
roi_df = convert_monetary_columns(roi_df, monetary_columns)

# 'ROI' = (worldwide gross - production budget) / production budget, as a percentage
roi_df['ROI'] = (
    roi_df['worldwide_gross']
    .sub(roi_df['production_budget'])
    .div(roi_df['production_budget'])
    .mul(100)
)


In [None]:
# Merge imdb_df and roi_df, keeping only movies whose title and year match in both
movie_df = pd.merge(imdb_df, roi_df, left_on=['primary_title', 'start_year'], right_on=['movie', 'year'], how='inner')

# Drop extraneous columns
movie_df = movie_df.drop(['original_title', 'start_year', 'id', 'release_date', 'movie', 'domestic_gross'], axis=1)

# A zero production budget makes ROI +/-inf, which dropna() would keep and
# astype(int) cannot cast; treat those values as missing before dropping nulls
movie_df['ROI'] = movie_df['ROI'].replace([np.inf, -np.inf], np.nan)
movie_df = movie_df.dropna()

# Round 'runtime_minutes' to the nearest whole number and convert to integer
movie_df['runtime_minutes'] = movie_df['runtime_minutes'].round().astype(int)

# Round 'ROI' to the nearest whole number, convert to integer, then sort descending
movie_df['ROI'] = movie_df['ROI'].round().astype(int)
movie_df = movie_df.sort_values(by='ROI', ascending=False, ignore_index=True)

In [None]:
# Write the cleaned data to CSV (without the index), then load the saved file back into movie_df
movie_df.to_csv(path_or_buf='../Data/movie_clean.csv', index=False)

movie_df = pd.read_csv(filepath_or_buffer='../Data/movie_clean.csv')
movie_df

In [None]:
# Turn each comma-separated 'genres' string into a list, then give every genre its own row
# NOTE(review): movie_df['genres'] is overwritten in place, so re-running this cell
# without reloading movie_df would apply the split to lists instead of strings
movie_df['genres'] = movie_df['genres'].str.split(pat=',')
genres_df = movie_df.explode(column='genres')

In [None]:
# Write the per-genre rows to CSV (without the index), then load the saved file back into genres_df
genres_df.to_csv(path_or_buf='../Data/genres.csv', index=False)

genres_df = pd.read_csv(filepath_or_buffer='../Data/genres.csv')
genres_df