In [1]:
import pandas as pd
import gzip
import numpy as np
import sqlite3

# Unzip and read the CSV and TSV files
with gzip.open("zippedData/bom.movie_gross.csv.gz", 'rt', encoding='ISO-8859-1') as f:
    df_bom_movie_gross = pd.read_csv(f)

with gzip.open("zippedData/rt.movie_info.tsv.gz", 'rt', encoding='ISO-8859-1') as f:
    df_rt_movie_info = pd.read_csv(f, sep='\t')

with gzip.open("zippedData/rt.reviews.tsv.gz", 'rt', encoding='ISO-8859-1') as f:
    df_rt_reviews = pd.read_csv(f, sep='\t')

with gzip.open("zippedData/tmdb.movies.csv.gz", 'rt', encoding='ISO-8859-1') as f:
    df_tmdb_movies = pd.read_csv(f)

with gzip.open("zippedData/tn.movie_budgets.csv.gz", 'rt', encoding='ISO-8859-1') as f:
    df_tn_movie_budgets = pd.read_csv(f)

conn = sqlite3.connect("zippedData/im.db")
query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = conn.execute(query).fetchall()

## Business Problem

In the history of the cinematography industry, remarkable movies such as 'Avatar' (2009), 'Avengers: Endgame' (2019), 'Titanic' (1997), among others, have achieved tremendous success, captivating global audiences and redefining filmmaking. However, in the last years and amplified by the impact of the COVID-19 pandemic, the industry has faced significant disruptions due to the surge in popular streaming services like Netflix, Amazon Prime Video, Disney+, Tubi (which is free), Hulu, and others. As a result, producing a movie that attracts audiences to cinemas has become more challenging than ever before. Creating a successful film today requires unprecedented rigor and adaptation to the changing landscape of movie consumption.

In [2]:
df_bom_movie_gross.year

0       2010
1       2010
2       2010
3       2010
4       2010
        ... 
3382    2018
3383    2018
3384    2018
3385    2018
3386    2018
Name: year, Length: 3387, dtype: int64

In [3]:
movie_basics = pd.read_sql_query("SELECT * FROM movie_basics", conn) # core
directors = pd.read_sql_query("SELECT * FROM directors", conn) # core
known_for = pd.read_sql_query("SELECT * FROM known_for", conn)
movie_akas = pd.read_sql_query("SELECT * FROM movie_akas", conn)
movie_ratings = pd.read_sql_query("SELECT * FROM movie_ratings", conn) # core
persons = pd.read_sql_query("SELECT * FROM persons", conn) # core
principals = pd.read_sql_query("SELECT * FROM principals", conn)
writers = pd.read_sql_query("SELECT * FROM writers", conn)

In [19]:
# Merging directors dataframe with persons dataframe to get director names.

directors_with_names = directors.merge(persons[['person_id', 'primary_name']], on='person_id', how='left')
directors_with_names = directors_with_names.rename(columns={'primary_name': 'directed_by'})

# Merging movie basics dataframe with director names.

master_df = movie_basics.drop(columns='original_title').rename(columns={'primary_title': 'title'})
master_df = master_df.merge(directors_with_names[['movie_id', 'person_id', 'directed_by']], on='movie_id', how='left')
master_df = master_df.drop_duplicates(subset='title')
master_df.head()

# Merging master dataframe with movie ratings on movie_id.

master_df = master_df.merge(movie_ratings[['movie_id', 'averagerating', 'numvotes']], on='movie_id', how='left')
master_df.head()

# Normalizing titles

master_df['title'] = master_df['title'].apply(lambda title: title.lower().strip())
df_bom_movie_gross['title'] = df_bom_movie_gross['title'].apply(lambda title: title.lower().strip())
df_bom_movie_gross['domestic_gross'] = df_bom_movie_gross['domestic_gross'].apply(
    lambda gross: float(str(gross).strip().replace(',', ''))
)
df_bom_movie_gross['foreign_gross'] = df_bom_movie_gross['foreign_gross'].apply(
    lambda gross: float(str(gross).strip().replace(',', ''))
)


# Merging master dataframe with box office dataframe.

master_df = master_df.merge(df_bom_movie_gross[['title', 'studio', 'domestic_gross', 'foreign_gross']], on='title', how='left')

# Converting gross values to floats.

master_df['domestic_gross'] = master_df['domestic_gross'].apply(lambda x: float(str(x).strip().replace(',', '')))
master_df['foreign_gross'] = master_df['foreign_gross'].apply(lambda x: float(str(x).strip().replace(',', '')))

# Prepping movie budgets dataframe to merge with master dataframe.

budgets = df_tn_movie_budgets.drop(columns=['production_budget', 'domestic_gross', 'worldwide_gross']).copy()

budgets[['production_budget', 'domestic_gross', 'worldwide_gross']] = df_tn_movie_budgets[
    ['production_budget', 'domestic_gross', 'worldwide_gross']
].applymap(
    lambda x: x.strip().replace('$', '').replace(',', '')
).astype('float').values

budgets['foreign_gross'] = budgets['worldwide_gross'] - budgets['domestic_gross']
budgets = budgets.drop(columns='worldwide_gross')
budgets = budgets.rename(columns={'movie': 'title'})
budgets['title'] = budgets['title'].apply(lambda title: title.lower().strip())

budgets = budgets.merge(df_bom_movie_gross[['title', 'studio', 'domestic_gross', 'foreign_gross']], on=['title', 'domestic_gross', 'foreign_gross'], how='left')

# Merging master dataframe with budgets dataframe
master_df

# domestic_growth_ = master_df['domestic_gross'].fillna(0)
# foreign_growth_ = master_df['foreign_gross'].fillna(0)
# master_df['total_gross'] = domestic_growth_ + foreign_growth_
# master_df.total_gross.value_counts()
# master_df['total_gross'] = master_df['total_gross'].replace(0, np.nan)
# master_df_ = master_df.dropna(subset=['total_gross', 'averagerating', 'total_profit'])
# master_df_

Unnamed: 0,movie_id,title,start_year,runtime_minutes,genres,person_id,directed_by,averagerating,numvotes,studio,domestic_gross,foreign_gross
0,tt0063540,sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,Harnam Singh Rawail,7.0,77.0,,,
1,tt0066787,one day before the rainy season,2019,114.0,"Biography,Drama",nm0002411,Mani Kaul,7.2,43.0,,,
2,tt0069049,the other side of the wind,2018,122.0,Drama,nm0000080,Orson Welles,6.9,4517.0,,,
3,tt0069204,sabse bada sukh,2018,,"Comedy,Drama",nm0611531,Hrishikesh Mukherjee,6.1,13.0,,,
4,tt0100275,the wandering soap opera,2017,80.0,"Comedy,Drama,Fantasy",nm0765384,Valeria Sarmiento,6.5,119.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
136067,tt9916538,kuambil lagi hatiku,2019,123.0,Drama,nm8185151,Azhar Kinoi Lubis,,,,,
136068,tt9916622,rodolpho teóphilo - o legado de um pioneiro,2015,,Documentary,nm9272490,Angela Gurgel,,,,,
136069,tt9916706,dankyavar danka,2013,,Comedy,nm7764440,Kanchan Nayak,,,,,
136070,tt9916730,6 gunn,2017,116.0,,nm10538612,Kiran Gawade,,,,,


In [5]:
b = budgets.set_index('title')
m = master_df.set_index('title')

In [6]:
pd.concat([b, m], axis=0).reset_index()

Unnamed: 0,title,id,release_date,production_budget,domestic_gross,foreign_gross,studio,movie_id,start_year,runtime_minutes,genres,person_id,directed_by,averagerating,numvotes
0,avatar,1.0,"Dec 18, 2009",425000000.0,760507625.0,2.015838e+09,,,,,,,,,
1,pirates of the caribbean: on stranger tides,2.0,"May 20, 2011",410600000.0,241063875.0,8.046000e+08,,,,,,,,,
2,dark phoenix,3.0,"Jun 7, 2019",350000000.0,42762350.0,1.070000e+08,,,,,,,,,
3,avengers: age of ultron,4.0,"May 1, 2015",330600000.0,459005868.0,9.440081e+08,,,,,,,,,
4,star wars ep. viii: the last jedi,5.0,"Dec 15, 2017",317000000.0,620181382.0,6.965404e+08,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141849,kuambil lagi hatiku,,,,,,,tt9916538,2019.0,123.0,Drama,nm8185151,Azhar Kinoi Lubis,,
141850,rodolpho teóphilo - o legado de um pioneiro,,,,,,,tt9916622,2015.0,,Documentary,nm9272490,Angela Gurgel,,
141851,dankyavar danka,,,,,,,tt9916706,2013.0,,Comedy,nm7764440,Kanchan Nayak,,
141852,6 gunn,,,,,,,tt9916730,2017.0,116.0,,nm10538612,Kiran Gawade,,


In [7]:
budgets

Unnamed: 0,id,release_date,title,production_budget,domestic_gross,foreign_gross,studio
0,1,"Dec 18, 2009",avatar,425000000.0,760507625.0,2.015838e+09,
1,2,"May 20, 2011",pirates of the caribbean: on stranger tides,410600000.0,241063875.0,8.046000e+08,
2,3,"Jun 7, 2019",dark phoenix,350000000.0,42762350.0,1.070000e+08,
3,4,"May 1, 2015",avengers: age of ultron,330600000.0,459005868.0,9.440081e+08,
4,5,"Dec 15, 2017",star wars ep. viii: the last jedi,317000000.0,620181382.0,6.965404e+08,
...,...,...,...,...,...,...,...
5777,78,"Dec 31, 2018",red 11,7000.0,0.0,0.000000e+00,
5778,79,"Apr 2, 1999",following,6000.0,48482.0,1.920130e+05,
5779,80,"Jul 13, 2005",return to the land of wonders,5000.0,1338.0,0.000000e+00,
5780,81,"Sep 29, 2015",a plague so pleasant,1400.0,0.0,0.000000e+00,


In [8]:
a = master_df.copy()
a.merge(budgets[['title', 'domestic_gross', 'foreign_gross']], on=)

SyntaxError: invalid syntax (<ipython-input-8-497b6b6c36d9>, line 2)

In [None]:
budgets

Unnamed: 0,id,release_date,title,production_budget,domestic_gross,foreign_gross,studio
0,1,"Dec 18, 2009",avatar,425000000.0,760507625.0,2.015838e+09,
1,2,"May 20, 2011",pirates of the caribbean: on stranger tides,410600000.0,241063875.0,8.046000e+08,
2,3,"Jun 7, 2019",dark phoenix,350000000.0,42762350.0,1.070000e+08,
3,4,"May 1, 2015",avengers: age of ultron,330600000.0,459005868.0,9.440081e+08,
4,5,"Dec 15, 2017",star wars ep. viii: the last jedi,317000000.0,620181382.0,6.965404e+08,
...,...,...,...,...,...,...,...
5777,78,"Dec 31, 2018",red 11,7000.0,0.0,0.000000e+00,
5778,79,"Apr 2, 1999",following,6000.0,48482.0,1.920130e+05,
5779,80,"Jul 13, 2005",return to the land of wonders,5000.0,1338.0,0.000000e+00,
5780,81,"Sep 29, 2015",a plague so pleasant,1400.0,0.0,0.000000e+00,


In [None]:
['title', 'production_budget', 'domestic_gross', 'foreign_gross', 'studio']

Unnamed: 0,id,release_date,title,production_budget,domestic_gross,foreign_gross,studio
0,1,"Dec 18, 2009",avatar,425000000.0,760507625.0,2.015838e+09,
1,2,"May 20, 2011",pirates of the caribbean: on stranger tides,410600000.0,241063875.0,8.046000e+08,
2,3,"Jun 7, 2019",dark phoenix,350000000.0,42762350.0,1.070000e+08,
3,4,"May 1, 2015",avengers: age of ultron,330600000.0,459005868.0,9.440081e+08,
4,5,"Dec 15, 2017",star wars ep. viii: the last jedi,317000000.0,620181382.0,6.965404e+08,
...,...,...,...,...,...,...,...
5777,78,"Dec 31, 2018",red 11,7000.0,0.0,0.000000e+00,
5778,79,"Apr 2, 1999",following,6000.0,48482.0,1.920130e+05,
5779,80,"Jul 13, 2005",return to the land of wonders,5000.0,1338.0,0.000000e+00,
5780,81,"Sep 29, 2015",a plague so pleasant,1400.0,0.0,0.000000e+00,


In [None]:
master_df.production_budget.sum()

0.0

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
X = master_df_['averagerating'].values
y = master_df_['total_gross'].values

In [None]:
LinearRegression