In [62]:
import sqlite3
import time
import warnings
from contextlib import closing
from pathlib import Path
from pprint import pprint as pp

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# NOTE: this silences every warning for the rest of the session
warnings.filterwarnings('ignore')

# importing movie table
movies = pd.read_csv("movie_metadata.csv")

# create revenue column
#  NOTE: gross minus budget is the profit of the film; the column keeps the
#  name "revenue" because the rest of the notebook refers to it by that name
movies["revenue"] = movies["gross"] - movies["budget"]

# create gross margin column
movies["gross_margin"] = movies["revenue"] / movies["gross"]

# success/failure column
#  defined as having a positive or negative gross margin
#  (a margin of exactly 0 counts as successful)
movies["successful"] = np.where(movies["gross_margin"] >= 0, 1, 0)

# drop na values
#  explicit copy so the column assignment below always works on an
#  independent frame and cannot raise SettingWithCopyWarning
movies = movies.dropna().copy()

# numeric content rating
#  Approved and Passed are treated as PG (2); Not Rated and Unrated become 0;
#  any rating missing from the mapping becomes null (NaN)
movies["rating_numeric"] = movies["content_rating"].map({"G":1,"TV-G":1,"PG":2,"TV-PG":2,
                                                           "GP":2,"PG-13":3,"TV-14":3,"R":4,"TV-MA":4,
                                                           "M":4,"NC-17":5,"X":5,"Not Rated":0,"Approved":2,
                                                           "Unrated":0,"Passed":2})

# read in the second dataset and keep only the imdb id and the release date
#  the file is read once, and usecols skips parsing the unused columns
movies_large = pd.read_csv("movies_metadata.csv", usecols=["imdb_id", "release_date"])
movies_large = movies_large[["imdb_id", "release_date"]]
#  this frame is used as a lookup keyed on imdb_id: rows without an id are
#  useless as lookup entries, and a repeated id would duplicate the matching
#  movie when the lookup is merged into `movies`
movies_large = movies_large.dropna(subset=["imdb_id"]).drop_duplicates(subset="imdb_id")
movies_large = movies_large.set_index("imdb_id")

In [63]:
# creating a star power score for director, actor 1, actor 2, and actor 3
#  identify how many times a name is in each column
#  find the average amount of times a name is in a column
#  divide the amount of times a name is in a column by the avg amount
def _star_power_scores(names, score_name):
    """Return a one-column frame of star power scores indexed by name.

    Parameters
    ----------
    names : pandas.Series
        Column of person names, one entry per movie.
    score_name : str
        Name given to the resulting score column.

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct names; the single column ``score_name`` holds
        each name's appearance count divided by the mean appearance count.
    """
    counts = names.value_counts()
    # work on the Series itself: the column name produced by
    # value_counts().to_frame() differs between pandas versions
    return (counts / counts.mean()).rename(score_name).to_frame()


# find director star power
directors = _star_power_scores(movies["director_name"], "director_score")

# actor 1 star power
actor1 = _star_power_scores(movies["actor_1_name"], "actor_1_score")

# actor 2 star power
actor2 = _star_power_scores(movies["actor_2_name"], "actor_2_score")

# actor 3 star power
actor3 = _star_power_scores(movies["actor_3_name"], "actor_3_score")


# attach every score to the movie rows through the matching name column
for score_frame, name_column in (
    (directors, "director_name"),
    (actor1, "actor_1_name"),
    (actor2, "actor_2_name"),
    (actor3, "actor_3_name"),
):
    movies = movies.merge(score_frame, left_on=name_column, right_index=True)


# pull the imdb code from the imdb link
#  the code is the "tt" + digits segment that follows "/title/"
movies["imdb_num_code"] = movies["movie_imdb_link"].str.extract(r"/title/(tt\d+)", expand=False)

# keep only the movies that also appear in the second dataset and bring in
# their release date
movies = movies.merge(movies_large, left_on="imdb_num_code",
                      right_index=True, how="inner")

# function to add additional definitions of success
#  may or may not be used
def success_key(x):
    """Translate a gross margin into a coarse success label.

    The thresholds are checked from highest to lowest and the first one that
    ``x`` reaches decides the label. A value below zero, or one that fails
    every comparison (such as NaN), is labelled "no success".
    """
    tiers = (
        (.4, "extreme success"),
        (.3, "high success"),
        (.2, "average success"),
        (.1, "moderate success"),
        (.0, "low success"),
    )
    return next((label for floor, label in tiers if x >= floor), "no success")
    
# label every movie with the success tier that matches its gross margin
margin_labels = movies["gross_margin"].map(success_key)
movies["success_bins"] = margin_labels
movies.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,gross_margin,successful,rating_numeric,director_score,actor_1_score,actor_2_score,actor_3_score,imdb_num_code,release_date,success_bins
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,0.688365,1,3,3.091853,1.520767,1.747604,2.066294,tt0499549,2009-12-10,extreme success
1127,Color,Stephen Sommers,106.0,106.0,208.0,855.0,Jason Flemyng,3000.0,11146409.0,Action|Adventure|Horror|Sci-Fi,...,-3.037175,0,4,1.766773,3.041534,5.825346,2.066294,tt0118956,1998-01-30,no success
1699,Color,Terrence Malick,222.0,150.0,0.0,855.0,Michael Greyeyes,23000.0,12712093.0,Biography|Drama|History|Romance,...,-1.359958,0,3,1.32508,8.364217,0.582535,2.066294,tt0402399,2005-12-25,no success
747,Color,Brian Robbins,76.0,98.0,48.0,722.0,Joel David Moore,21000.0,61112916.0,Comedy|Family|Fantasy,...,0.181842,1,2,2.65016,9.884984,1.747604,2.066294,tt0393735,2006-03-09,moderate success
510,Color,Brad Peyton,178.0,94.0,62.0,722.0,Dwayne Johnson,14000.0,103812241.0,Action|Adventure|Comedy|Family|Fantasy|Sci-Fi,...,0.239011,1,2,1.32508,2.661342,1.747604,2.066294,tt1397514,2012-01-19,average success


In [None]:
# CURRENTLY NOT WORKING....
# MEMORY LEAK AROUND LINE 2800

# retrieve the actual release date of every movie
# function to visit each provided url
#  scrape the release date
#  return it to be stored in a column
def get_release_date(url):
    """Download a page and return the text of its release-date element.

    Parameters
    ----------
    url : str
        Address of the page to download (the notebook passes the values of
        the ``movie_imdb_link`` column).

    Returns
    -------
    str or None
        Stripped text of the element whose ``title`` attribute is
        "See more release dates", or ``None`` when the request fails or the
        page contains no such element.
    """
    print(url)
    try:
        # a timeout keeps one unresponsive server from hanging the whole run
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        # one bad link must not abort the scrape of every other movie
        print(f"request failed: {err}")
        return None
    finally:
        # pause between requests, whether or not this one succeeded
        time.sleep(2)

    soup = bs(response.text, "lxml")
    tag = soup.find(title="See more release dates")
    if tag is None:
        print("no release date.")
        return None
    return tag.text.strip()

# scrape a release date for every movie link with get_release_date
#  (the result replaces whatever the release_date column held before)
imdb_links = movies["movie_imdb_link"]
movies["release_date"] = imdb_links.map(get_release_date)

In [37]:
# store dataframe in a sqlite db
db_path = Path("db/movies.db")
# sqlite3.connect does not create missing folders, so make sure db/ exists
db_path.parent.mkdir(parents=True, exist_ok=True)

# closing() releases the connection even when the write raises
with closing(sqlite3.connect(db_path)) as con:
    movies.to_sql("movie_data", con, if_exists="replace", index=False)
    # commit the changes before the connection is closed
    con.commit()


# read data from db into dataframe
sql = "SELECT * FROM movie_data"

# the read connection is closed as soon as the frame has been loaded
with closing(sqlite3.connect(db_path)) as con:
    movie_data = pd.read_sql(sql, con)

movie_data.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,revenue
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,523505847.0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,9404152.0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,-44925825.0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,198130642.0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,12.0,7.1,,0,
