In [32]:
# ReleaseYearOnBudget.py
# Mason Sipe
#
# The purpose of this file is to analyze how the release year of a movie may have had an impact on its budget.
#
# This can be accomplished by calculating the average budget of the movies released in a given time period and
# comparing the budget of each movie to that average to see how it compares to other entries within the dataset.

# need to import to use pyspark
from pyspark.sql import Row
 
# need to import for session creation
from pyspark.sql import SparkSession
 
# Start (or reuse) the Spark session for this analysis.
spark = (
    SparkSession.builder
    .appName("ReleaseYearOnBudget")
    .getOrCreate()
)

# Load the movies CSV with the header and schema-inference options enabled.
df = (
    spark.read
    .options(Header=True, InferSchema=True)
    .csv("./Raw Data/movies.csv")
)

# print the schema showing that the file has been successfully loaded.

# df.printSchema()
# output of the last command:
# root
#  |-- name: string (nullable = true)
#  |-- rating: string (nullable = true)
#  |-- genre: string (nullable = true)
#  |-- year: integer (nullable = true)
#  |-- released: string (nullable = true)
#  |-- score: double (nullable = true)
#  |-- votes: double (nullable = true)
#  |-- director: string (nullable = true)
#  |-- writer: string (nullable = true)
#  |-- star: string (nullable = true)
#  |-- country: string (nullable = true)
#  |-- budget: double (nullable = true)
#  |-- gross: double (nullable = true)
#  |-- company: string (nullable = true)
#  |-- runtime: double (nullable = true)

# Since the file has been successfully loaded, we want to look at the release information recorded for each movie in the file.

#dfselected = df.select("released").show(truncate=False);

# Keep only the two columns this analysis needs: the release information
# of each movie and its budget.
DFQ1 = df.select("released", "budget")

# Some movies have no recorded budget. Those rows are missing values rather
# than real data points, so they are dropped to keep them from skewing the
# results.
dfSelected = DFQ1.na.drop(subset=["budget"])

# show() only prints the rows and returns None, so it is called on its own
# line; chaining it onto the assignment would leave dfSelected set to None
# instead of the filtered DataFrame.
dfSelected.show()

+--------------------+---------+
|            released|   budget|
+--------------------+---------+
|June 13, 1980 (Un...|    1.9E7|
|July 2, 1980 (Uni...|4500000.0|
|June 20, 1980 (Un...|    1.8E7|
|July 2, 1980 (Uni...|3500000.0|
|July 25, 1980 (Un...|6000000.0|
|May 9, 1980 (Unit...| 550000.0|
|June 20, 1980 (Un...|    2.7E7|
|December 19, 1980...|    1.8E7|
|June 19, 1981 (Un...|    5.4E7|
|May 16, 1980 (Uni...|    1.0E7|
|December 17, 1980...|    1.5E7|
|October 26, 1984 ...|5000000.0|
|December 12, 1980...|    2.0E7|
|September 19, 198...|6000000.0|
|July 25, 1980 (Un...|6500000.0|
|October 3, 1980 (...|5100000.0|
|December 19, 1980...|    1.0E7|
|February 8, 1980 ...|1000000.0|
|February 15, 1980...|    1.1E7|
|April 24, 1981 (U...|    4.4E7|
+--------------------+---------+
only showing top 20 rows

