# IMDB Datasets

**Purpose:**

Working with "big" data in PySpark

**Data Source:**
    
https://datasets.imdbws.com

In [None]:
# Inject CSS that sets the notebook `.container` width to 97% and the font
# size to 90%.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:97% !important; font-size:90%;}</style>"))

In [None]:
import sys

# Report the version string of the Python interpreter running this kernel.
python_version = sys.version
print('Python info', python_version)

In [None]:
import os

# Show the working directory of the running process.
# (Message spelling fixed: "curent" -> "current".)
print('This is the current directory', os.getcwd())

In [None]:
import datetime

# Capture today's date and the current timestamp.
current_date = datetime.date.today()
current_time = datetime.datetime.now()

# Print the raw timestamp, then a "weekday mm/dd/yyyy" line and a 12-hour
# clock line.
print("System date/time", current_time)
print('Current date', current_date.strftime('%A %m/%d/%Y'))
print('Current time', current_time.strftime('%I:%M:%S %p'))

In [None]:
import requests
from bs4 import BeautifulSoup

from pprint import pprint
import re

In [None]:
!pip install pyspark --upgrade

In [None]:
!pip install pyarrow --upgrade

In [None]:
from pyspark.sql import SparkSession, Row
from pyspark.context import SparkContext
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [None]:
# Create (or reuse an existing) SparkSession named "Spark SQL Query Dataframes".
# The bare `spark` expression at the end of the cell makes the notebook display
# the session object.
spark = SparkSession.builder.appName("Spark SQL Query Dataframes").getOrCreate()
spark

In [None]:
#spark = SparkSession.builder.appName("IMDB_Movies")\
#.config("spark.sql.shuffle.partitions", "50")\
#.config("spark.driver.maxResultSize", "5g")\
#.config("spark.sql.execution.arrow.enabled", "true")\
#.getOrCreate()

#spark

In [None]:
# Keep a handle on the SparkContext that belongs to the session.
sc = spark.sparkContext

In [None]:
!pip install wget

In [None]:
import wget

import gzip
import shutil
import csv

In [None]:
# Location of the gzip-compressed "title.basics" TSV on the IMDb datasets site.
URL = "https://datasets.imdbws.com/title.basics.tsv.gz"

In [None]:
print(URL)

In [None]:
# Download the archive; the returned value is kept as the local file name and
# is what the later decompression step opens.
filename = wget.download(URL)

In [None]:
print(filename)

In [None]:
# Decompress the downloaded .gz archive into a file of the same name without
# the ".gz" extension.
if not filename.endswith('.gz'):
    # Without the suffix the output path would equal the input path, and
    # opening it for writing would truncate the download before it is read.
    raise ValueError("expected a .gz file, got {!r}".format(filename))

# Strip only the trailing extension, and compute the name once
# (str.replace would also remove any ".gz" occurring earlier in the name).
uncompressed_file = filename[:-len('.gz')]

# Stream the decompressed bytes to disk without loading the file into memory.
with gzip.open(filename, mode='rb') as f_in, open(uncompressed_file, mode='wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

print(uncompressed_file)

In [None]:
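# Reader examples (the DataFrame reader lives on the SparkSession, not on the SparkContext)

# JSON
#df = spark.read.json('fp.json')

# TXT
#df = spark.read.text('fp.txt')

# CSV
#df = spark.read.csv('fp.csv')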

In [None]:
# Load the decompressed title-basics TSV into a Spark DataFrame.
#   header      - first line holds the column names
#   sep         - fields are tab-separated
#   nullValue   - treat the literal \N marker as null. The option name is
#                 singular; the previous "nullValues" spelling is not a CSV
#                 reader option, so the marker was left in the data as text.
#   quote       - empty string, so quote characters get no special handling
#   inferSchema - let Spark derive column types from the data
spark_df0 = (
    spark.read.format("csv")
    .option("header", "true")
    .option("sep", "\t")
    .option("encoding", "utf-8")
    .option("nullValue", r"\N")
    .option("quote", "")
    .option("inferSchema", "true")
    .load(uncompressed_file)
)

spark_df0.show(truncate=False)

In [None]:
# Quick inspection of the loaded DataFrame: schema tree, (name, type) pairs,
# column names, the first row, and the first 15 rows.
spark_df0.printSchema()

In [None]:
spark_df0.dtypes

In [None]:
spark_df0.columns

In [None]:
spark_df0.head()

In [None]:
spark_df0.take(15)

In [None]:
spark_df0.groupBy(['isAdult','titleType']).count().sort(['isAdult','titleType']).show()

In [None]:
spark_df0 = spark_df0.filter((spark_df0["isAdult"]!=1) & (spark_df0["titleType"] == 'movie'))

In [None]:
spark_df0.groupBy(['isAdult','titleType']).count().sort(['isAdult','titleType']).show()

In [None]:
# Point URL at the gzip-compressed "title.ratings" TSV and download it.
# Both `URL` and `filename` are rebound here, replacing the values used for
# the title.basics download.
URL = "https://datasets.imdbws.com/title.ratings.tsv.gz"
filename = wget.download(URL)

In [None]:
# Decompress the downloaded .gz archive into a file of the same name without
# the ".gz" extension.
if not filename.endswith('.gz'):
    # Without the suffix the output path would equal the input path, and
    # opening it for writing would truncate the download before it is read.
    raise ValueError("expected a .gz file, got {!r}".format(filename))

# Strip only the trailing extension, and compute the name once
# (str.replace would also remove any ".gz" occurring earlier in the name).
uncompressed_file = filename[:-len('.gz')]

# Stream the decompressed bytes to disk without loading the file into memory.
with gzip.open(filename, mode='rb') as f_in, open(uncompressed_file, mode='wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

print(uncompressed_file)

In [None]:
# Load the decompressed title-ratings TSV into a Spark DataFrame.
#   header      - first line holds the column names
#   sep         - fields are tab-separated
#   nullValue   - treat the literal \N marker as null. The option name is
#                 singular; the previous "nullValues" spelling is not a CSV
#                 reader option, so the marker was left in the data as text.
#   quote       - empty string, so quote characters get no special handling
#   inferSchema - let Spark derive column types from the data
spark_df1 = (
    spark.read.format("csv")
    .option("header", "true")
    .option("sep", "\t")
    .option("encoding", "utf-8")
    .option("nullValue", r"\N")
    .option("quote", "")
    .option("inferSchema", "true")
    .load(uncompressed_file)
)

spark_df1.show()

In [None]:
spark_df1.printSchema()

In [None]:
# Inner-join the two DataFrames on their shared `tconst` column; rows without
# a match on the other side are dropped.
spark_df = spark_df0.join(spark_df1, on='tconst', how='inner')

In [None]:
spark_df.show(truncate=False)

In [None]:
# Register the joined DataFrame as the temp view `imdb_data` so the following
# cells can query it with SQL.
spark_df.createOrReplaceTempView('imdb_data')

In [None]:
# Preview ten rows of the view.
spark.sql("SELECT * FROM imdb_data LIMIT 10").show(truncate=False)

In [None]:
# Row counts per (titleType, startYear), ordered by those two columns.
spark.sql("SELECT titleType, startYear, count(*) totals FROM imdb_data GROUP BY titleType, startYear ORDER BY 1, 2").show(truncate=False)

In [None]:
# Select a few columns for every row whose primaryTitle contains "Star Wars".
stmt = \
"""
SELECT tconst, titleType, primaryTitle, startYear, runtimeMinutes, averageRating, numVotes
FROM imdb_data
WHERE primaryTitle LIKE '%Star Wars%'
"""

spark.sql(stmt).show()

In [None]:
# Movies rated above 9 with more than 1000 votes and a genre other than the
# \N missing-value marker, highest rating first.
#
# The query is a raw Python string so Spark SQL receives the literal '\\N'.
# With default parser settings Spark processes backslash escapes inside string
# literals, turning '\\N' into the two characters \N. The previous non-raw
# string sent '\N', which Spark unescapes to just N, so the genre condition
# never excluded the marker.
stmt = r"""
SELECT *
FROM imdb_data
WHERE
    titleType = 'movie' AND
    averageRating > 9 AND
    genres != '\\N' AND
    numVotes > 1000
ORDER BY
    averageRating DESC
"""

spark.sql(stmt).show(25)

In [None]:
# Movies with averageRating >= 7 whose genres contain "Action", that run longer
# than 30 minutes and have a startYear from 2015 through 2019.
# runtimeMinutes and startYear are cast to INT inside the query before being
# compared. Results are ordered by startYear, then averageRating, both
# descending; up to 50 rows are shown untruncated.
stmt = """
SELECT *
FROM imdb_data
WHERE
    titleType = 'movie' AND
    averageRating >= 7 AND
    genres LIKE '%Action%' AND
    CAST(runtimeMinutes AS INT) > 30 AND
    CAST(startYear AS INT) BETWEEN 2015 and 2019   
ORDER BY
    startYear DESC, averageRating DESC
"""

spark.sql(stmt).show(n=50, truncate=False)

In [None]:
# Fetch the IMDb top chart page and parse it with BeautifulSoup (lxml parser).
URL = 'https://m.imdb.com/chart/top'

# - A timeout keeps the cell from hanging forever on a stalled connection.
# - raise_for_status() turns an HTTP error response into an exception instead
#   of silently parsing an error page.
# - NOTE(review): a browser-like User-Agent is sent because some sites reject
#   the default python-requests one; confirm this is needed for IMDb.
response = requests.get(
    URL,
    headers={'User-Agent': 'Mozilla/5.0'},
    timeout=30,
)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

soup.title

In [None]:
# Look at the first h1-h4 heading of the parsed page (one per cell), then list
# every anchor tag.
soup.h1

In [None]:
soup.h2

In [None]:
soup.h3

In [None]:
soup.h4

In [None]:
soup.find_all('a')

In [None]:
# Collect the href of every anchor tag ('Empty' when the attribute is missing)
# and join them into one space-separated string for pattern matching.
data = [anchor.get("href", 'Empty') for anchor in soup.find_all("a")]
text_data = " ".join(data)

In [None]:
# Extract every "tt<digits>" identifier from the joined hrefs.
# The pattern is a raw string: "\d" in a normal literal is an invalid escape
# sequence that newer Python versions warn about.
movie_titles = re.findall(pattern=r"(tt\d+)", string=text_data)

# De-duplicate once and reuse the set for both the listing and the count.
unique_titles = set(movie_titles)

pprint(unique_titles, compact=True, width=132)
print()
print(len(unique_titles))

In [None]:
# Turn the distinct scraped identifiers into a one-column (`tconst`) Spark
# DataFrame so they can be joined against the other DataFrames.
unique_ids = set(movie_titles)
id_rows = [Row(tconst=title_id) for title_id in unique_ids]
spark_top250 = sc.parallelize(id_rows).toDF()

In [None]:
# Inner-join the scraped identifiers with spark_df0 and then spark_df1 on
# `tconst`. This rebinds `spark_df`, replacing the earlier joined DataFrame.
spark_df = spark_top250.join(spark_df0, on='tconst', how='inner')
spark_df = spark_df.join(spark_df1, on='tconst', how='inner')
spark_df.show()

In [None]:
# Display up to 250 de-duplicated rows. The de-duplicated result is only shown,
# not assigned, so `spark_df` itself is unchanged.
spark_df.drop_duplicates().show(n=250, truncate=False)

In [None]:
import pandas as pd

In [None]:
# Collect the joined Spark DataFrame into a pandas DataFrame on the driver.
pandas_top250 = spark_df.toPandas()

In [None]:
pandas_top250.info()

In [None]:
pandas_top250.head()

In [None]:
# Convert runtimeMinutes to a numeric dtype so it can be plotted and compared.
pandas_top250['runtimeMinutes'] = pd.to_numeric(pandas_top250['runtimeMinutes'])

In [None]:
# Histogram of runtimes (the trailing `;` suppresses the cell's text output).
pandas_top250.runtimeMinutes.plot.hist(alpha=.4);

In [None]:
# Rows with a runtime above 200 minutes.
pandas_top250.query("runtimeMinutes > 200")

In [None]:
# Bar chart of the number of rows per startYear, ordered by year.
pandas_top250.startYear.value_counts().sort_index().plot.bar(figsize=(14,4), alpha=.4);

In [None]:
# Cast startYear to int so the numeric comparison in the next cell works.
pandas_top250['startYear'] = pandas_top250['startYear'].astype(int)

In [None]:
pandas_top250.query("startYear == 1995")

In [None]:
# Scatter plot of average rating against number of votes.
pandas_top250.plot.scatter(x='averageRating', y='numVotes', alpha=.5);

In [None]:
# Rows rated above 9 that also have more than 1.5 million votes.
pandas_top250.query("averageRating > 9 & numVotes > 1.5e6")