### Sample Analysis 1: Find the 10 titles with the largest casts

##### Steps:
1. Load the cleaned data into a DataFrame (`df`)
2. Add a new column `cast_count` holding the number of cast members for each title
3. Display the top 10 titles ranked by `cast_count`

In [0]:
# imports
import pyspark.sql.functions as F 

In [0]:
# Step 1: load the cleaned Netflix titles file (tab-separated, with a header row).
# No schema is supplied or inferred, so every column is read as a string.
df = (
    spark.read
    .options(header=True, sep="\t")
    .csv("dbfs:/cleaned-data/cleaned_netflix_titles.csv")
)

# Peek at the first three rows to sanity-check the parse.
df.head(3)

Out[4]: [Row(show_id='80044126', title='D.L. Hughley: Clear', director='Jay Chapman', cast='D.L. Hughley', country='United States', date_added='July 13, 2017', release_year='2014', rating='TV-MA', duration='59 min', listed_in='Stand-Up Comedy', description='In this 2014 standup special filmed in San Francisco, comedic genius D.L. Hughley entertains with his hilarious take on current affairs and more.', type='Movie'),
 Row(show_id='80148179', title='My Scientology Movie', director='John Dower', cast='Louis Theroux', country='United Kingdom', date_added='July 13, 2017', release_year='2015', rating='TV-MA', duration='99 min', listed_in='Documentaries', description='After speaking with former Scientology members and being stonewalled by higher-ups, filmmaker Louis Theroux hires actors to re-create alleged events.', type='Movie'),
 Row(show_id='70301023', title='Tom Segura: Completely Normal', director='Jay Chapman', cast='Tom Segura', country='United States', date_added='July 13, 2017', re

In [0]:
# Step 2 (preview): derive `cast_count` by splitting the comma-separated `cast`
# string and taking the length of the resulting array.
# The result is not assigned here, so `df` itself is left unchanged by this
# cell; only the resulting schema is displayed.
df.withColumn(
    'cast_count',
    F.size(F.split(df['cast'], ',')),
)

Out[16]: DataFrame[show_id: string, title: string, director: string, cast: string, country: string, date_added: string, release_year: string, rating: string, duration: string, listed_in: string, description: string, type: string, cast_count: int]

In [0]:
# Persist the derived column: `cast_count` is the number of comma-separated
# entries in the `cast` string. Re-running this cell is safe because
# `withColumn` replaces an existing column of the same name.
df = df.withColumn('cast_count', F.size(F.split(df.cast, ',')))

# Show the titles with cast_count over 40 (no restriction on `type`).
# `DataFrame.show()` prints the table itself and returns None, so it must not
# be wrapped in `display(...)`, which would only receive None.
df.filter(df.cast_count > 40).show()

+--------+--------------------+--------------------+--------------------+--------------------+-----------------+------------+------+----------+--------------------+--------------------+-------+----------+
| show_id|               title|            director|                cast|             country|       date_added|release_year|rating|  duration|           listed_in|         description|   type|cast_count|
+--------+--------------------+--------------------+--------------------+--------------------+-----------------+------------+------+----------+--------------------+--------------------+-------+----------+
|70202138|    Arthur Christmas|         Sarah Smith|James McAvoy, Hug...|United Kingdom, U...|  January 1, 2020|        2011|    PG|    98 min|Children & Family...|At Santa Claus's ...|  Movie|        44|
|80151370|Michael Bolton's ...|Scott Aukerman, A...|Michael Bolton, A...|       United States| February 7, 2017|        2017| TV-MA|    54 min|Comedies, Music &...|After Santa tell

In [0]:
# Step 3: rank titles by cast size, largest first, and print the top 10.
# Only the two columns needed for the ranking are kept; long titles are
# truncated in the printed table.
(
    df.select('title', 'cast_count')
    .orderBy('cast_count', ascending=False)
    .show(10, truncate=True)
)

+--------------------+----------+
|               title|cast_count|
+--------------------+----------+
|        Black Mirror|        50|
|         Creeped Out|        47|
|COMEDIANS of the ...|        47|
|    Arthur Christmas|        44|
|              Narcos|        42|
|Michael Bolton's ...|        41|
|Dolly Parton's He...|        41|
|American Horror S...|        40|
|Love, Death & Robots|        40|
|            Movie 43|        39|
+--------------------+----------+
only showing top 10 rows

