### Sample Analysis 2: Filter TV shows out into a separate DF
- Step 1. Filter the df on the `duration` column using the keyword `Seasons`
- Step 2. Create a column `season_volume` holding each title's season count as an integer (it later replaces `duration`)
- Step 3. Save the df to csv

In [0]:
from pyspark.sql.functions import col, count, split
from pyspark.sql.types import IntegerType

In [0]:
# step 0: load the cleaned, tab-separated titles file (first line is the header)
df = (
    spark.read
    .option("header", True)
    .option("sep", "\t")
    .csv("dbfs:/cleaned-data/cleaned_netflix_titles.csv")
)

# peek at the first three rows
df.head(3)

Out[1]: [Row(show_id='80044126', title='D.L. Hughley: Clear', director='Jay Chapman', cast='D.L. Hughley', country='United States', date_added='July 13, 2017', release_year='2014', rating='TV-MA', duration='59 min', listed_in='Stand-Up Comedy', description='In this 2014 standup special filmed in San Francisco, comedic genius D.L. Hughley entertains with his hilarious take on current affairs and more.', type='Movie'),
 Row(show_id='80148179', title='My Scientology Movie', director='John Dower', cast='Louis Theroux', country='United Kingdom', date_added='July 13, 2017', release_year='2015', rating='TV-MA', duration='99 min', listed_in='Documentaries', description='After speaking with former Scientology members and being stonewalled by higher-ups, filmmaker Louis Theroux hires actors to re-create alleged events.', type='Movie'),
 Row(show_id='70301023', title='Tom Segura: Completely Normal', director='Jay Chapman', cast='Tom Segura', country='United States', date_added='July 13, 2017', re

In [0]:
# Show column names and types; the CSV was read without a schema or schema
# inference, so every column comes back as a string.
df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)
 |-- type: string (nullable = true)



In [0]:
# Peek at a few `duration` values that contain the plural keyword "Seasons"
(
    df
    .select('duration')
    .filter('duration LIKE "%Seasons%"')
    .show(5)
)

+---------+
| duration|
+---------+
|2 Seasons|
|5 Seasons|
|5 Seasons|
|2 Seasons|
|2 Seasons|
+---------+
only showing top 5 rows



In [0]:
# Number of rows whose `duration` contains "Season" (the pattern also matches "Seasons")
df.filter('duration LIKE "%Season%"').count()

Out[7]: 1938

In [0]:
# step 1: keep only the rows whose `duration` contains the plural "Seasons"
# NOTE(review): the count cell above matches '%Season%' instead; this stricter
# pattern presumably leaves out single-season titles — confirm that is intended.
seasons_df = df.filter('duration LIKE "%Seasons%"')



+----------+--------+
|  duration|count(1)|
+----------+--------+
| 2 Seasons|     299|
| 3 Seasons|     157|
| 4 Seasons|      60|
| 5 Seasons|      46|
| 6 Seasons|      22|
| 7 Seasons|      21|
| 8 Seasons|      15|
| 9 Seasons|       7|
|10 Seasons|       3|
|11 Seasons|       3|
|13 Seasons|       2|
|15 Seasons|       2|
|12 Seasons|       2|
|14 Seasons|       1|
+----------+--------+



In [0]:
# Count how many titles fall under each distinct `duration` value, most
# common first.
# The aggregate gets an explicit alias so the sort does not depend on Spark's
# auto-generated column name ("count(1)"), and the chain is parenthesised
# instead of relying on backslash line continuations.
(
    seasons_df
    .groupBy('duration')
    .agg(count('*').alias('title_count'))
    .sort('title_count', ascending=False)
    .show(30)
)

In [0]:
# step 2: preview the season count parsed from `duration`
# (text before the first space, cast to an integer)
seasons_df.select(
    split('duration', ' ')[0].cast(IntegerType()).alias('season_volume')
).show()

+-------------+
|season_volume|
+-------------+
|            2|
|            5|
|            5|
|            2|
|            2|
|            2|
|            2|
|            3|
|            7|
|            4|
|            8|
|            5|
|            2|
|            2|
|            3|
|            3|
|            2|
|            2|
|            4|
|            2|
+-------------+
only showing top 20 rows



In [0]:
# Attach the parsed season count as an integer column `season_volume`.
# This rebinds `seasons_df`; run it before the drop/rename cell below, which
# removes the string `duration` column this expression reads.
seasons_df = seasons_df.withColumn(
    'season_volume',
    split(col('duration'), ' ').getItem(0).cast(IntegerType()),
)

In [0]:
# Remove the original string `duration` and the `type` column, then let the
# integer `season_volume` column take over the `duration` name.
seasons_df = (
    seasons_df
    .drop("duration", "type")
    .withColumnRenamed("season_volume", "duration")
)

In [0]:
# Preview the first 5 rows of the reshaped frame before writing it out.
seasons_df.show(5)

+--------+--------------------+--------+--------------------+--------------------+-----------------+------------+--------+--------------------+--------------------+--------------+
| show_id|               title|director|                cast|             country|       date_added|release_year|  rating|           listed_in|         description|season_volumne|
+--------+--------------------+--------+--------------------+--------------------+-----------------+------------+--------+--------------------+--------------------+--------------+
|80136796|   Show Me the Money|    null|Afdlin Shauki, Pa...|                null|September 9, 2018|        2004|   TV-14|International TV ...|Four ordinary fam...|             2|
|70270745|       Ripper Street|    null|Matthew Macfadyen...|United Kingdom, I...|September 8, 2017|        2017|   TV-MA|British TV Shows,...|It's been six mon...|             5|
|70177067|               Haven|    null|Emily Rose, Lucas...|Canada, United St...|September 8, 2015|

In [0]:
# step 3: collapse to a single partition and write a tab-separated CSV with a
# header row, replacing any previous output at the same path
(
    seasons_df
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("sep", "\t")
    .option("header", True)
    .csv("/cleaned-data/seasons_only_titles.csv")
)