### Sample Analysis 2: List the titles that were added at least 10 years ago
- Step 1. Type cast the `date_added` column from string to date type by creating a new temp column `formatted_date`
- Step 2. Inspect for `Null` values
- Step 3. Filter `Null` rows
- Step 4. Replace the `date_added` column with the new `formatted_date` column and rename it back to the original column name
- Step 5. Calculate the year difference
- Step 6. Sort and display the results

In [None]:
from pyspark.sql.functions import col, \
                                from_unixtime, \
                                unix_timestamp, \
                                datediff, \
                                to_date, \
                                current_date, \
                                lit, \
                                round


In [None]:
# Load the cleaned Netflix titles file: tab-separated, first row is the header.
# No schema is supplied or inferred, so every column is read as a string.
df = (
    spark.read
    .option("header", True)
    .option("sep", "\t")
    .csv("dbfs:/cleaned-data/cleaned_netflix_titles.csv")
)

# Peek at the first three rows to confirm the columns were parsed as expected.
df.head(3)

Out[90]: [Row(show_id='80044126', title='D.L. Hughley: Clear', director='Jay Chapman', cast='D.L. Hughley', country='United States', date_added='July 13, 2017', release_year='2014', rating='TV-MA', duration='59 min', listed_in='Stand-Up Comedy', description='In this 2014 standup special filmed in San Francisco, comedic genius D.L. Hughley entertains with his hilarious take on current affairs and more.', type='Movie'),
 Row(show_id='80148179', title='My Scientology Movie', director='John Dower', cast='Louis Theroux', country='United Kingdom', date_added='July 13, 2017', release_year='2015', rating='TV-MA', duration='99 min', listed_in='Documentaries', description='After speaking with former Scientology members and being stonewalled by higher-ups, filmmaker Louis Theroux hires actors to re-create alleged events.', type='Movie'),
 Row(show_id='70301023', title='Tom Segura: Completely Normal', director='Jay Chapman', cast='Tom Segura', country='United States', date_added='July 13, 2017', r

In [None]:
# Look at a few raw `date_added` strings to see which pattern they follow
# before trying to parse them.
df.select(col('date_added')).show(n=3)

+-------------+
|   date_added|
+-------------+
|July 13, 2017|
|July 13, 2017|
|July 13, 2017|
+-------------+
only showing top 3 rows



In [None]:
# step 1: build the expression that reformats `date_added`.
# First parse text such as "July 13, 2017" into epoch seconds ...
str_to_unix_date = unix_timestamp(col('date_added'), 'MMMM d, yyyy')
# ... then render those seconds as "MM-dd-yyyy" text (the result is still a string).
final_unix_date = from_unixtime(str_to_unix_date, 'MM-dd-yyyy')

# Preview the conversion next to the original value without modifying `df`.
df.select(final_unix_date.alias('formatted_date'), 'date_added').show(5)

+--------------+-------------+
|formatted_date|   date_added|
+--------------+-------------+
|    07-13-2017|July 13, 2017|
|    07-13-2017|July 13, 2017|
|    07-13-2017|July 13, 2017|
|    07-12-2019|July 12, 2019|
|    07-12-2019|July 12, 2019|
+--------------+-------------+
only showing top 5 rows



In [None]:
# Keep the reformatted date on `df` itself so the following cells can use it.
df = df.withColumn(colName='formatted_date', col=final_unix_date)

In [None]:
# Show the original text and the reformatted value side by side
# (20 rows, which is also what show() prints by default).
df.select(['date_added', 'formatted_date']).show(n=20)

+-------------+--------------+
|   date_added|formatted_date|
+-------------+--------------+
|July 13, 2017|    07-13-2017|
|July 13, 2017|    07-13-2017|
|July 13, 2017|    07-13-2017|
|July 12, 2019|    07-12-2019|
|July 12, 2019|    07-12-2019|
|July 12, 2019|    07-12-2019|
|July 12, 2019|    07-12-2019|
|July 12, 2019|    07-12-2019|
|July 12, 2019|    07-12-2019|
|July 12, 2019|    07-12-2019|
|July 12, 2019|    07-12-2019|
|July 12, 2019|    07-12-2019|
|July 12, 2019|    07-12-2019|
|July 12, 2019|    07-12-2019|
|July 12, 2019|    07-12-2019|
|July 12, 2019|    07-12-2019|
|July 12, 2019|    07-12-2019|
|July 12, 2019|    07-12-2019|
|July 12, 2019|    07-12-2019|
|July 12, 2019|    07-12-2019|
+-------------+--------------+
only showing top 20 rows



In [None]:
# step 2: count the rows where `formatted_date` is NULL
# (e.g. a missing `date_added`, or text that did not match the pattern).
df.filter(col('formatted_date').isNull()).count()

Out[95]: 14

In [None]:
# step 3: drop the rows whose `formatted_date` is NULL.
# Use an explicit NULL test instead of comparing against the string 'null':
# `formatted_date != 'null'` removes NULL rows only as a side effect of SQL
# three-valued logic (NULL != x evaluates to NULL, which filter() treats as
# false), and it would also discard a row holding the literal text 'null'.
df = df.filter(df.formatted_date.isNotNull())
df

Out[96]: DataFrame[show_id: string, title: string, director: string, cast: string, country: string, date_added: string, release_year: string, rating: string, duration: string, listed_in: string, description: string, type: string, formatted_date: string]

In [None]:
# step 4: discard the original text column and let the reformatted one take
# over its name. The renamed column ends up last in the schema.
df = (
    df
    .drop('date_added')
    .withColumnRenamed('formatted_date', 'date_added')
)
df

Out[97]: DataFrame[show_id: string, title: string, director: string, cast: string, country: string, release_year: string, rating: string, duration: string, listed_in: string, description: string, type: string, date_added: string]

In [None]:
# step 5: how long ago each title was added, in (approximate) years.
# `date_added` holds 'MM-dd-yyyy' text at this point, so parse it into a date.
col_date = to_date(col('date_added'), 'MM-dd-yyyy')

# current_date() is already a date, so pass it to datediff directly; routing it
# through unix_timestamp()/from_unixtime() only converted it to text that
# datediff had to cast back to a date again.
# NOTE: dividing the day count by 365 ignores leap days, so the value is an
# approximation of elapsed calendar years.
diff = datediff(current_date(), col_date) / lit(365)

df.select(diff.alias('diff_in_years')).show(3)

+-----------------+
|    diff_in_years|
+-----------------+
|5.578082191780822|
|5.578082191780822|
|5.578082191780822|
+-----------------+
only showing top 3 rows



In [None]:
# List every title with its age in years (rounded to 2 decimals), oldest first.
# `round` here is the pyspark.sql.functions version imported at the top.
age_in_years = round(diff, 2).alias('diff_in_years')

(
    df.select('title', 'release_year', age_in_years)
    .orderBy(col('diff_in_years').desc())
    .show()
)

+--------------------+------------+-------------+
|               title|release_year|diff_in_years|
+--------------------+------------+-------------+
|To and From New York|        2006|        15.12|
|     Dinner for Five|        2007|        15.02|
|Just Another Love...|        2007|        13.77|
|            Splatter|        2009|        13.23|
|Mad Ron's Prevues...|        1987|        12.28|
|       Even the Rain|        2010|        11.74|
|Joseph: King of D...|        2000|        11.38|
|Quiet Victory: Th...|        1988|        11.36|
|      Strange Voices|        1987|        11.36|
|Adam: His Song Co...|        1986|        11.36|
|The Ryan White Story|        1989|        11.36|
|        Hard Lessons|        1986|        11.36|
|In Defense of a M...|        1990|        11.36|
|A Stoning in Fulh...|        1988|        11.36|
|  Too Young the Hero|        1988|        11.36|
|Triumph of the Heart|        1991|        11.36|
|    Unspeakable Acts|        1990|        11.36|


In [None]:
# step 6: keep only the titles added at least 10 years ago, oldest first.
# Filter before sorting so only the matching rows are ordered and the sort is
# the last transformation applied before the rows are displayed.
df.select('title', 'release_year', round(diff, 2).alias('diff_in_years'))   \
    .filter(col('diff_in_years') >= 10)  \
    .sort(col('diff_in_years'), ascending=False)    \
    .show()

+--------------------+------------+-------------+
|               title|release_year|diff_in_years|
+--------------------+------------+-------------+
|To and From New York|        2006|        15.12|
|     Dinner for Five|        2007|        15.02|
|Just Another Love...|        2007|        13.77|
|            Splatter|        2009|        13.23|
|Mad Ron's Prevues...|        1987|        12.28|
|       Even the Rain|        2010|        11.74|
|Joseph: King of D...|        2000|        11.38|
|A Stoning in Fulh...|        1988|        11.36|
|Adam: His Song Co...|        1986|        11.36|
|        Hard Lessons|        1986|        11.36|
|In Defense of a M...|        1990|        11.36|
|Quiet Victory: Th...|        1988|        11.36|
|      Strange Voices|        1987|        11.36|
|The Ryan White Story|        1989|        11.36|
|  Too Young the Hero|        1988|        11.36|
|Triumph of the Heart|        1991|        11.36|
|    Unspeakable Acts|        1990|        11.36|


In [None]:
# Number of titles whose rounded age is at least 10 years. The threshold is
# applied to the 2-decimal rounded value, the same expression the listing uses.
c = df.where(round(diff, 2) >= 10).count()

c

Out[101]: 25