<a href="https://colab.research.google.com/github/pullz6/Netflix-Show-Classification-Pyspark/blob/main/Netflix_Shows_Classification_with_Pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Setting up the pyspark
from google.colab import drive
import pandas as pd
drive.mount('/content/drive')
#Installing Java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Unpack Spark from google drive
!tar xzf /content/drive/MyDrive/spark-3.3.0-bin-hadoop3.tgz
# Set up environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "spark-3.3.0-bin-hadoop3"
# Install findspark, which helps python locate the psyspark module files
!pip install -q findspark
import findspark
findspark.init()
# Finally, we initialse a "SparkSession", which handles the computations
from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

Mounted at /content/drive


In [2]:
usersCsvPath = "/content/drive/MyDrive/sample_data/Projects/Netflix-Data/netflix_titles.csv"
fix_netflix_data = pd.read_csv(usersCsvPath)

In [3]:
fix_netflix_data.date_added.dtype

dtype('O')

In [4]:
fix_netflix_data[['month', 'rest']] = fix_netflix_data['date_added'].str.split(n=1, expand=True)
fix_netflix_data[['day', 'year']] = fix_netflix_data['rest'].str.split(n=1, expand=True)
fix_netflix_data['day'] = fix_netflix_data['day'].str[:-1]

In [5]:
fix_netflix_data['combined'] = fix_netflix_data['day'].astype(str) + '_' + fix_netflix_data['month'] + '_' + fix_netflix_data['year']

In [6]:
fix_netflix_data['combined'] = fix_netflix_data['combined'].str.replace('_'," ")

In [7]:
fix_netflix_data.drop(['date_added','month','rest','day','year'],inplace=True,axis=1)

In [8]:
fix_netflix_data.rename({'combined':'date_added'},inplace=True,axis=1)

In [9]:
fix_netflix_data

Unnamed: 0,show_id,type,title,director,cast,country,release_year,rating,duration,listed_in,description,date_added
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",Septembe 25 2021
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",Septembe 24 2021
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,Septembe 24 2021
3,s4,TV Show,Jailbirds New Orleans,,,,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",Septembe 24 2021
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,Septembe 24 2021
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a...",Novembe 20 2019
8803,s8804,TV Show,Zombie Dumb,,,,2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g...",Jul 1 2019
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...,Novembe 1 2019
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero...",Januar 11 2020


In [None]:
fix_netflix_data.to_csv('/content/drive/MyDrive/sample_data/Projects/Netflix-Data/netflix_titles.csv', index=False)

In [20]:
### Load the California Housing Prices Dataset
# .read, .option, .csv

usersCsvPath = "/content/drive/MyDrive/sample_data/Projects/Netflix-Data/netflix_titles.csv"

netflixDF = (spark
             .read
             .option('header', True)
             .option('inferSchema', True)
             .csv(usersCsvPath))

netflixDF.show(5)
netflixDF.printSchema()

+-------+-------+--------------------+---------------+--------------------+-------------+------------+------+---------+--------------------+--------------------+-----------------+
|show_id|   type|               title|       director|                cast|      country|release_year|rating| duration|           listed_in|         description|       date_added|
+-------+-------+--------------------+---------------+--------------------+-------------+------------+------+---------+--------------------+--------------------+-----------------+
|     s1|  Movie|Dick Johnson Is Dead|Kirsten Johnson|                null|United States|        2020| PG-13|   90 min|       Documentaries|As her father nea...|25 September 2021|
|     s2|TV Show|       Blood & Water|           null|Ama Qamata, Khosi...| South Africa|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|24 September 2021|
|     s3|TV Show|           Ganglands|Julien Leclercq|Sami Bouajila, Tr...|         null|        202

In [21]:
import pyspark.sql.functions as F
netflixDF = netflixDF.withColumn("parsed_date_column", F.to_timestamp(F.col("date_added"), "dd MMMM yyyy"))
netflixDF.show(10)

+-------+-------+--------------------+--------------------+--------------------+--------------------+------------+------+---------+--------------------+--------------------+-----------------+-------------------+
|show_id|   type|               title|            director|                cast|             country|release_year|rating| duration|           listed_in|         description|       date_added| parsed_date_column|
+-------+-------+--------------------+--------------------+--------------------+--------------------+------------+------+---------+--------------------+--------------------+-----------------+-------------------+
|     s1|  Movie|Dick Johnson Is Dead|     Kirsten Johnson|                null|       United States|        2020| PG-13|   90 min|       Documentaries|As her father nea...|25 September 2021|2021-09-25 00:00:00|
|     s2|TV Show|       Blood & Water|                null|Ama Qamata, Khosi...|        South Africa|        2021| TV-MA|2 Seasons|International TV ...|

In [22]:

cols = ("title","director","cast","description","date_added")

netflixDF.drop(*cols)

DataFrame[show_id: string, type: string, country: string, release_year: string, rating: string, duration: string, listed_in: string, parsed_date_column: timestamp]

In [23]:
netflixDF.show(5)

+-------+-------+--------------------+---------------+--------------------+-------------+------------+------+---------+--------------------+--------------------+-----------------+-------------------+
|show_id|   type|               title|       director|                cast|      country|release_year|rating| duration|           listed_in|         description|       date_added| parsed_date_column|
+-------+-------+--------------------+---------------+--------------------+-------------+------------+------+---------+--------------------+--------------------+-----------------+-------------------+
|     s1|  Movie|Dick Johnson Is Dead|Kirsten Johnson|                null|United States|        2020| PG-13|   90 min|       Documentaries|As her father nea...|25 September 2021|2021-09-25 00:00:00|
|     s2|TV Show|       Blood & Water|           null|Ama Qamata, Khosi...| South Africa|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|24 September 2021|2021-09-24 00:00:00|
