In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count
import os

In [12]:
def create_spark_session():
    
    return SparkSession.builder \
        .appName("Netflix Dataset EDA") \
        .getOrCreate()

def load_netflix_dataset(spark, file_path):
    
    return spark.read.csv(file_path, header=True, inferSchema=True)

def perform_eda(netflix_df):
    
    print("1. Basic Dataset Information:")
    netflix_df.printSchema()
    print(f"\nTotal number of records: {netflix_df.count()}")
    
    print("\n2. Content Type Distribution:")
    netflix_df.groupBy("type").agg(count("*").alias("count")).show()
    
    print("\n3. Yearly Content Production:")
    yearly_content = netflix_df.groupBy("release_year", "type") \
        .agg(count("*").alias("content_count")) \
        .orderBy("release_year", "type")
    yearly_content.show(20)
    
    print("\n4. Top 10 Countries Producing Content:")
    country_content = netflix_df.groupBy("country") \
        .agg(count("*").alias("content_count")) \
        .orderBy(col("content_count").desc()) \
        .limit(10)
    country_content.show()
    
    print("\n5. Rating Distribution:")
    rating_distribution = netflix_df.groupBy("rating") \
        .agg(count("*").alias("count")) \
        .orderBy(col("count").desc())
    rating_distribution.show()

In [13]:
spark = create_spark_session()

In [14]:
dataset_path = 'netflix_titles.csv'
netflix_df = load_netflix_dataset(spark, dataset_path)
perform_eda(netflix_df)

1. Basic Dataset Information:
root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)


Total number of records: 8809

2. Content Type Distribution:
+-------------+-----+
|         type|count|
+-------------+-----+
|         NULL|    1|
|      TV Show| 2676|
|        Movie| 6131|
|William Wyler|    1|
+-------------+-----+


3. Yearly Content Production:
+-----------------+-------+-------------+
|     release_year|   type|content_count|
+-----------------+-------+-------------+
|             NULL|   NULL|            1|
|             NULL|  Movie|            1|
|   Charl

In [15]:
spark.stop()