# 2021152

In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count
import os

In [16]:

def initialize_spark(app_name="Data Analysis"):
    return SparkSession.builder.appName(app_name).getOrCreate()

def read_dataset(spark, dataset_path):
    return spark.read.csv(dataset_path, header=True, inferSchema=True)

def analyze_data(dataframe):
    print("Dataset Overview:")
    dataframe.printSchema()
    print(f"\nTotal records in the dataset: {dataframe.count()}")

    print("\nContent Types and their Counts:")
    dataframe.groupBy("type").agg(count("*").alias("count")).show()

    print("\nYearly Content Trends:")
    dataframe.groupBy("release_year", "type") \
            .agg(count("*").alias("count")) \
            .orderBy("release_year", "type") \
            .show(20)

    print("\nTop Content-Producing Countries:")
    dataframe.groupBy("country") \
            .agg(count("*").alias("content_count")) \
            .orderBy(col("content_count").desc()) \
            .limit(10) \
            .show()

    print("\nRatings Distribution:")
    dataframe.groupBy("rating") \
            .agg(count("*").alias("rating_count")) \
            .orderBy(col("rating_count").desc()) \
            .show()




In [17]:
spark_session = initialize_spark()

        
    

In [18]:
# Define dataset path
data_path = 'netflix_titles.csv'  # Relative path to the dataset

# Load Netflix data
netflix_data = read_netflix_data(spark_session, data_path)

# Perform analysis
analyze_netflix_data(netflix_data)

1. Dataset Overview:
root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)


Total rows in dataset: 8809

2. Distribution of Content Types:
+-------------+-----+
|         type|total|
+-------------+-----+
|         NULL|    1|
|      TV Show| 2676|
|        Movie| 6131|
|William Wyler|    1|
+-------------+-----+


3. Content Release Trends by Year:
+-----------------+-------+-------------+
|     release_year|   type|total_content|
+-----------------+-------+-------------+
|             NULL|   NULL|            1|
|             NULL|  Movie|            1|
|   Charles

In [19]:
spark_session.stop()
