In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, max, min

In [2]:
# Initialize Spark Session
spark = SparkSession.builder.appName("Big Data Analysis").config("spark.some.config.option", "some-value").getOrCreate()

In [3]:
# Load Large Dataset (Example: CSV File)
df = spark.read.csv("C:/Users/Prashik/OneDrive/Desktop/ml/Heart.csv", header=True, inferSchema=True)

In [4]:
df.show(5)

+---+---+---+------------+------+----+---+-------+-----+-----+-------+-----+---+----------+---+
|_c0|Age|Sex|   ChestPain|RestBP|Chol|Fbs|RestECG|MaxHR|ExAng|Oldpeak|Slope| Ca|      Thal|AHD|
+---+---+---+------------+------+----+---+-------+-----+-----+-------+-----+---+----------+---+
|  1| 63|  1|     typical|   145| 233|  1|      2|  150|    0|    2.3|    3|  0|     fixed| No|
|  2| 67|  1|asymptomatic|   160| 286|  0|      2|  108|    1|    1.5|    2|  3|    normal|Yes|
|  3| 67|  1|asymptomatic|   120| 229|  0|      2|  129|    1|    2.6|    2|  2|reversable|Yes|
|  4| 37|  1|  nonanginal|   130| 250|  0|      0|  187|    0|    3.5|    3|  0|    normal| No|
|  5| 41|  0|  nontypical|   130| 204|  0|      2|  172|    0|    1.4|    1|  0|    normal| No|
+---+---+---+------------+------+----+---+-------+-----+-----+-------+-----+---+----------+---+
only showing top 5 rows



In [5]:
df.show()

+---+---+---+------------+------+----+---+-------+-----+-----+-------+-----+---+----------+---+
|_c0|Age|Sex|   ChestPain|RestBP|Chol|Fbs|RestECG|MaxHR|ExAng|Oldpeak|Slope| Ca|      Thal|AHD|
+---+---+---+------------+------+----+---+-------+-----+-----+-------+-----+---+----------+---+
|  1| 63|  1|     typical|   145| 233|  1|      2|  150|    0|    2.3|    3|  0|     fixed| No|
|  2| 67|  1|asymptomatic|   160| 286|  0|      2|  108|    1|    1.5|    2|  3|    normal|Yes|
|  3| 67|  1|asymptomatic|   120| 229|  0|      2|  129|    1|    2.6|    2|  2|reversable|Yes|
|  4| 37|  1|  nonanginal|   130| 250|  0|      0|  187|    0|    3.5|    3|  0|    normal| No|
|  5| 41|  0|  nontypical|   130| 204|  0|      2|  172|    0|    1.4|    1|  0|    normal| No|
|  6| 56|  1|  nontypical|   120| 236|  0|      0|  178|    0|    0.8|    1|  0|    normal| No|
|  7| 62|  0|asymptomatic|   140| 268|  0|      2|  160|    0|    3.6|    3|  2|    normal|Yes|
|  8| 57|  0|asymptomatic|   120| 354|  

In [6]:
# Show Schema
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Sex: integer (nullable = true)
 |-- ChestPain: string (nullable = true)
 |-- RestBP: integer (nullable = true)
 |-- Chol: integer (nullable = true)
 |-- Fbs: integer (nullable = true)
 |-- RestECG: integer (nullable = true)
 |-- MaxHR: integer (nullable = true)
 |-- ExAng: integer (nullable = true)
 |-- Oldpeak: double (nullable = true)
 |-- Slope: integer (nullable = true)
 |-- Ca: string (nullable = true)
 |-- Thal: string (nullable = true)
 |-- AHD: string (nullable = true)



In [7]:
# Basic Data Summary
df.describe().show()

+-------+-----------------+-----------------+-------------------+------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+------------------+----------+----+
|summary|              _c0|              Age|                Sex|   ChestPain|            RestBP|              Chol|               Fbs|           RestECG|             MaxHR|              ExAng|           Oldpeak|             Slope|                Ca|      Thal| AHD|
+-------+-----------------+-----------------+-------------------+------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+------------------+----------+----+
|  count|              303|              303|                303|         303|               303|               303|               303|               303|               303|                303|      

In [8]:
# Data Cleaning: Removing null values
df_clean = df.dropna()

In [9]:
# Aggregation Example: Group by a categorical column Age and compute averages
agg_df = df_clean.groupBy("Age").agg(count("Age").alias("count"),avg("Age").alias("average"),max("Age").alias("max"),min("Age").alias("min"))

In [10]:
# Show Aggregated Data
agg_df.show()

+---+-----+-------+---+---+
|Age|count|average|max|min|
+---+-----+-------+---+---+
| 65|    8|   65.0| 65| 65|
| 53|    8|   53.0| 53| 53|
| 34|    2|   34.0| 34| 34|
| 76|    1|   76.0| 76| 76|
| 44|   11|   44.0| 44| 44|
| 47|    5|   47.0| 47| 47|
| 52|   13|   52.0| 52| 52|
| 40|    3|   40.0| 40| 40|
| 57|   17|   57.0| 57| 57|
| 54|   16|   54.0| 54| 54|
| 48|    7|   48.0| 48| 48|
| 64|   10|   64.0| 64| 64|
| 41|   10|   41.0| 41| 41|
| 43|    8|   43.0| 43| 43|
| 37|    2|   37.0| 37| 37|
| 61|    8|   61.0| 61| 61|
| 35|    4|   35.0| 35| 35|
| 59|   14|   59.0| 59| 59|
| 55|    8|   55.0| 55| 55|
| 39|    4|   39.0| 39| 39|
+---+-----+-------+---+---+
only showing top 20 rows



In [11]:
# Aggregation Example: Group by a categorical column Sex and compute averages
agg_df = df_clean.groupBy("Sex").agg(count("Sex").alias("count"),avg("Sex").alias("average"),max("Sex").alias("max"),min("Sex").alias("min"))

In [12]:
# Show Aggregated Data
agg_df.show()

+---+-----+-------+---+---+
|Sex|count|average|max|min|
+---+-----+-------+---+---+
|  1|  206|    1.0|  1|  1|
|  0|   97|    0.0|  0|  0|
+---+-----+-------+---+---+



In [13]:
# Aggregation Example: Group by a categorical column RestBP and compute averages
agg_df = df_clean.groupBy("RestBP").agg(count("RestBP").alias("count"),avg("RestBP").alias("average"),max("RestBP").alias("max"),min("RestBP").alias("min"))

In [14]:
# Show Aggregated Data
agg_df.show()

+------+-----+-------+---+---+
|RestBP|count|average|max|min|
+------+-----+-------+---+---+
|   148|    2|  148.0|148|148|
|   155|    1|  155.0|155|155|
|   108|    6|  108.0|108|108|
|   115|    3|  115.0|115|115|
|   101|    1|  101.0|101|101|
|   126|    3|  126.0|126|126|
|   192|    1|  192.0|192|192|
|   128|   12|  128.0|128|128|
|   122|    4|  122.0|122|122|
|   140|   32|  140.0|140|140|
|   132|    8|  132.0|132|132|
|   152|    5|  152.0|152|152|
|   146|    2|  146.0|146|146|
|   142|    3|  142.0|142|142|
|   178|    2|  178.0|178|178|
|   164|    1|  164.0|164|164|
|    94|    2|   94.0| 94| 94|
|   120|   37|  120.0|120|120|
|   117|    1|  117.0|117|117|
|   154|    1|  154.0|154|154|
+------+-----+-------+---+---+
only showing top 20 rows



In [15]:
# Stop Spark Session
spark.stop()