In [None]:
# Create a spark session

!pip install pyspark


from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Dataframe").getOrCreate()


In [None]:
# prompt: Read California housing test

# Path of the California housing test CSV read by this notebook.
housing_csv_path = r"/content/sample_data/california_housing_test.csv"

# Configure the CSV reader: first line is the header, column types are inferred.
csv_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
)

# Load the file into the DataFrame used by every later cell.
df = csv_reader.load(housing_csv_path)


In [None]:
# prompt: DataFrame.agg

# Imported here so this cell runs on its own; F is the conventional alias
# for Spark's column functions.
from pyspark.sql import functions as F

# Compute the average, maximum, minimum and sum of 'housing_median_age' with
# a single agg() call, i.e. one Spark job, instead of one job (and one pass
# over the data) per statistic.
housing_median_age_stats = df.agg(
    F.avg("housing_median_age"),
    F.max("housing_median_age"),
    F.min("housing_median_age"),
    F.sum("housing_median_age"),
).collect()[0]

# An aggregation without grouping yields a single Row; unpack it in the same
# order the aggregate expressions were listed above.
(
    average_housing_median_age,
    max_housing_median_age,
    min_housing_median_age,
    sum_housing_median_age,
) = housing_median_age_stats

# Print the results
print("Average housing median age:", average_housing_median_age)
print("Maximum housing median age:", max_housing_median_age)
print("Minimum housing median age:", min_housing_median_age)
print("Sum of housing median age:", sum_housing_median_age)


In [None]:
# prompt: DataFrame.approxQuantile

# Approximate quartiles of 'housing_median_age'. The last argument (0.1) is
# the relative error handed to approxQuantile, so the values are estimates.
quantiles = df.approxQuantile("housing_median_age", [0.25, 0.5, 0.75], 0.1)

# Print each result next to its label, in the order the probabilities were
# requested.
quantile_labels = ("25th percentile:", "50th percentile:", "75th percentile:")
for position, label in enumerate(quantile_labels):
    print(label, quantiles[position])


In [None]:
# prompt: DataFrame.cache

# Mark df for caching so later actions can reuse its computed rows. As the
# last expression of the cell, the returned DataFrame is also echoed.
# NOTE(review): the following cell rebinds the name `df` to a repartitioned
# DataFrame; that new DataFrame is not itself marked as cached.
df.cache()


In [None]:
# prompt: repartition

# Redistribute the rows across 10 partitions and rebind `df` to the result.
# NOTE(review): because the name `df` is overwritten, every later cell works
# on the repartitioned DataFrame rather than on the one loaded from the CSV.
df = df.repartition(10)


In [None]:
# prompt: DataFrame.coalesce

# Shrink the DataFrame down to 5 partitions
df_coalesced = df.coalesce(5)

# Look up the resulting partition count on the underlying RDD and report it
coalesced_partition_count = df_coalesced.rdd.getNumPartitions()
print("Number of partitions after coalesce:", coalesced_partition_count)


In [None]:
# prompt: DataFrame.collect

# Collect the DataFrame: collect() brings every row back to the driver as a
# list of Row objects, so it should only be used on data that fits in driver
# memory.
collected_df = df.collect()

# Print a bounded preview of the collected data instead of looping over all
# rows without doing anything with them.
for row in collected_df[:5]:
    print(row)


In [None]:
# prompt: DataFrame.columns

# Evaluate df.columns as the cell's last expression so the notebook displays
# the DataFrame's column names.
df.columns


In [None]:
# prompt: DataFrame.corr

# Correlation between the 'total_rooms' and 'median_income' columns
correlation_val = df.corr(col1="total_rooms", col2="median_income")

# Show the resulting coefficient
print(correlation_val)


In [None]:
# prompt: DataFrame.count

# Count the rows of the DataFrame (this triggers a Spark action) ...
number_of_rows = df.count()

# ... and report the total
print("Number of rows:", number_of_rows)


In [None]:
# prompt: DataFrame.cov

# Covariance between the 'total_rooms' and 'median_income' columns
covariance_total_rooms_median_income = df.cov(
    col1="total_rooms",
    col2="median_income",
)

# Report the computed covariance
print(
    "Covariance of total_rooms and median_income:",
    covariance_total_rooms_median_income,
)


In [None]:
# prompt: createOrReplaceGlobalTempView

# Register df as a global temporary view named "global_temp_view", replacing
# any existing global temp view of that name (so the cell can be re-run).
# NOTE(review): per the Spark SQL docs, global temp views are queried through
# the `global_temp` database, e.g. `global_temp.global_temp_view` — confirm
# before relying on the bare name in SQL.
df.createOrReplaceGlobalTempView("global_temp_view")


In [None]:
# DataFrame.describe

# describe() is lazy: it only returns another DataFrame, so a bare
# `df.describe()` just echoes that DataFrame's schema. Call show() to actually
# compute and print the summary statistics for the columns.
df.describe().show()


In [None]:
# prompt: DataFrame.cube

# Create a cube of the DataFrame. cube() only defines the grouping over the
# two columns and returns a GroupedData object; nothing is computed until an
# aggregation is applied to it.
df_cube = df.cube("housing_median_age", "total_rooms")

# Apply an aggregation so the cube produces a result, and preview a few rows
# instead of displaying the bare GroupedData object.
df_cube.count().show(10)


In [None]:
# prompt: DataFrame.dropna

# Columns used by the column-restricted variants below
null_check_columns = ["housing_median_age", "total_rooms"]

# Drop every row that has a null in any column
df_without_nulls = df.dropna()

# Drop only the rows that have a null in one of the selected columns
df_without_nulls_specific_cols = df.dropna(subset=null_check_columns)

# Keep all rows and fill nulls with 0 (fillna does not drop anything)
df_with_filled_nulls = df.fillna(0)

# Keep all rows and fill nulls with 0 in the selected columns only
df_with_filled_nulls_specific_cols = df.fillna(0, subset=null_check_columns)

