In [3]:
# Install Spark
!pip install pyspark
#!pip install pyspark[sql]

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=20479f488707faffb81f2ba9d10239d2345d12d1d262ef05123cdc0002691075
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


# Import Library

In [99]:
# import pyspark.sql classes and functions
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType
from pyspark.sql.functions import col, array_contains, isnan, when, count
from pyspark.sql.functions import lit, concat_ws, concat, collect_list, udf
from pyspark.sql.functions import countDistinct

# Reuse the active Spark session if one exists, otherwise start one named "answers".
spark = (
    SparkSession.builder
    .appName("answers")
    .getOrCreate()
)

In [100]:
# Print the kernel's current working directory.
!pwd

/kaggle/working


# Loading Data

In [101]:
# Colab alternative: "/content/sample_data/california_housing_train.csv"
path = "/kaggle/input/california-housing-train"

# First pass: read with a header row but no schema, then inspect the column types Spark assigns.
untyped_reader = spark.read.option("header",'True').option('delimiter', ',')
df = untyped_reader.csv(path)
df.printSchema()

root
 |-- longitude: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- housing_median_age: string (nullable = true)
 |-- total_rooms: string (nullable = true)
 |-- total_bedrooms: string (nullable = true)
 |-- population: string (nullable = true)
 |-- households: string (nullable = true)
 |-- median_income: string (nullable = true)
 |-- median_house_value: string (nullable = true)



In [102]:
# Declare all nine columns as nullable doubles so the CSV is parsed as numbers.
column_names = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "median_house_value",
]
schema = StructType([StructField(name, DoubleType(), True) for name in column_names])

# Colab alternative path: "/content/sample_data/california_housing_train.csv"
typed_reader = (
    spark.read.format("csv")
    .option("header", True)
    .option('delimiter', ',')
    .schema(schema)
)
df = typed_reader.load("/kaggle/input/california-housing-train")

df.printSchema()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)



## Problem 1, Question 1

In [103]:
# Preview the typed DataFrame; truncate=False prints full cell values instead of shortening them.
df.show(truncate=False)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|-114.31  |34.19   |15.0              |5612.0     |1283.0        |1015.0    |472.0     |1.4936       |66900.0           |
|-114.47  |34.4    |19.0              |7650.0     |1901.0        |1129.0    |463.0     |1.82         |80100.0           |
|-114.56  |33.69   |17.0              |720.0      |174.0         |333.0     |117.0     |1.6509       |85700.0           |
|-114.57  |33.64   |14.0              |1501.0     |337.0         |515.0     |226.0     |3.1917       |73400.0           |
|-114.57  |33.57   |20.0              |1454.0     |326.0         |624.0     |262.0     |1.925        |65500.0           |
|-114.58  |33.63   |29.0

In [104]:
from pyspark.sql.functions import avg

# Average number of households for each (longitude, latitude) pair.
location_groups = df.groupBy("longitude","latitude")
location_groups.agg(avg("households")).show()

+---------+--------+-----------------+
|longitude|latitude|  avg(households)|
+---------+--------+-----------------+
|  -116.09|   34.15|           1482.0|
|  -116.31|   33.66|            713.0|
|   -116.9|   33.22|            703.0|
|  -116.96|   32.86|            503.0|
|  -116.97|   32.76|            406.0|
|  -116.99|   33.77|           1689.0|
|  -117.03|   33.18|            830.0|
|  -117.19|   33.69|           1074.0|
|   -117.2|   34.24|            423.0|
|  -117.54|   33.82|             28.0|
|  -117.24|   33.77|           1290.0|
|  -117.35|   33.69|            190.0|
|  -117.51|   34.14|            327.0|
|   -117.6|   33.45|            435.0|
|  -117.61|   33.42|            775.0|
|  -117.69|    33.6|390.6666666666667|
|  -117.73|   34.07|498.3333333333333|
|  -117.82|    33.8|            595.0|
|  -117.96|   33.71|            518.0|
|   -118.0|   34.15|            648.0|
+---------+--------+-----------------+
only showing top 20 rows



## Problem 2, Question 2

In [105]:
import time
import pandas as pd
import numpy as np
# `time` is used in the cells below to record start and end timestamps around each benchmark.

In [106]:
# Time the Spark group-by.
# perf_counter() is a monotonic, high-resolution clock meant for measuring short
# durations; time.time() follows the wall clock and can jump if the system time changes.
# show() is the action that actually runs the lazy Spark plan, so it stays inside
# the timed region (the measurement therefore also includes printing the table).
start = time.perf_counter()
df.groupBy("longitude","latitude").agg(avg("households")).show()
end = time.perf_counter()
time_spark = end - start
print("The time taken by Spark is: ", time_spark)

+---------+--------+-----------------+
|longitude|latitude|  avg(households)|
+---------+--------+-----------------+
|  -116.09|   34.15|           1482.0|
|  -116.31|   33.66|            713.0|
|   -116.9|   33.22|            703.0|
|  -116.96|   32.86|            503.0|
|  -116.97|   32.76|            406.0|
|  -116.99|   33.77|           1689.0|
|  -117.03|   33.18|            830.0|
|  -117.19|   33.69|           1074.0|
|   -117.2|   34.24|            423.0|
|  -117.54|   33.82|             28.0|
|  -117.24|   33.77|           1290.0|
|  -117.35|   33.69|            190.0|
|  -117.51|   34.14|            327.0|
|   -117.6|   33.45|            435.0|
|  -117.61|   33.42|            775.0|
|  -117.69|    33.6|390.6666666666667|
|  -117.73|   34.07|498.3333333333333|
|  -117.82|    33.8|            595.0|
|  -117.96|   33.71|            518.0|
|   -118.0|   34.15|            648.0|
+---------+--------+-----------------+
only showing top 20 rows

The time taken by Spark is:  0.3425865

In [107]:
# Load the housing CSV into pandas for the Spark-vs-pandas comparison.
pandas_csv_path = "/kaggle/input/california-housing-train/california_housing_train.csv"
df_pandas = pd.read_csv(pandas_csv_path)
df_pandas

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15,5612,1283,1015,472,1.4936,66900
1,-114.47,34.40,19,7650,1901,1129,463,1.8200,80100
2,-114.56,33.69,17,720,174,333,117,1.6509,85700
3,-114.57,33.64,14,1501,337,515,226,3.1917,73400
4,-114.57,33.57,20,1454,326,624,262,1.9250,65500
...,...,...,...,...,...,...,...,...,...
16995,-124.26,40.58,52,2217,394,907,369,2.3571,111400
16996,-124.27,40.69,36,2349,528,1194,465,2.5179,79000
16997,-124.30,41.84,17,2677,531,1244,456,3.0313,103600
16998,-124.30,41.80,19,2672,552,1298,478,1.9797,85800


In [108]:
# Time the pandas group-by.
# perf_counter() is a monotonic, high-resolution clock meant for measuring short
# durations; time.time() follows the wall clock and is coarser on some platforms,
# which matters for an operation that finishes in a few milliseconds.
start = time.perf_counter()
df_pandas.groupby(['longitude', 'latitude'])['households'].mean()
end = time.perf_counter()
time_pandas = end - start
print("The time taken by pandas is : ", time_pandas)

The time taken by pandas is :  0.006796121597290039


In [109]:
# Report which engine needed more time for the group-by benchmark.
if time_pandas == time_spark:
    print("Time taken by spark and pandas is equal.")
elif time_pandas > time_spark:
    print("Time taken by pandas is more.")
else:
    print("Time taken by spark is more.")

Time taken by spark is more.


## Problem 3, Question 3

In [110]:
# One-row DataFrame holding the largest median_income in the dataset.
income_max_spec = {'median_income': 'max'}
max_med_income = df.agg(income_max_spec)
max_med_income.show()


+------------------+
|max(median_income)|
+------------------+
|           15.0001|
+------------------+



In [111]:
# Take the maximum from the one-row aggregate in max_med_income instead of
# hard-coding the number copied from its printed output; the filter then stays
# correct if the input data changes.
max_income_value = max_med_income.first()[0]
df2 = df.filter(df['median_income'] >= max_income_value)
df2.show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|  -117.23|   32.99|              17.0|     2718.0|         326.0|    1011.0|     319.0|      15.0001|          500001.0|
|  -117.87|   33.62|              15.0|     2209.0|         275.0|     735.0|     274.0|      15.0001|          500001.0|
|  -118.04|   34.13|              35.0|      249.0|          31.0|     268.0|      29.0|      15.0001|          500001.0|
|  -118.06|   33.72|              14.0|     2665.0|         331.0|     964.0|     319.0|      15.0001|          500001.0|
|  -118.12|   34.12|              52.0|     2907.0|         317.0|     956.0|     279.0|      15.0001|          500001.0|
|  -118.18|   34.19|    

In [112]:
# show() only prints and returns None, so assigning its result left df_final as None.
# Keep the selected DataFrame in df_final and display it as a separate step.
df_final = df2.select('longitude','latitude')
df_final.show()

+---------+--------+
|longitude|latitude|
+---------+--------+
|  -117.23|   32.99|
|  -117.87|   33.62|
|  -118.04|   34.13|
|  -118.06|   33.72|
|  -118.12|   34.12|
|  -118.18|   34.19|
|  -118.19|   34.19|
|  -118.32|   34.06|
|  -118.33|   34.07|
|  -118.33|   34.06|
|  -118.34|   34.08|
|  -118.34|   33.76|
|  -118.39|   34.08|
|   -118.4|   34.11|
|   -118.4|    34.1|
|   -118.4|   34.09|
|   -118.4|   34.08|
|  -118.41|   34.07|
|  -118.42|   34.08|
|  -118.42|   34.08|
+---------+--------+
only showing top 20 rows



## Problem 4, Question 4

In [113]:
#time taken by spark
# perf_counter() is a monotonic, high-resolution clock meant for interval timing.
start_time = time.perf_counter()
max_med_income = df.agg({'median_income': 'max'})
max_med_income.show()
# Use the computed maximum rather than the literal copied from the printed table.
# NOTE: first() runs one additional small Spark job inside the timed region.
max_income_value = max_med_income.first()[0]
df2 = df.filter(df['median_income'] >= max_income_value)
# show() returns None; keep the selected DataFrame in df_final and print it separately.
df_final = df2.select('longitude','latitude')
df_final.show(50)
end_time = time.perf_counter()
time_spark = end_time - start_time
print("The time taken by Spark is: ", time_spark)

+------------------+
|max(median_income)|
+------------------+
|           15.0001|
+------------------+

+---------+--------+
|longitude|latitude|
+---------+--------+
|  -117.23|   32.99|
|  -117.87|   33.62|
|  -118.04|   34.13|
|  -118.06|   33.72|
|  -118.12|   34.12|
|  -118.18|   34.19|
|  -118.19|   34.19|
|  -118.32|   34.06|
|  -118.33|   34.07|
|  -118.33|   34.06|
|  -118.34|   34.08|
|  -118.34|   33.76|
|  -118.39|   34.08|
|   -118.4|   34.11|
|   -118.4|    34.1|
|   -118.4|   34.09|
|   -118.4|   34.08|
|  -118.41|   34.07|
|  -118.42|   34.08|
|  -118.42|   34.08|
|  -118.43|   34.08|
|  -118.44|   34.09|
|  -118.49|   34.06|
|  -118.49|   34.05|
|   -118.5|   34.05|
|  -121.59|   37.19|
|  -121.87|   37.46|
|  -121.96|   37.13|
|  -122.14|    37.5|
|  -122.18|   37.46|
|   -122.2|   37.44|
|  -122.21|   37.46|
|  -122.22|   37.44|
|  -122.22|    37.4|
|  -122.27|   37.43|
|  -122.36|   37.56|
|  -122.44|   37.79|
|   -122.5|   37.79|
+---------+--------+

The time ta

In [118]:
# Display the pandas DataFrame loaded earlier (long frames are abbreviated with "..." rows).
df_pandas

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15,5612,1283,1015,472,1.4936,66900
1,-114.47,34.40,19,7650,1901,1129,463,1.8200,80100
2,-114.56,33.69,17,720,174,333,117,1.6509,85700
3,-114.57,33.64,14,1501,337,515,226,3.1917,73400
4,-114.57,33.57,20,1454,326,624,262,1.9250,65500
...,...,...,...,...,...,...,...,...,...
16995,-124.26,40.58,52,2217,394,907,369,2.3571,111400
16996,-124.27,40.69,36,2349,528,1194,465,2.5179,79000
16997,-124.30,41.84,17,2677,531,1244,456,3.0313,103600
16998,-124.30,41.80,19,2672,552,1298,478,1.9797,85800


In [119]:
#time taken by pandas
# perf_counter() is a monotonic, high-resolution clock meant for timing short operations.
start_time = time.perf_counter()
max_med = df_pandas['median_income'].max()
# Pick the matching rows and the two coordinate columns in a single .loc call.
df_final_pandas = df_pandas.loc[df_pandas['median_income'] == max_med, ['longitude','latitude']]
end_time = time.perf_counter()
time_pandas = end_time - start_time
print("The time taken by Pandas is: ", time_pandas)

The time taken by Pandas is:  0.0061283111572265625


In [120]:
# Pick the message describing which engine was slower for the max-income query, then print it.
verdict = (
    "Time taken by pandas is more." if time_pandas > time_spark
    else "Time taken by spark and pandas is equal." if time_pandas == time_spark
    else "Time taken by spark is more."
)
print(verdict)

Time taken by spark is more.


In [None]:
#148333 -> passcode  (NOTE(review): if this is a live credential, remove it from the notebook and rotate it)