In [1]:
# prompt: install pyspark

!apt-get install openjdk-8-jre -y
!pip install pyspark


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
openjdk-8-jre is already the newest version (8u392-ga-1~22.04).
0 upgraded, 0 newly installed, 0 to remove and 31 not upgraded.


In [2]:
# Create a spark Session and Name the application

from pyspark.sql import SparkSession
from google.colab import output

spark =(
    SparkSession.builder
    .master('local')
    .appName("FirstApplication")
    .getOrCreate()
)

In [3]:
# View the active sessions

SparkSession.active()
SparkSession.getActiveSession()

In [4]:
# Create a new dataframe from spread sheet

california_dataframe = spark.read.csv(r'/content/sample_data/california_housing_test.csv',header=True)
california_dataframe_train = spark.read.csv(r'/content/sample_data/california_housing_train.csv',header=True)

In [5]:
# View content

california_dataframe.show()
california_dataframe_train.show()

+-----------+---------+------------------+-----------+--------------+-----------+-----------+-------------+------------------+
|  longitude| latitude|housing_median_age|total_rooms|total_bedrooms| population| households|median_income|median_house_value|
+-----------+---------+------------------+-----------+--------------+-----------+-----------+-------------+------------------+
|-122.050000|37.370000|         27.000000|3885.000000|    661.000000|1537.000000| 606.000000|     6.608500|     344700.000000|
|-118.300000|34.260000|         43.000000|1510.000000|    310.000000| 809.000000| 277.000000|     3.599000|     176500.000000|
|-117.810000|33.780000|         27.000000|3589.000000|    507.000000|1484.000000| 495.000000|     5.793400|     270500.000000|
|-118.360000|33.820000|         28.000000|  67.000000|     15.000000|  49.000000|  11.000000|     6.135900|     330000.000000|
|-119.670000|36.330000|         19.000000|1241.000000|    244.000000| 850.000000| 237.000000|     2.937500|    

In [6]:
# List Columns

california_dataframe.columns

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value']

In [7]:
# List Datatypes

california_dataframe.dtypes

[('longitude', 'string'),
 ('latitude', 'string'),
 ('housing_median_age', 'string'),
 ('total_rooms', 'string'),
 ('total_bedrooms', 'string'),
 ('population', 'string'),
 ('households', 'string'),
 ('median_income', 'string'),
 ('median_house_value', 'string')]

In [8]:
# List Schema

california_dataframe.schema

StructType([StructField('longitude', StringType(), True), StructField('latitude', StringType(), True), StructField('housing_median_age', StringType(), True), StructField('total_rooms', StringType(), True), StructField('total_bedrooms', StringType(), True), StructField('population', StringType(), True), StructField('households', StringType(), True), StructField('median_income', StringType(), True), StructField('median_house_value', StringType(), True)])

In [9]:
# Print Schema

california_dataframe.printSchema()

root
 |-- longitude: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- housing_median_age: string (nullable = true)
 |-- total_rooms: string (nullable = true)
 |-- total_bedrooms: string (nullable = true)
 |-- population: string (nullable = true)
 |-- households: string (nullable = true)
 |-- median_income: string (nullable = true)
 |-- median_house_value: string (nullable = true)



In [10]:
# Show firt row

california_dataframe.show(1)

+-----------+---------+------------------+-----------+--------------+-----------+----------+-------------+------------------+
|  longitude| latitude|housing_median_age|total_rooms|total_bedrooms| population|households|median_income|median_house_value|
+-----------+---------+------------------+-----------+--------------+-----------+----------+-------------+------------------+
|-122.050000|37.370000|         27.000000|3885.000000|    661.000000|1537.000000|606.000000|     6.608500|     344700.000000|
+-----------+---------+------------------+-----------+--------------+-----------+----------+-------------+------------------+
only showing top 1 row



In [11]:
# Show first row in vertial format

california_dataframe.show(1,vertical=True)

-RECORD 0---------------------------
 longitude          | -122.050000   
 latitude           | 37.370000     
 housing_median_age | 27.000000     
 total_rooms        | 3885.000000   
 total_bedrooms     | 661.000000    
 population         | 1537.000000   
 households         | 606.000000    
 median_income      | 6.608500      
 median_house_value | 344700.000000 
only showing top 1 row



In [12]:
# Convert Dataframe in to pandas dataframe

california_pandas = california_dataframe.toPandas()

In [13]:
# Stats of Dataframe

california_pandas.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
unique,607.0,587.0,52.0,2215.0,1055.0,1802.0,1026.0,2578.0,1784.0
top,-118.21,34.02,52.0,907.0,314.0,870.0,273.0,15.0001,500001.0
freq,26.0,35.0,173.0,5.0,15.0,7.0,12.0,9.0,125.0


In [14]:
# Use pandas Functions

california_pandas.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0


In [15]:
# Create temproary table using Dataframe

california_dataframe.createOrReplaceTempView("California")

In [16]:
# Use SQL commands

spark.sql("select count(*) from California").show()

+--------+
|count(1)|
+--------+
|    3000|
+--------+



In [17]:
# To view Spark UI

output.serve_kernel_port_as_window(4040, path='/jobs/index.html')

<IPython.core.display.Javascript object>