In [29]:
from pyspark.sql import SparkSession

In [23]:
# Obtain the Spark entry point: getOrCreate() hands back the already-running
# session when there is one and only builds a new session otherwise.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [4]:
# Load the house-price CSV. The first line supplies the column names
# (header=True) and Spark derives the column types from the data
# (inferSchema=True) instead of reading every column as a string.
df = spark.read.csv(
    "house_price_prediction_dataset.csv",
    header=True,
    inferSchema=True,
)

In [5]:
# Register the DataFrame with Spark SQL under the view name "houses" so it can
# be queried through spark.sql(); an existing view of that name is replaced.
df.createOrReplaceTempView(name="houses")

##### SCOPE
Here I reproduce each request twice: once with Spark DataFrame methods and once with SQL queries. The requests, taken from the `house_price_simple_analysis.py` file, are:
01. Schema of the dataset.
02. Row count of the dataset.
03. Column count of the dataset.
04. Descriptive statistics (describe).
05. Min of descriptive stats.
06. Max of descriptive stats.
07. Null check for some columns.
08. Fill null rows.
09. Bedrooms greater than three.
10. Year built < 2010.
11. Location count.
12. Price order by location.
13. Price per square.
14. Order by price.
15. Price by area.
16. Adding has_garage column to dataset.<br><br><br>
***  
##### TODO

- [x] Schema of the dataset.
- [x] Row count of the dataset.
- [x] Column count of the dataset.
- [x] Descriptive statistics (describe).
- [x] Min of descriptive stats.
- [x] Max of descriptive stats.
- [ ] Null check for some columns.
- [ ] Fill null rows.
- [ ] Bedrooms greater than three.
- [ ] Year built < 2010.
- [ ] Location count.
- [ ] Price order by location.
- [ ] Price per square.
- [ ] Order by price.
- [ ] Price by area.
- [ ] Adding has_garage column to dataset.

In [12]:
# printSchema() only writes the schema tree to stdout and returns None, so
# binding its result would leave the variable empty. Keep the real schema
# object (df.schema) in the variable and print the tree as a separate step.
schema_of_dataset_py = df.schema
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- Area: integer (nullable = true)
 |-- Bedrooms: integer (nullable = true)
 |-- Bathrooms: integer (nullable = true)
 |-- Floors: integer (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- Location: string (nullable = true)
 |-- Condition: string (nullable = true)
 |-- Garage: string (nullable = true)
 |-- Price: integer (nullable = true)



In [13]:
# show() prints the table and returns None, so chaining it onto the assignment
# would store None. Keep the DataFrame produced by the DESCRIBE query in the
# variable and display it in a separate step.
schema_of_dataset_sql = spark.sql("DESCRIBE houses;")
schema_of_dataset_sql.show()

+---------+---------+-------+
| col_name|data_type|comment|
+---------+---------+-------+
|       Id|      int|   NULL|
|     Area|      int|   NULL|
| Bedrooms|      int|   NULL|
|Bathrooms|      int|   NULL|
|   Floors|      int|   NULL|
|YearBuilt|      int|   NULL|
| Location|   string|   NULL|
|Condition|   string|   NULL|
|   Garage|   string|   NULL|
|    Price|      int|   NULL|
+---------+---------+-------+



In [33]:
# count() is a Spark action: it runs a job and returns the number of rows as
# a plain Python int, which is then printed.
count_of_dataset_py = df.count()
print("Row Count of the Dataset: ", count_of_dataset_py)

Row Count of the Dataset:  2000


In [19]:
# Row count via SQL over the "houses" view.
# show() returns None, so the query result is kept in the variable first and
# displayed afterwards; otherwise the variable would only ever hold None.
count_of_dataset_sql = spark.sql(
    """
    SELECT COUNT(*) AS row_count FROM houses;
    """
)
count_of_dataset_sql.show()

+---------+
|row_count|
+---------+
|     2000|
+---------+



In [34]:
# df.columns is the list of column names, so its length is the column count.
column_count_of_dataset_py = len(df.columns)
print("Column Count of the Dataset: ", column_count_of_dataset_py)

Column Count of the Dataset:  10


In [29]:
# Column count via SQL: ARRAY(*) packs every column of a row into one array,
# so SIZE(...) of that array is the number of columns; LIMIT 1 keeps a single
# output row.
# show() returns None, so the result DataFrame is stored first and displayed
# afterwards; otherwise the variable would only ever hold None.
column_count_of_dataset_sql = spark.sql(
    """
    SELECT SIZE(ARRAY(*)) AS column_count FROM houses LIMIT 1;
    """
)
column_count_of_dataset_sql.show()

+------------+
|column_count|
+------------+
|          10|
+------------+



In [14]:
# Summary statistics (count, mean, stddev, min, max) for the listed columns.
described_columns = ["Bedrooms", "Bathrooms", "Floors", "YearBuilt", "Price"]
df.describe(*described_columns).show()

24/12/17 20:30:57 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+-----------------+------------------+------------------+-----------------+-----------------+
|summary|         Bedrooms|         Bathrooms|            Floors|        YearBuilt|            Price|
+-------+-----------------+------------------+------------------+-----------------+-----------------+
|  count|             2000|              2000|              2000|             2000|             2000|
|   mean|           3.0035|            2.5525|            1.9935|         1961.446|       537676.855|
| stddev|1.424606086344792|1.1089899365366986|0.8091879525618783|35.92669547458914|276428.8457191392|
|    min|                1|                 1|                 1|             1900|            50005|
|    max|                5|                 4|                 3|             2023|           999656|
+-------+-----------------+------------------+------------------+-----------------+-----------------+



In [22]:
# SQL counterpart of df.describe(): one single-row aggregate SELECT per
# column, stacked into one result set.
#
# The SELECTs are generated from a single template so the five statements
# cannot drift apart. The column names come from the fixed list below, never
# from user input, so formatting them into the SQL text is safe.
#
# UNION ALL replaces plain UNION: UNION additionally removes duplicate rows,
# which adds a distinct step that cannot change the result here because every
# row carries a different 'summary' label.
summary_columns = ["Bedrooms", "Bathrooms", "Floors", "YearBuilt", "Price"]
summary_select_template = """
    SELECT
        '{column}' AS summary,
        COUNT({column}) AS count,
        AVG({column}) AS mean,
        STDDEV({column}) AS stddev,
        MIN({column}) AS min,
        MAX({column}) AS max
    FROM houses
"""
summary_query = "    UNION ALL".join(
    summary_select_template.format(column=column) for column in summary_columns
)
spark.sql(summary_query).show()

+---------+-----+----------+------------------+-----+------+
|  summary|count|      mean|            stddev|  min|   max|
+---------+-----+----------+------------------+-----+------+
| Bedrooms| 2000|    3.0035| 1.424606086344792|    1|     5|
|Bathrooms| 2000|    2.5525|1.1089899365366986|    1|     4|
|   Floors| 2000|    1.9935|0.8091879525618783|    1|     3|
|YearBuilt| 2000|  1961.446| 35.92669547458914| 1900|  2023|
|    Price| 2000|537676.855| 276428.8457191392|50005|999656|
+---------+-----+----------+------------------+-----+------+



In [32]:
# Column-wise minimum: groupBy() without keys aggregates over the whole
# DataFrame, so min() produces a single row with one value per column.
min_columns = ["Bedrooms", "Bathrooms", "Floors", "YearBuilt", "Price"]
df.select(*min_columns).groupBy().min(*min_columns).show()

+-------------+--------------+-----------+--------------+----------+
|min(Bedrooms)|min(Bathrooms)|min(Floors)|min(YearBuilt)|min(Price)|
+-------------+--------------+-----------+--------------+----------+
|            1|             1|          1|          1900|     50005|
+-------------+--------------+-----------+--------------+----------+



In [33]:
# Minimum of each listed column, computed with one SQL aggregate over the
# "houses" view.
min_values_sql = spark.sql(
    """
    SELECT MIN(Bedrooms), MIN(Bathrooms), MIN(Floors), MIN(YearBuilt), MIN(Price) FROM houses;
    """
)
min_values_sql.show()

+-------------+--------------+-----------+--------------+----------+
|min(Bedrooms)|min(Bathrooms)|min(Floors)|min(YearBuilt)|min(Price)|
+-------------+--------------+-----------+--------------+----------+
|            1|             1|          1|          1900|     50005|
+-------------+--------------+-----------+--------------+----------+



In [34]:
# Column-wise maximum: groupBy() without keys aggregates over the whole
# DataFrame, so max() produces a single row with one value per column.
max_columns = ["Bedrooms", "Bathrooms", "Floors", "YearBuilt", "Price"]
df.select(*max_columns).groupBy().max(*max_columns).show()

+-------------+--------------+-----------+--------------+----------+
|max(Bedrooms)|max(Bathrooms)|max(Floors)|max(YearBuilt)|max(Price)|
+-------------+--------------+-----------+--------------+----------+
|            5|             4|          3|          2023|    999656|
+-------------+--------------+-----------+--------------+----------+



In [35]:
# Maximum of each listed column, computed with one SQL aggregate over the
# "houses" view.
max_values_sql = spark.sql(
    """
    SELECT MAX(Bedrooms), MAX(Bathrooms), MAX(Floors), MAX(YearBuilt), MAX(Price) FROM houses;
    """
)
max_values_sql.show()

+-------------+--------------+-----------+--------------+----------+
|max(Bedrooms)|max(Bathrooms)|max(Floors)|max(YearBuilt)|max(Price)|
+-------------+--------------+-----------+--------------+----------+
|            5|             4|          3|          2023|    999656|
+-------------+--------------+-----------+--------------+----------+



In [40]:
# Null check on the Id column: keep only the rows whose Id is NULL and display
# them (an empty table means no Id is missing).
rows_with_null_id = df.filter(df["Id"].isNull())
rows_with_null_id.show()

+---+----+--------+---------+------+---------+--------+---------+------+-----+
| Id|Area|Bedrooms|Bathrooms|Floors|YearBuilt|Location|Condition|Garage|Price|
+---+----+--------+---------+------+---------+--------+---------+------+-----+
+---+----+--------+---------+------+---------+--------+---------+------+-----+

