In [12]:
# Imports
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# PySpark imports
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.stat
import Statistics


import warnings
warnings.filterwarnings('ignore')
%matplotlib inline


In [13]:
# 'local[*]' means use all available cores in the local machine
spark = SparkSession \
    .builder \
    .appName("houseRegression") \
    .master('local[*]') \
    .getOrCreate()

spark

In [14]:
train_fname = r'./dataset/train.csv'
test_fname = r'./dataset/test.csv'

df = spark.read.csv(train_fname, header=True, inferSchema=True,nullValue='NA').repartition(4).persist()
df_test = spark.read.csv(test_fname, header=True, inferSchema=True, nullValue='NA').repartition(4).persist()

df.count(), df_test.count()


(1460, 1459)

In [31]:
df.select('SalePrice').summary("count", "mean", "stddev",
                               "min", "5%", "25%", "50%", "75%", "95%", "max").show()


+-------+------------------+
|summary|         SalePrice|
+-------+------------------+
|  count|              1460|
|   mean|180921.19589041095|
| stddev| 79442.50288288662|
|    min|             34900|
|     5%|             88000|
|    25%|            129900|
|    50%|            163000|
|    75%|            214000|
|    95%|            326000|
|    max|            755000|
+-------+------------------+



In [29]:
# Convert MSSubClass to a string because it is a categorical field
df = df.withColumn('MSSubClass', F.col('MSSubClass').cast('string'))
df_test = df_test.withColumn('MSSubClass', F.col('MSSubClass').cast('string'))

In [55]:
assembler = VectorAssembler(inputCols=df.columns,
outputCol="features",handleInvalid='keep')
df2 = assembler.transform(df).select("features")

# correlation will be in Dense Matrix
correlation = Correlation.corr(df2,"features","pearson").collect()[0][0]

# To convert Dense Matrix into DataFrame
rows = correlation.toArray().tolist()
df2 = spark.createDataFrame(rows,df.columns)

IllegalArgumentException: Data type string of column MSSubClass is not supported.
Data type string of column MSZoning is not supported.
Data type string of column Street is not supported.
Data type string of column Alley is not supported.
Data type string of column LotShape is not supported.
Data type string of column LandContour is not supported.
Data type string of column Utilities is not supported.
Data type string of column LotConfig is not supported.
Data type string of column LandSlope is not supported.
Data type string of column Neighborhood is not supported.
Data type string of column Condition1 is not supported.
Data type string of column Condition2 is not supported.
Data type string of column BldgType is not supported.
Data type string of column HouseStyle is not supported.
Data type string of column RoofStyle is not supported.
Data type string of column RoofMatl is not supported.
Data type string of column Exterior1st is not supported.
Data type string of column Exterior2nd is not supported.
Data type string of column MasVnrType is not supported.
Data type string of column ExterQual is not supported.
Data type string of column ExterCond is not supported.
Data type string of column Foundation is not supported.
Data type string of column BsmtQual is not supported.
Data type string of column BsmtCond is not supported.
Data type string of column BsmtExposure is not supported.
Data type string of column BsmtFinType1 is not supported.
Data type string of column BsmtFinType2 is not supported.
Data type string of column Heating is not supported.
Data type string of column HeatingQC is not supported.
Data type string of column CentralAir is not supported.
Data type string of column Electrical is not supported.
Data type string of column KitchenQual is not supported.
Data type string of column Functional is not supported.
Data type string of column FireplaceQu is not supported.
Data type string of column GarageType is not supported.
Data type string of column GarageFinish is not supported.
Data type string of column GarageQual is not supported.
Data type string of column GarageCond is not supported.
Data type string of column PavedDrive is not supported.
Data type string of column PoolQC is not supported.
Data type string of column Fence is not supported.
Data type string of column MiscFeature is not supported.
Data type string of column SaleType is not supported.
Data type string of column SaleCondition is not supported.