In [3]:
from pyspark.sql import SparkSession

# Team identifier (pattern: "teamN"); it is embedded in the Spark application name.
team = 'team38'

# Location of the Hive database in HDFS, handed to Spark as spark.sql.warehouse.dir.
warehouse = "project/hive/warehouse"

# Build (or reuse) a Hive-enabled session that runs on YARN. The builder chain is
# wrapped in parentheses so no backslash line continuations are required.
spark = (
    SparkSession.builder
    .appName("{} - spark ML".format(team))
    .master("yarn")
    .config("hive.metastore.uris", "thrift://hadoop-02.uni.innopolis.ru:9883")
    .config("spark.sql.warehouse.dir", warehouse)
    .config("spark.sql.avro.compression.codec", "snappy")
    .enableHiveSupport()
    .getOrCreate()
)

In [8]:
# Switch to the team's Hive database. The name is derived from `team` (set in the
# session-setup cell) so the team number is configured in exactly one place.
spark.sql("USE {}_projectdb".format(team)).show()

# Preview two rows to confirm the table is reachable and to see its columns.
spark.sql("SELECT * FROM housing_data_part_buck LIMIT 2").show()

++
||
++
++

+-----+-----------+-----------------+----------------+----------------+---------------+-----+-----------+------------+---------------+----------------+-------------+-------------------+------------------+-----------+---------------+----------------------+-----------------------+---------+
|   id|      price|apartment_type_id|metro_station_id|minutes_to_metro|number_of_rooms| area|living_area|kitchen_area|apartment_floor|number_of_floors|renovation_id|apartment_type_name|metro_station_name|region_name|renovation_name|metro_station_latitude|metro_station_longitude|region_id|
+-----+-----------+-----------------+----------------+----------------+---------------+-----+-----------+------------+---------------+----------------+-------------+-------------------+------------------+-----------+---------------+----------------------+-----------------------+---------+
|14399|1.5528071E7|                1|              93|            16.0|            2.0|66.73|       33.9|        15.2

In [9]:
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression, GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Read every row and column of the housing table from the active Hive database.
df = spark.sql("SELECT * FROM housing_data_part_buck")

# Randomly partition the rows into train / test with 80% / 20% weights.
# The seed is fixed so the sampling uses the same random stream on every run.
split_weights = [0.8, 0.2]
train_df, test_df = df.randomSplit(split_weights, seed=42)

In [10]:
# Columns treated as categories: each one is string-indexed and then one-hot encoded.
# NOTE(review): the *_id / *_name pairs (apartment_type, renovation) look like two
# spellings of the same category, which would produce duplicated one-hot columns;
# confirm against the table and keep only one column of each pair if so.
categorical_columns = [
    'apartment_type_id', 'number_of_rooms', 'apartment_floor', 'number_of_floors',
    'renovation_id', 'apartment_type_name', 'region_name', 'renovation_name'
]

# One StringIndexer per categorical column, writing to "<column>_index".
# handleInvalid="keep" routes labels that were not seen during fit into an extra
# bucket instead of raising, so the test split can contain unseen categories.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index", handleInvalid="keep")
    for column in categorical_columns
]

# One OneHotEncoder per indexed column, writing to "<column>_encoded".
encoders = [
    OneHotEncoder(inputCol=column + "_index", outputCol=column + "_encoded")
    for column in categorical_columns
]

# Numeric inputs passed to the assembler as-is.
# The row identifier `id` is intentionally NOT listed: it is a surrogate key, not a
# property of the apartment, and using it as a predictor only lets the model fit
# row order. Dropping it makes the assembled feature vector one element narrower.
# NOTE(review): region_id is presumably a category code rather than a magnitude,
# and region_name is already one-hot encoded above; confirm whether it belongs here.
numeric_columns = [
    'minutes_to_metro', 'area', 'living_area', 'kitchen_area',
    'metro_station_latitude', 'metro_station_longitude', 'region_id'
]

# Concatenate all one-hot vectors and numeric columns into a single "features" vector.
assembler_inputs = [column + "_encoded" for column in categorical_columns] + numeric_columns
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

# Target variable
label_column = "price"

In [11]:
# Standardise the assembled "features" vector into "scaledFeatures":
# centre each component on its mean and scale it to unit standard deviation.
# NOTE(review): per the Spark docs, withMean=True builds a dense output, so the
# sparse one-hot columns are densified here; confirm memory is acceptable.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withStd=True,
    withMean=True,
)


In [12]:
# Preprocessing stages in execution order: index -> one-hot encode -> assemble -> scale.
preprocessing_stages = [*indexers, *encoders, assembler, scaler]
pipeline = Pipeline(stages=preprocessing_stages)

# Fit on the training split only; the fitted model is then applied to both splits.
pipeline_model = pipeline.fit(train_df)

# Keep just the scaled feature vector and the target column for modelling.
model_input_columns = ["scaledFeatures", label_column]
train_transformed = pipeline_model.transform(train_df).select(*model_input_columns)
test_transformed = pipeline_model.transform(test_df).select(*model_input_columns)