In [None]:
import os
import re

import geopandas as gpd
import numpy as np
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    Interaction,
    OneHotEncoder,
    StringIndexer,
    VectorAssembler,
    VectorIndexer,
)
from pyspark.ml.regression import LinearRegression

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# Build the notebook-wide Spark session; getOrCreate() reuses an already
# running session instead of starting a second one.
spark = (
    SparkSession.builder.appName("MAST30034 Project 2 Preprocessing")
    # Memory requested for the driver process and for each executor.
    .config("spark.driver.memory", '4g')
    .config("spark.executor.memory", '8g')
    # Fix the SQL session time zone to UTC so timestamp handling does not
    # depend on the machine running the notebook.
    .config("spark.sql.session.timeZone", "Etc/UTC")
    # Turn off the vectorized Parquet reader.
    # NOTE(review): presumably a workaround for a read/decoding issue with the
    # project's Parquet files -- confirm it is still needed.
    .config("spark.sql.parquet.enableVectorizedReader","false")
    # Eager evaluation: a DataFrame left as the last expression of a cell is
    # rendered directly instead of showing only its schema repr.
    .config("spark.sql.repl.eagerEval.enabled", True) 
    # Cache Parquet schema metadata.
    # NOTE(review): this is a legacy option; verify the Spark version in use
    # still honours it.
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Write Parquet in the legacy (Spark 1.4 and earlier) layout.
    .config("spark.sql.parquet.writeLegacyFormat", 'true')
    .getOrCreate()
)

In [None]:
def regression_model(df, indexCols, categoricalCols, continuousCols, interactionCols, labelCol,
                     splitCol='datetime', splitDate='2019-06-01', regParam=0.3):
    """
    Fits a regularised linear regression on the rows before ``splitDate`` and
    returns the predictions for the rows on/after ``splitDate``.

    Feature construction (fitted on the whole of ``df``):
      * every column in ``categoricalCols`` is string-indexed (rows with
        unseen/invalid labels are skipped) and one-hot encoded;
      * every group in ``interactionCols`` becomes one interaction term built
        from the encoded versions of the columns in that group, named by
        joining the group's column names with ``_``;
      * three fixed kinds of interaction are always added:
        ``LocationID`` x ``months_lapsed`` and ``hour`` x each of
        ``prec``, ``snowfall`` and ``skin_temp``;
      * encoded columns, interaction terms and ``continuousCols`` are
        assembled into a single ``features`` vector.

    Because of the fixed interactions, ``LocationID`` and ``hour`` must be in
    ``categoricalCols`` (or ``df`` must already carry their
    ``*_indexed_encoded`` columns), and ``df`` must contain ``months_lapsed``,
    ``prec``, ``snowfall`` and ``skin_temp``.

    Parameters
    ----------
    df : Spark DataFrame holding every input column.
    indexCols : list of str
        Identifier columns carried through to the output unchanged.
    categoricalCols : list of str
        Columns to string-index and one-hot encode.
    continuousCols : list of str
        Numeric columns fed to the model as-is.
    interactionCols : list of list of str
        Groups of categorical columns to interact with each other.
    labelCol : str
        Column to predict; copied to a column named ``label``.
    splitCol : str, default 'datetime'
        Column used for the chronological train/test split.
    splitDate : str, default '2019-06-01'
        Rows with ``splitCol < splitDate`` train the model; rows with
        ``splitCol >= splitDate`` are predicted and returned.
    regParam : float, default 0.3
        Regularisation strength passed to ``LinearRegression``.

    Returns
    -------
    Spark DataFrame of the test rows with the columns ``indexCols``,
    ``features``, ``label``, ``indexedFeatures`` and the regression's
    prediction column.

    Raises
    ------
    ValueError
        If an interaction needs a column that is neither present in ``df`` nor
        produced from ``categoricalCols``.
    """
    hour_features = ['prec', 'snowfall', 'skin_temp']

    # Fail fast with a readable message: without this check a missing column
    # only surfaces as a Spark error in the middle of the pipeline fit.
    available = set(df.columns)
    available.update(f'{c}_indexed_encoded' for c in categoricalCols)
    available.update('_'.join(group) for group in interactionCols)
    required = (
        [f'{c}_indexed_encoded' for group in interactionCols for c in group]
        + ['LocationID_indexed_encoded', 'months_lapsed', 'hour_indexed_encoded']
        + hour_features
    )
    missing = sorted(set(required) - available)
    if missing:
        raise ValueError(
            "regression_model: interaction inputs are neither columns of df "
            f"nor produced from categoricalCols: {missing}"
        )

    indexers = [
        StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c), handleInvalid="skip")
        for c in categoricalCols
    ]

    encoders = [
        OneHotEncoder(inputCol=indexer.getOutputCol(),
                      outputCol="{0}_encoded".format(indexer.getOutputCol()))
        for indexer in indexers
    ]

    # Caller-specified categorical x categorical interactions.
    interactions = [
        Interaction(
            inputCols=[f'{col}_indexed_encoded' for col in group],
            outputCol='_'.join(group)
        )
        for group in interactionCols
    ]

    # Fixed interaction: one months_lapsed term per location.
    interactions.append(
        Interaction(
            inputCols=['LocationID_indexed_encoded', 'months_lapsed'],
            outputCol='LocID_months_lapsed'
        )
    )

    # Fixed interactions: one term per hour for each weather feature.
    interactions += [
        Interaction(
            inputCols=['hour_indexed_encoded', feature],
            outputCol=f'{feature}_hour'
        )
        for feature in hour_features
    ]

    assembler = VectorAssembler(
        inputCols=[encoder.getOutputCol() for encoder in encoders]
        + [interaction.getOutputCol() for interaction in interactions]
        + list(continuousCols),
        outputCol="features"
    )

    feature_pipeline = Pipeline(stages=indexers + encoders + interactions + [assembler])

    data = (
        feature_pipeline.fit(df)
        .transform(df)
        .withColumn('label', F.col(labelCol))
    )

    output_cols = list(indexCols) + ['features', 'label']

    # The indexer is fitted on all rows (train and test), so every feature
    # value it will meet at prediction time is already known to it.  It is
    # applied exactly once, as the first stage of the final pipeline.
    # Note: it only adds the `indexedFeatures` column; LinearRegression is
    # left on its default input column (`features`).
    featureIndexer = VectorIndexer(
        inputCol="features", outputCol="indexedFeatures"
    ).fit(data.select(output_cols))

    # Filter before projecting so the split column does not have to be listed
    # in indexCols.  Rows whose split value is null land in neither set.
    split_value = F.col(splitCol)
    training_sdf = data.where(split_value < splitDate).select(output_cols)
    testing_sdf = data.where(split_value >= splitDate).select(output_cols)

    lr = LinearRegression(regParam=regParam)

    pipeline = Pipeline(stages=[featureIndexer, lr])

    return pipeline.fit(training_sdf).transform(testing_sdf)