In [1]:
from pyspark.sql import SparkSession

In [2]:
MAX_MEMORY = "5g"
spark = SparkSession.builder.appName("taxi-analysis")\
    .config("spark.executer.memory", MAX_MEMORY)\
    .config("spark.drive.memory", MAX_MEMORY).getOrCreate()

23/02/08 19:32:46 WARN Utils: Your hostname, imhaneul-ui-MacBookPro.local resolves to a loopback address: 127.0.0.1; using 172.30.1.22 instead (on interface en0)
23/02/08 19:32:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/08 19:32:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/02/08 19:32:48 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
trip_files = "/Users/imhaneul/Documents/sky-laboratory/spark-distribute/data/trips/*"

In [4]:
trips_df = spark.read.parquet(f"file:///{trip_files}")

                                                                                

In [5]:
trips_df.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [6]:
trips_df.createOrReplaceTempView("trips")

In [7]:
qs = """
SELECT
    passenger_count,
    PULocationID as pickup_location_id,
    DOLocationID as dropoff_location_id,
    trip_distance,
    HOUR(tpep_pickup_datetime) as pickup_time,
    DATE_FORMAT(tpep_pickup_datetime, 'EEEE') as day_of_week, 
    total_amount
FROM 
    trips
WHERE
    total_amount < 5000
    AND total_amount > 0
    AND trip_distance > 0
    AND trip_distance < 500
    AND passenger_count < 4
    AND TO_DATE(tpep_pickup_datetime) >= '2021-01-01'
    AND TO_DATE(tpep_pickup_datetime) < '2021-08-01'
"""
data_df = spark.sql(qs)
data_df.createOrReplaceTempView("data")

In [8]:
data_df.show()

[Stage 1:>                                                          (0 + 1) / 1]

+---------------+------------------+-------------------+-------------+-----------+-----------+------------+
|passenger_count|pickup_location_id|dropoff_location_id|trip_distance|pickup_time|day_of_week|total_amount|
+---------------+------------------+-------------------+-------------+-----------+-----------+------------+
|            0.0|               138|                265|         16.5|          9|     Monday|       70.07|
|            1.0|                68|                264|         1.13|          9|     Monday|       11.16|
|            1.0|               239|                262|         2.68|          9|     Monday|       18.59|
|            1.0|               186|                 91|         12.4|          9|     Monday|        43.8|
|            2.0|               132|                265|          9.7|          9|     Monday|        32.3|
|            1.0|               138|                141|          9.3|          9|     Monday|       43.67|
|            1.0|           

                                                                                

In [9]:
data_df.printSchema()

root
 |-- passenger_count: double (nullable = true)
 |-- pickup_location_id: long (nullable = true)
 |-- dropoff_location_id: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_time: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- total_amount: double (nullable = true)



In [10]:
train_df, test_df = data_df.randomSplit([0.8, 0.2], seed=2023)

In [11]:
data_dir = "/Users/imhaneul/Documents/sky-laboratory/spark-distribute/data"

In [12]:
# train_df.write.format("parquet").save(f"{data_dir}/train/")
# test_df.write.format("parquet").save(f"{data_dir}/test/")

In [13]:
train_df = spark.read.parquet(f"{data_dir}/train/")
test_df = spark.read.parquet(f"{data_dir}/test/")

In [14]:
train_df.printSchema()

root
 |-- passenger_count: double (nullable = true)
 |-- pickup_location_id: long (nullable = true)
 |-- dropoff_location_id: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_time: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- total_amount: double (nullable = true)



### stringIndexer 를 이용해서 string -> int -> onehotencoing

In [15]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

cat_feature = [
    "pickup_location_id",
    "dropoff_location_id",
    "day_of_week"
]
stages = []

for c in cat_feature:
    cat_indexer = StringIndexer(inputCol=c, outputCol=c+"_idx").setHandleInvalid("keep")
    onehot_encoder = OneHotEncoder(inputCols=[cat_indexer.getOutputCol()], outputCols=[c+"_onehot"])
    stages += [cat_indexer, onehot_encoder]

### VectorAssembler -> StandartScala  Scarla Normalization

In [16]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

num_feature = [
    "passenger_count",
    "trip_distance",
    "pickup_time"
]
for n in num_feature:
    num_assembler = VectorAssembler(inputCols=[n], outputCol=n+"_vector")
    num_scaler = StandardScaler(inputCol=num_assembler.getOutputCol(), outputCol=n+"_scaled")
    stages += [num_assembler, num_scaler]

In [17]:
assembler_inputs = [c+"_onehot" for c in cat_feature] + [n+"_scaled" for n in num_feature]
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="feature_vector")
stages += [assembler]

In [18]:
stages

[StringIndexer_2ae8a176a5b7,
 OneHotEncoder_09d3c08d98be,
 StringIndexer_23943f658660,
 OneHotEncoder_1f39b423d1f1,
 StringIndexer_f8dda883124f,
 OneHotEncoder_e9f568846756,
 VectorAssembler_e37cbee68dd0,
 StandardScaler_80f4ff9d7a7b,
 VectorAssembler_43fb4561ac13,
 StandardScaler_de43949a36d9,
 VectorAssembler_493b7994993e,
 StandardScaler_7c34e54b0423,
 VectorAssembler_7736a6464d12]

In [19]:
from pyspark.ml.pipeline import Pipeline

transform_stage = stages
pipeline = Pipeline(stages=transform_stage)
fitted_transformer = pipeline.fit(train_df)


                                                                                

In [20]:
vtrain_df = fitted_transformer.transform(train_df)

In [21]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(
    maxIter=50,
    solver="normal",
    labelCol="total_amount",
    featuresCol="feature_vector"
)

In [22]:
model = lr.fit(vtrain_df)

23/02/08 19:33:15 WARN Instrumentation: [7256c1e2] regParam is zero, which might cause numerical instability and overfitting.




23/02/08 19:33:28 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/02/08 19:33:28 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


                                                                                

23/02/08 19:33:37 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
23/02/08 19:33:37 WARN Instrumentation: [7256c1e2] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
23/02/08 19:33:37 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/02/08 19:33:37 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


                                                                                

In [23]:
vtest_df = fitted_transformer.transform(test_df)

In [24]:
pred = model.transform(vtest_df)

In [25]:
pred.cache()

DataFrame[passenger_count: double, pickup_location_id: bigint, dropoff_location_id: bigint, trip_distance: double, pickup_time: int, day_of_week: string, total_amount: double, pickup_location_id_idx: double, pickup_location_id_onehot: vector, dropoff_location_id_idx: double, dropoff_location_id_onehot: vector, day_of_week_idx: double, day_of_week_onehot: vector, passenger_count_vector: vector, passenger_count_scaled: vector, trip_distance_vector: vector, trip_distance_scaled: vector, pickup_time_vector: vector, pickup_time_scaled: vector, feature_vector: vector, prediction: double]

In [26]:
pred.show()

[Stage 26:>                                                         (0 + 1) / 1]

23/02/08 19:35:02 WARN MemoryStore: Not enough space to cache rdd_94_0 in memory! (computed 252.3 MiB so far)
23/02/08 19:35:02 WARN BlockManager: Persisting block rdd_94_0 to disk instead.
23/02/08 19:35:05 WARN MemoryStore: Not enough space to cache rdd_94_0 in memory! (computed 252.3 MiB so far)
+---------------+------------------+-------------------+-------------+-----------+-----------+------------+----------------------+-------------------------+-----------------------+--------------------------+---------------+------------------+----------------------+----------------------+--------------------+--------------------+------------------+--------------------+--------------------+------------------+
|passenger_count|pickup_location_id|dropoff_location_id|trip_distance|pickup_time|day_of_week|total_amount|pickup_location_id_idx|pickup_location_id_onehot|dropoff_location_id_idx|dropoff_location_id_onehot|day_of_week_idx|day_of_week_onehot|passenger_count_vector|passenger_count_scaled|t

                                                                                

In [29]:
pred.select(["trip_distance", "day_of_week", "total_amount", "prediction"]).show()

23/02/08 19:36:12 WARN MemoryStore: Not enough space to cache rdd_94_0 in memory! (computed 252.3 MiB so far)
+-------------+-----------+------------+------------------+
|trip_distance|day_of_week|total_amount|        prediction|
+-------------+-----------+------------+------------------+
|         13.6|     Sunday|      125.65| 73.71495057546655|
|          6.9|     Sunday|       32.25| 25.95872352592943|
|          5.5|     Sunday|       26.75|23.449497180098913|
|          1.4|   Saturday|        16.0| 13.92556924949832|
|          1.1|  Wednesday|       12.95|14.039882436661811|
|          1.7|  Wednesday|        14.3|16.378074531428755|
|          2.1|  Wednesday|       20.75|15.564671109637509|
|          2.4|   Saturday|       14.75|16.008819829878707|
|          3.1|  Wednesday|        18.8| 18.95922428968252|
|          1.9|   Thursday|        18.3|14.917739586136808|
|          2.4|    Tuesday|       18.35|15.822534456027952|
|          1.8|     Sunday|       17.15|14.8892562

In [30]:
print(model.summary.rootMeanSquaredError)
print(model.summary.r2)

5.883284185514268
0.7950109352980362
