In [1]:
from pyspark.sql import SparkSession

In [2]:
# Memory setting applied to both the executor and the driver below.
MAX_MEMORY = "6g"

# Build (or reuse, via getOrCreate) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("taxi-fare-prediction")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)

22/10/06 21:11:39 WARN Utils: Your hostname, Moon-2.local resolves to a loopback address: 127.0.0.1; using 192.168.0.4 instead (on interface en0)
22/10/06 21:11:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/06 21:11:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Absolute local path of the taxi trip CSV file.
trip_files = "/Users/sig6774/Desktop/Data_Engineering/data-engineering-main/01-spark/data/tripdata_2021_1-7.csv"

# Load the CSV into a Spark DataFrame, treating the first row as the header and
# letting Spark infer the column types.
# `trip_files` already starts with "/", so only "file://" is prefixed here; the
# previous "file:///" prefix produced a four-slash URI ("file:////Users/...").
trips_df = spark.read.csv(f"file://{trip_files}", inferSchema=True, header=True)

                                                                                

In [4]:
# Print the column names and types of the loaded trips DataFrame.
trips_df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [5]:
# Register the DataFrame as the temp view "trips" so it can be queried with SQL.
trips_df.createOrReplaceTempView("trips")

In [6]:
# Select the data used to predict total_amount from trip_distance.
# Outliers are filtered out: both columns are bounded to positive values below
# a cap, passenger_count must be under 4, and pickups are limited to
# 2021-01-01 (inclusive) through 2021-08-01 (exclusive).

query = """
SELECT 
    trip_distance,
    total_amount
FROM 
    trips
WHERE 
    total_amount < 5000
    AND total_amount > 0 
    AND trip_distance > 0 
    AND trip_distance < 500 
    AND passenger_count < 4 
    AND TO_DATE(tpep_pickup_datetime) >= '2021-01-01'
    AND TO_DATE(tpep_pickup_datetime) < '2021-08-01'
"""

In [7]:
# Run the filtering query against the "trips" view and register the result
# as the temp view "data".
data_df = spark.sql(query)
data_df.createOrReplaceTempView("data")

In [8]:
# Preview the first rows of the filtered (trip_distance, total_amount) data.
data_df.show()

+-------------+------------+
|trip_distance|total_amount|
+-------------+------------+
|          2.1|        11.8|
|          0.2|         4.3|
|         14.7|       51.95|
|         10.6|       36.35|
|         4.94|       24.36|
|          1.6|       14.15|
|          4.1|        17.3|
|          5.7|        21.8|
|          9.1|        28.8|
|          2.7|       18.95|
|         6.11|        24.3|
|         1.21|       10.79|
|          7.4|       33.92|
|         1.01|        10.3|
|         0.73|       12.09|
|         1.17|       12.36|
|         0.78|        9.96|
|         1.66|        12.3|
|         0.93|         9.3|
|         1.16|       11.84|
+-------------+------------+
only showing top 20 rows



In [9]:
data_df.describe().show()
# Shows summary statistics (count, mean, stddev, min, max) for each column.
# Used here to check whether outliers remain after filtering.

                                                                                

+-------+------------------+-----------------+
|summary|     trip_distance|     total_amount|
+-------+------------------+-----------------+
|  count|          13126271|         13126271|
|   mean|2.8820783305479507|17.97311724190838|
| stddev|3.8202841753877816|12.97582928299237|
|    min|              0.01|             0.01|
|    max|             475.5|           4973.3|
+-------+------------------+-----------------+



In [10]:
# Randomly split the filtered data 80/20 into train and test sets, using a fixed seed.
train_df, test_df = data_df.randomSplit(weights=[0.8, 0.2], seed=410)

In [12]:
# Print the row count of the train split, then of the test split.
for split_df in (train_df, test_df):
    print(split_df.count())

                                                                                

10499417




2626854


                                                                                

In [13]:
# A VectorAssembler is used to turn train_df into a form the model can train on.
from pyspark.ml.feature import VectorAssembler

# Given the input column names, the assembler packs them into a single
# "features" vector column usable for training.
vassembler = VectorAssembler(
    inputCols=["trip_distance"],
    outputCol="features",
)

In [14]:
# Apply the assembler to the train split, producing a DataFrame with an added "features" column.
vtrain_df = vassembler.transform(train_df)

In [15]:
# Preview the assembled training data, including the new "features" column.
vtrain_df.show()

[Stage 15:>                                                         (0 + 1) / 1]

+-------------+------------+--------+
|trip_distance|total_amount|features|
+-------------+------------+--------+
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
+-------------+------------+--------+
only showing top 20 rows



                                                                                

In [17]:
from pyspark.ml.regression import LinearRegression

In [18]:
# Linear regression estimator: reads the assembled "features" column, uses
# "total_amount" as the label, and is capped at 50 iterations.
lr = LinearRegression(featuresCol="features", labelCol="total_amount", maxIter=50)

In [19]:
# Fit the linear regression model on the assembled training data.
model = lr.fit(vtrain_df)

22/10/06 21:25:13 WARN Instrumentation: [7e4a6262] regParam is zero, which might cause numerical instability and overfitting.


[Stage 16:>                                                        (0 + 8) / 13]

22/10/06 21:25:17 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/10/06 21:25:17 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


                                                                                

22/10/06 21:25:23 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


                                                                                

In [20]:
# Assemble the test split with the same `vassembler` object, then preview it.
vtest_df = vassembler.transform(test_df)
vtest_df.show()

[Stage 20:>                                                         (0 + 1) / 1]

+-------------+------------+--------+
|trip_distance|total_amount|features|
+-------------+------------+--------+
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.8|  [0.01]|
|         0.01|         3.8|  [0.01]|
|         0.01|         3.8|  [0.01]|
|         0.01|         3.8|  [0.01]|
|         0.01|         3.8|  [0.01]|
|         0.01|         3.8|  [0.01]|
|         0.01|         3.8|  [0.01]|
|         0.01|        3.96|  [0.01]|
|         0.01|         4.3|  [0.01]|
+-------------+------------+--------+
only showing top 20 rows



                                                                                

In [21]:
# Run the fitted model on the assembled test data; the result gains a "prediction" column.
pred = model.transform(vtest_df)

In [22]:
# Preview the test rows alongside their predicted total_amount.
pred.show()

[Stage 21:>                                                         (0 + 1) / 1]

+-------------+------------+--------+---------------+
|trip_distance|total_amount|features|     prediction|
+-------------+------------+--------+---------------+
|         0.01|         3.3|  [0.01]|9.4185463035608|
|         0.01|         3.3|  [0.01]|9.4185463035608|
|         0.01|         3.3|  [0.01]|9.4185463035608|
|         0.01|         3.3|  [0.01]|9.4185463035608|
|         0.01|         3.3|  [0.01]|9.4185463035608|
|         0.01|         3.3|  [0.01]|9.4185463035608|
|         0.01|         3.3|  [0.01]|9.4185463035608|
|         0.01|         3.3|  [0.01]|9.4185463035608|
|         0.01|         3.3|  [0.01]|9.4185463035608|
|         0.01|         3.3|  [0.01]|9.4185463035608|
|         0.01|         3.3|  [0.01]|9.4185463035608|
|         0.01|         3.8|  [0.01]|9.4185463035608|
|         0.01|         3.8|  [0.01]|9.4185463035608|
|         0.01|         3.8|  [0.01]|9.4185463035608|
|         0.01|         3.8|  [0.01]|9.4185463035608|
|         0.01|         3.8|

                                                                                

In [24]:
# Root mean squared error from the model's fit summary.
# NOTE(review): per the PySpark docs, `model.summary` is computed on the training
# set, so this is not a held-out score; `model.evaluate(vtest_df).rootMeanSquaredError`
# would report the test-set RMSE instead.
model.summary.rootMeanSquaredError

6.290177081306031

In [25]:
# R^2 (coefficient of determination) from the model's fit summary.
# NOTE(review): per the PySpark docs, `model.summary` is computed on the training
# set, so this is not a held-out score; `model.evaluate(vtest_df).r2` would report
# the test-set R^2 instead.
model.summary.r2

0.7659029964499398

In [27]:
from pyspark.sql.types import DoubleType
# Hand-picked trip distances to run through the fitted model.
distance = [1.1, 5.5, 10.5, 30.0]

# Convert the Python list into a single-column Spark DataFrame of doubles and
# name that column "trip_distance", the input column the assembler expects.
distance_df = spark.createDataFrame(distance, DoubleType()).toDF("trip_distance")

In [28]:
# Display the ad-hoc distances DataFrame.
distance_df.show()

+-------------+
|trip_distance|
+-------------+
|          1.1|
|          5.5|
|         10.5|
|         30.0|
+-------------+



In [29]:
# To feed data into the model, it must first go through the previously defined
# VectorAssembler so that it has a "features" column.
vdistance_df = vassembler.transform(distance_df)
vdistance_df.show()

+-------------+--------+
|trip_distance|features|
+-------------+--------+
|          1.1|   [1.1]|
|          5.5|   [5.5]|
|         10.5|  [10.5]|
|         30.0|  [30.0]|
+-------------+--------+



In [30]:
# Feed the assembled distances into the model and display the predicted values.
model.transform(vdistance_df).show()

+-------------+--------+------------------+
|trip_distance|features|        prediction|
+-------------+--------+------------------+
|          1.1|   [1.1]|12.665084992390714|
|          5.5|   [5.5]|25.770378782162844|
|         10.5|  [10.5]| 40.66275808872208|
|         30.0|  [30.0]|  98.7430373843031|
+-------------+--------+------------------+

