In [1]:
# Importação das bibliotecas necessárias

import boto3
from botocore import UNSIGNED
from botocore.client import Config
import pandas as pd
import numpy as np
import seaborn as sns
import datetime as dt
import matplotlib.pyplot as plt
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [2]:
# Spark-related imports used throughout the rest of the notebook

import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import Imputer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
# Create (or reuse) the Spark session shared by every cell below.
# Shuffle stages are configured to use 6 partitions.
# NOTE(review): the application name says "AirBnB" although this notebook loads
# taxi data — looks like a leftover from another notebook; confirm before renaming.
spark = (
    SparkSession.builder
    .appName("AirBnB")
    .config("spark.sql.shuffle.partitions", 6)
    .getOrCreate()
)

In [4]:
# Load the taxi subsample from CSV: the first row is the header, fields are
# comma-separated, and column types are inferred by Spark.
taxis_and_limosines_subamostra = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("sep", ',')
    .csv("/home/big/praticas/Trabalho_Grupo/taxis_and_limosines_subamostra")
)

                                                                                

In [7]:
# Build the model input using the predictors most correlated with the target:
# pack them into a single 'features' vector column.
vectorAssembler = VectorAssembler(
    inputCols=[
        'trip_distance',
        'rate_code',
        'fare_amount',
        'tip_amount',
        'tolls_amount',
        'trip_duration',
    ],
    outputCol='features',
)

# Keep only the assembled vector and the label column (total_amount).
# NOTE(review): fare/tip/tolls amounts are presumably components of total_amount —
# confirm this is not target leakage.
vhouse_df = (
    vectorAssembler
    .transform(taxis_and_limosines_subamostra)
    .select(['features', 'total_amount'])
)

# Preview the first three assembled rows.
vhouse_df.show(3)

+--------------------+------------+
|            features|total_amount|
+--------------------+------------+
|[7.7,1.0,22.5,4.6...|        27.6|
|[3.5,1.0,13.5,3.0...|        17.5|
|[2.5,1.0,10.0,3.1...|       13.65|
+--------------------+------------+
only showing top 3 rows



In [8]:
# Split the assembled data into roughly 70% training / 30% test rows.
# A fixed seed is passed so the partition (and therefore every metric reported
# below) is repeatable across runs instead of changing on each execution.
splits = vhouse_df.randomSplit([0.7, 0.3], seed=42)

# First element is the training split, second is the held-out test split.
subamostra_treino, subamostra_teste = splits

In [9]:
# Linear regression of total_amount on the assembled feature vector, with
# elastic-net regularisation (regParam / elasticNetParam) and at most 10 iterations.
lr = LinearRegression(
    featuresCol='features',
    labelCol='total_amount',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

# Fit on the training split only; the test split is kept for evaluation.
lr_model = lr.fit(subamostra_treino)

22/04/09 09:29:11 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/04/09 09:29:11 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
                                                                                

In [10]:
# Report fit metrics taken from the fitted model's own summary object.
trainingSummary = lr_model.summary
train_rmse = trainingSummary.rootMeanSquaredError
train_r2 = trainingSummary.r2

print("RMSE: %f" % train_rmse)
print("r2: %f" % train_r2)

RMSE: 1.091714
r2: 0.992579


In [13]:
# Evaluate predictive performance on the held-out test split.

# Score the test rows and preview a few predictions next to the true label.
lr_predictions = lr_model.transform(subamostra_teste)
lr_predictions.select('prediction', 'total_amount', 'features').show(5)

# RegressionEvaluator is already imported in the notebook's Spark import cell,
# so it is not re-imported here. R^2 is computed from the scored test rows.
lr_evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='total_amount',
    metricName='r2',
)
print('R quadrado sobre o conjunto de teste = %g' % lr_evaluator.evaluate(lr_predictions))

+-----------------+------------+--------------------+
|       prediction|total_amount|            features|
+-----------------+------------+--------------------+
|4.048450648231834|         3.0|[0.0,1.0,2.5,0.0,...|
| 4.04847057579203|         3.0|[0.0,1.0,2.5,0.0,...|
| 4.04847057579203|         3.5|[0.0,1.0,2.5,0.0,...|
|4.048490503352226|         3.0|[0.0,1.0,2.5,0.0,...|
|4.048490503352226|         3.0|[0.0,1.0,2.5,0.0,...|
+-----------------+------------+--------------------+
only showing top 5 rows





R quadrado sobre o conjunto de teste = 0.991596


                                                                                

In [14]:
# RMSE on the held-out test split. The printed label previously said
# "conjunto de treino" (training set) even though the metric is computed on
# subamostra_teste; the label now matches the data being evaluated.
test_result = lr_model.evaluate(subamostra_teste)
print('Root Mean Squared Error (RMSE) no conjunto de teste = %g' % test_result.rootMeanSquaredError)



Root Mean Squared Error (RMSE) no conjunto de treino = 1.18325


                                                                                

In [19]:
# Score the test split and list predictions side by side with the true
# total_amount and the feature vector (show() prints its default number of rows).
predictions = lr_model.transform(subamostra_teste)
(
    predictions
    .select('prediction', 'total_amount', 'features')
    .show()
)

[Stage 15:>                                                         (0 + 1) / 1]

+-----------------+------------+--------------------+
|       prediction|total_amount|            features|
+-----------------+------------+--------------------+
|4.048450648231834|         3.0|[0.0,1.0,2.5,0.0,...|
| 4.04847057579203|         3.0|[0.0,1.0,2.5,0.0,...|
| 4.04847057579203|         3.5|[0.0,1.0,2.5,0.0,...|
|4.048490503352226|         3.0|[0.0,1.0,2.5,0.0,...|
|4.048490503352226|         3.0|[0.0,1.0,2.5,0.0,...|
|4.048490503352226|         3.0|[0.0,1.0,2.5,0.0,...|
|4.048490503352226|         3.0|[0.0,1.0,2.5,0.0,...|
|4.048490503352226|         3.0|[0.0,1.0,2.5,0.0,...|
|4.048490503352226|         3.0|[0.0,1.0,2.5,0.0,...|
|4.048490503352226|         3.0|[0.0,1.0,2.5,0.0,...|
|4.048490503352226|         3.5|[0.0,1.0,2.5,0.0,...|
|4.048490503352226|         3.5|[0.0,1.0,2.5,0.0,...|
|4.048490503352226|         3.5|[0.0,1.0,2.5,0.0,...|
|4.048510430912422|         3.0|[0.0,1.0,2.5,0.0,...|
|4.048510430912422|         3.0|[0.0,1.0,2.5,0.0,...|
|4.048510430912422|         

                                                                                