# 1.0 Libraries For Preprocessing

In [1]:
import time
start = time.time()
print("Importing libraries for the preprocessing")
# Library for Wrangling and loading
import pandas as pd
import numpy as np

# Libraries for Visualization
import seaborn as sns

import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

# Libraries for Partitioning and Spark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import shutil
import hashlib

# Others
import warnings
import os


# Configuration
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)

warnings.filterwarnings("ignore")
end = time.time()
print("Imported all libraries | Time Taken {}sec".format(str(end-start)))

Importing libraries for the preprocessing
Imported all libraries | Time Taken 31.777028799057007sec


# 2.0 Partitioning

## 2.1 Partitioning Parameter

In [13]:
N_PARTITION = 50    # Number of buckets
base_partitions_dir = "../data/external/Partition"
output_dir = "../data/external/output"

## 2.2 Collecting data columns

In [21]:
# LOADING PARAMETER
chunksize= 1e5
data_directory = r"..\Data\external\used_cars_data.csv"
# loading chunk of the data for the column list
def col_list(data_path, chunksize, progress=None):
    print("Checking and loading file")
    time.sleep(0.9)
    for df_iter, chunk in enumerate(pd.read_csv(data_path, chunksize=chunksize, iterator=False)):
        pass
    col_list = list(chunk.columns)
    return col_list

## 2.3 Function for Hashing id

In [48]:
# hashing the listing id to allow even partitioning across the dataset
def hash_(listing_id):
    """Creates an hashed column using the listing id for the vehicle"""
    return int(hashlib.md5(str(listing_id).encode("utf-8")).hexdigest(), 16)

## 2.4 Creating dir for partition

In [23]:
def create_partition():
    """Creates an empty partition directory for the buckets"""
    start = time.time()
    print("Checking if the directory exists...")
    time.sleep(0.9)
    if os.path.exists(base_partitions_dir):
        print("Directory found")
        time.sleep(0.4)
        print("Removing directory")
        time.sleep(1)
        shutil.rmtree(base_partitions_dir)
        print("Removed the directory")
    else:
        print("No Such Directory found.")

    # Delaying before creating the directories
    time.sleep(2.5)

    print("Creating empty folder list for partition")
    time.sleep(0.9)
    if not os.path.exists(base_partitions_dir):
        # Creating partition directory
        os.mkdir(base_partitions_dir)
        # Making a new directory for the partitions
        for i in range(N_PARTITION):
            partition_path = os.path.join(
                base_partitions_dir, "p{}".format(i)).replace("\\", "/")
            # Printing the path
            print('| {} | Partition left {} |'.format(partition_path,N_PARTITION-i))
            if not os.path.exists(partition_path):
                os.mkdir(partition_path)
            else:
                print("Path Already exist")
            time.sleep(0.6)
    end = time.time()
    print("| Completed | Time Taken ------------------------- {}sec |".format(str(end-start)))
# Making the directory
dir = create_partition()


Checking if the directory exists...
Directory found
Removing directory
Removed the directory
Creating empty folder list for partition
| ../data/external/Partition/p0 | Partition left 50 |
| ../data/external/Partition/p1 | Partition left 49 |
| ../data/external/Partition/p2 | Partition left 48 |
| ../data/external/Partition/p3 | Partition left 47 |
| ../data/external/Partition/p4 | Partition left 46 |
| ../data/external/Partition/p5 | Partition left 45 |
| ../data/external/Partition/p6 | Partition left 44 |
| ../data/external/Partition/p7 | Partition left 43 |
| ../data/external/Partition/p8 | Partition left 42 |
| ../data/external/Partition/p9 | Partition left 41 |
| ../data/external/Partition/p10 | Partition left 40 |
| ../data/external/Partition/p11 | Partition left 39 |
| ../data/external/Partition/p12 | Partition left 38 |
| ../data/external/Partition/p13 | Partition left 37 |
| ../data/external/Partition/p14 | Partition left 36 |
| ../data/external/Partition/p15 | Partition left 3

## 2.5 Creating blank partition

In [24]:
chunksize= 1e5
data_directory = r"..\Data\external\used_cars_data.csv"
def create_blank_partition():
    """Creating a blank partition with the number of bucket"""
    start = time.time()
    data_list = col_list(data_directory, chunksize)
    for i in range(N_PARTITION):
        time.sleep(0.9)
        file_base_dir = os.path.join(base_partitions_dir,"p{}".format(str(i)),"").replace("\\","/")
        print(file_base_dir)
        # Opening the file and writing it to the partition created
        with open(file_base_dir+"vehicle_used_data.csv", "w") as f:
            f.write(",".join(data_list))
    end = time.time()
    print("Time taken ------------------- | {}sec".format(str(end-start)))
    return file_base_dir

dir_path = create_blank_partition()

Checking and loading file
../data/external/Partition/p0/
../data/external/Partition/p1/
../data/external/Partition/p2/
../data/external/Partition/p3/
../data/external/Partition/p4/
../data/external/Partition/p5/
../data/external/Partition/p6/
../data/external/Partition/p7/
../data/external/Partition/p8/
../data/external/Partition/p9/
../data/external/Partition/p10/
../data/external/Partition/p11/
../data/external/Partition/p12/
../data/external/Partition/p13/
../data/external/Partition/p14/
../data/external/Partition/p15/
../data/external/Partition/p16/
../data/external/Partition/p17/
../data/external/Partition/p18/
../data/external/Partition/p19/
../data/external/Partition/p20/
../data/external/Partition/p21/
../data/external/Partition/p22/
../data/external/Partition/p23/
../data/external/Partition/p24/
../data/external/Partition/p25/
../data/external/Partition/p26/
../data/external/Partition/p27/
../data/external/Partition/p28/
../data/external/Partition/p29/
../data/external/Partiti

## 2.6 Partitioning by hashing

In [4]:
N_PARTITION = 50    # Number of buckets
# Partitioing and hashing the 
def partition_by_hashing(df, progress=None):
    # hashing the listing_id column into the number of partitions
    df["hashed"] = df["listing_id"].apply(hash_) % N_PARTITION
    for partitions, data in df.groupby("hashed"):
        start = time.time()
        print("Data partition: {}".format(str(partitions)))
        # Removing hash columns from the dataset
        data = data.drop("hashed", axis=1)
        # Wrting the data to the partition
        path_dir = os.path.join(base_partitions_dir,"p{}".format(partitions),"").replace("\\","/")
        print("| writing data partition to {} |".format(path_dir))
        # Writing the data to the path
        with open(path_dir+"vehicle_used_data.csv", "a", encoding="utf-8") as f:
            f.write("\n")
            data.to_csv(f, header=False, index=False)
            
        end = time.time()
        print("Time taken {}sec".format(str(end-start)))
    

## 2.7 Writing data to each partition

In [26]:
chunksize = 1e5
for df_iter, data in enumerate(pd.read_csv(r"..\Data\external\used_cars_data.csv", iterator=True, chunksize=chunksize, encoding="latin1"),1):
    print("Iter:",df_iter)
    partition = partition_by_hashing(df=data)
    print("\n")
    #data = partition_by_hashing(df, name="listing_id", progress=None)
    

Iter: 1


Data partition: 0
| writing data partition to ../data/external/Partition/p0/ |
Time taken 1.7771995067596436sec
Data partition: 1
| writing data partition to ../data/external/Partition/p1/ |
Time taken 0.3827812671661377sec
Data partition: 2
| writing data partition to ../data/external/Partition/p2/ |
Time taken 0.4177563190460205sec
Data partition: 3
| writing data partition to ../data/external/Partition/p3/ |
Time taken 0.45574069023132324sec
Data partition: 4
| writing data partition to ../data/external/Partition/p4/ |
Time taken 0.3867764472961426sec
Data partition: 5
| writing data partition to ../data/external/Partition/p5/ |
Time taken 0.3787803649902344sec
Data partition: 6
| writing data partition to ../data/external/Partition/p6/ |
Time taken 0.40177106857299805sec
Data partition: 7
| writing data partition to ../data/external/Partition/p7/ |
Time taken 0.47272825241088867sec
Data partition: 8
| writing data partition to ../data/external/Partition/p8/ |
Time taken 0

# 3.0 Data Distribution with Spark

 ## 3.1 Data cleaning in parallel(configuration)

# 4.0 Data Cleaning

## 4.1 Loading first Partition

In [5]:
# LOADING FIRST PARTITION FILE
PARTITION = 0
TARGET = "price"
PARTITION_DIR = os.path.join("..\Data\external\Partition", "p{}".format(str(PARTITION)),"vehicle_used_data.csv").replace("\\","/")

used_data = pd.read_csv(PARTITION_DIR, header=0, delimiter=",")
used_data.head(2)

Unnamed: 0,vin,back_legroom,bed,bed_height,bed_length,body_type,cabin,city,city_fuel_economy,combine_fuel_economy,daysonmarket,dealer_zip,description,engine_cylinders,engine_displacement,engine_type,exterior_color,fleet,frame_damaged,franchise_dealer,franchise_make,front_legroom,fuel_tank_volume,fuel_type,has_accidents,height,highway_fuel_economy,horsepower,interior_color,isCab,is_certified,is_cpo,is_new,is_oemcpo,latitude,length,listed_date,listing_color,listing_id,longitude,main_picture_url,major_options,make_name,maximum_seating,mileage,model_name,owner_count,power,price,salvage,savings_amount,seller_rating,sp_id,sp_name,theft_title,torque,transmission,transmission_display,trimId,trim_name,vehicle_damage_category,wheel_system,wheel_system_display,wheelbase,width,year
0,1GNERGKW2JJ194328,38.4 in,,,,SUV / Crossover,,Bay Shore,18.0,,83,11706,"Here at Atlantic Chevrolet-Cadillac, 90% of ou...",V6,3600.0,V6,Satin Steel Metallic,False,False,True,Chevrolet,41 in,19.4 gal,Gasoline,False,70.7 in,27.0,310.0,Dark Atmosphere/Medium Ash Gray,False,,True,False,True,40.7333,204.3 in,2020-06-18,UNKNOWN,274409090,-73.2587,https://static.cargurus.com/images/forsale/202...,"['Driver Confidence Package', 'Power Package',...",Chevrolet,8 seats,29695.0,Traverse,1.0,"310 hp @ 6,800 RPM",26343.0,False,1685,3.447761,314501.0,Atlantic Chevrolet Cadillac,False,"266 lb-ft @ 2,800 RPM",A,9-Speed Automatic,t73662,LT Cloth FWD,,FWD,Front-Wheel Drive,120.9 in,78.6 in,2018
1,1C4RDJEG9HC611713,38.6 in,,,,SUV / Crossover,,Bronx,18.0,,66,10466,FACTORY CERTIFIED WARRANTY AVAILABLE THROUGH 2...,V6,3600.0,V6,White Knuckle Clearcoat,False,False,True,Dodge,40.3 in,24.6 gal,Gasoline,False,70.9 in,25.0,295.0,Sepia/Black,False,,True,False,True,40.8847,201.2 in,2020-07-05,WHITE,275894461,-73.8317,https://static.cargurus.com/images/forsale/202...,"['Leather Seats', 'Sunroof/Moonroof', 'Navigat...",Dodge,7 seats,65508.0,Durango,1.0,"360 hp @ 5,150 RPM",27200.0,False,965,2.8,62178.0,Eastchester Chrysler Jeep Dodge Ram,False,"390 lb-ft @ 4,250 RPM",A,8-Speed Automatic,t68694,Citadel AWD,,AWD,All-Wheel Drive,119.8 in,85.5 in,2017


In [6]:
data = used_data.copy()

In [7]:
def clean_data(data):
    """Removing some irrelevant columns in the dataframe"""
    

    # Defining some columns to remove
    cols = ["vin", 'description', "exterior_color", "wheel_system", "vehicle_damage_category", "trimId",
            "theft_title", "sp_id", "main_picture_url", "longitude", "listing_id", "listing_color", "latitude",
                "interior_color", "cabin", "major_options", "back_legroom", "bed", "bed_height", "bed_length", 
                 "is_certified","owner_count","is_cpo", "is_oemcpo", "salvage", "wheelbase", "width","combine_fuel_economy",
                     "daysonmarket","dealer_zip","engine_cylinders","horsepower",
                    "franchise_dealer","front_legroom","fuel_tank_volume","height","length","franchise_make","savings_amount","transmission_display","trim_name","sp_name"
                    ]
    # Dropping the columns
    data = data.drop(columns=cols)
    # data = data.drop(cols, axis=1)
    
    # listed date to pandas datetime
    data["listed_date"] = pd.to_datetime(data["listed_date"])
    
    # Transforming
    data["transmission"] = data["transmission"].apply(lambda inf: str(inf).replace("A","Automatic").replace("M","Manual"))
    data["Vehicle_power"] = data["power"].apply(lambda inf: str(inf).split("@")[0].strip().split(" ")[0])
    data["Vehicle_torque"] = data["torque"].apply(lambda inf: str(inf).split("@")[0].strip().split(" ")[0])
    data["max_seating"] = data["maximum_seating"].apply(lambda inf: str(inf).strip().split(" ")[0])
    data["Listing_year"] = data["listed_date"].apply(lambda inf: inf.year)
    data["engine_displacement"] = data["engine_displacement"].apply(lambda inf: inf/1000)
    data["combined_MPG"] = (data["highway_fuel_economy"] + data["city_fuel_economy"])/2
    
    # Replacing any "nan" with np.nan value ===> Null
    data = data.replace({"nan":np.nan, "_": np.nan,"--":np.nan})
    
    # Changing datatype
    data["Vehicle_power"] = data['Vehicle_power'].astype(np.float).astype("Int32")
    data["Vehicle_torque"] = data['Vehicle_torque'].astype(np.float).astype("Int32")
    data["max_seating"] = data['max_seating'].astype(np.float).astype("Int32")


    # data["isCab"] = pd.DataFrame(data["isCab"])
    # data["isCab"] = data["isCab"].map({"False":False,"True":True})
    
    
    # Dropping some more columns
    data = data.drop(columns=["listed_date","maximum_seating","power",'torque'])
    # Renaming some columns
    data = data.rename(columns={"wheel_system_display":"Drivetrain",
                                "transmission":"Transmission",
                                "body_type":"body_style",
                                "city_fuel_economy":"city_MPG",
                                "engine_displacement":"engine_size",
                                "highway_fuel_economy":"highway_MPG",
                                "year": "Vehicle_year"})
    
    
    return data
    

In [None]:
def clean_data(data):
    """C
    
    """
    # Filtering the data
    filtered_data = data[data["make_name"].value_counts() > 10000]
    # Dropping some columns
    filtered_data = filtered_data.drop(
        ["Unnamed: 0", "listed_date", "listed_year"], axis=1)

    # Data Cleaning
    filtered_data["fuel_tank_volume"] = filtered_data["fuel_tank_volume"].apply(
        lambda volume: int(volume.strip().split(" ")[0]))
    filtered_data["maximum_seating"] = filtered_data["maximum_seating"].apply(
        lambda seating: int(seating.strip().split(" ")[0]))
    filtered_data["transmission"] = filtered_data["transmission"].apply(
        lambda trans: trans.replace("A", "Automatic").replace("M", "Manual"))
    filtered_data["engine_displacement"] = filtered_data["engine_displacement"].apply(
        lambda disp: disp/1000)
    filtered_data["engine_cylinders"] = filtered_data["engine_cylinders"].apply(
        lambda cyl: cyl.strip().split(" ")[0].strip())
    filtered_data["mpg"] = (
        filtered_data["city_fuel_economy"] + filtered_data["highway_fuel_economy"])/2

    # Renaming some columns
    filtered_data = filtered_data.rename(columns={"fuel_tank_volume": "Tank_Volume",
                                                  "engine_displacement": "engine_size",
                                                  "city_fuel_economy": "MPG city",
                                                  "highway_fuel_economy": "MPG highway",
                                                  "wheel_system": "Drivetrain"})
    return filtered_data


In [8]:
new_data = clean_data(data)
new_data.head(2)

Unnamed: 0,body_style,city,city_MPG,engine_size,engine_type,fleet,frame_damaged,fuel_type,has_accidents,highway_MPG,isCab,is_new,make_name,mileage,model_name,price,seller_rating,Transmission,Drivetrain,Vehicle_year,Vehicle_power,Vehicle_torque,max_seating,Listing_year,combined_MPG
0,SUV / Crossover,Bay Shore,18.0,3.6,V6,False,False,Gasoline,False,27.0,False,False,Chevrolet,29695.0,Traverse,26343.0,3.447761,Automatic,Front-Wheel Drive,2018,310,266,8,2020,22.5
1,SUV / Crossover,Bronx,18.0,3.6,V6,False,False,Gasoline,False,25.0,False,False,Dodge,65508.0,Durango,27200.0,2.8,Automatic,All-Wheel Drive,2017,360,390,7,2020,21.5


In [9]:
for feature in new_data.columns:
    if new_data[feature].dtype == object:
        print(feature, new_data[feature].nunique())

body_style 9
city 3660
engine_type 33
fleet 2
frame_damaged 2
fuel_type 8
has_accidents 2
isCab 2
make_name 57
model_name 819
Transmission 4
Drivetrain 5


## Dealing with the missing values

In [10]:
def impute_missing_values(new_data, verbose: int):
    """Dealing with the missing values by imputation"""
    # Performing input validation
    if (verbose not in [0, 1]):
        raise TypeError("Code ran into an Exception \
                        Because verbose is either a string or not 0 or 1")
    from sklearn.experimental import enable_iterative_imputer
    from sklearn.impute import IterativeImputer
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.preprocessing import LabelEncoder
    from sklearn.neighbors import KNeighborsRegressor
    cat_fea = [
        feature for feature in new_data.columns if new_data[feature].dtype == object]
    num_missing_fea = [feature for feature in new_data.columns if new_data[feature].isnull(
    ).sum() > 0 and new_data[feature].dtype != object]
    not_null_fea = [
        feature for feature in new_data.columns if new_data[feature].isnull().sum() == 0]

    # Label encoding the categorical feature
    le = LabelEncoder()
    print("Encoding the categorical feature")
    for feature in cat_fea:
        new_data[feature] = le.fit_transform(new_data[feature])
        
    # imputing the missing features
    estimator = RandomForestRegressor(random_state=42)
    estimator_neighbour = KNeighborsRegressor(n_neighbors=5)
    print("Imputing the missing values")
    imputer = IterativeImputer(
        estimator=estimator, max_iter=7, verbose=verbose, random_state=42
    )
    imputer.fit(new_data)
    transformed = imputer.transform(new_data)
    transformed_data = pd.DataFrame(transformed, columns=new_data.columns)
    
    # Reverting the encoded cat features
    print("Reverting encoded feature to original")
    for feature in cat_fea:
        transformed_data[feature] = le.inverse_transform(
            transformed_data[feature])

    return transformed_data

In [11]:
used_data.isnull().sum()

vin                            0
back_legroom                3262
bed                        60026
bed_height                 51625
bed_length                 51625
body_type                    276
cabin                      59153
city                           0
city_fuel_economy          10046
combine_fuel_economy       60422
daysonmarket                   0
dealer_zip                     0
description                 1586
engine_cylinders            2088
engine_displacement         3487
engine_type                 2088
exterior_color                 1
fleet                      28630
frame_damaged              28630
franchise_dealer               0
franchise_make             11611
front_legroom               3262
fuel_tank_volume            3262
fuel_type                   1740
has_accidents              28630
height                      3262
highway_fuel_economy       10046
horsepower                  3487
interior_color                 4
isCab                      28630
is_certifi

In [12]:
data.isnull().sum()

vin                            0
back_legroom                3262
bed                        60026
bed_height                 51625
bed_length                 51625
body_type                    276
cabin                      59153
city                           0
city_fuel_economy          10046
combine_fuel_economy       60422
daysonmarket                   0
dealer_zip                     0
description                 1586
engine_cylinders            2088
engine_displacement         3487
engine_type                 2088
exterior_color                 1
fleet                      28630
frame_damaged              28630
franchise_dealer               0
franchise_make             11611
front_legroom               3262
fuel_tank_volume            3262
fuel_type                   1740
has_accidents              28630
height                      3262
highway_fuel_economy       10046
horsepower                  3487
interior_color                 4
isCab                      28630
is_certifi

In [3]:
# SPARK PARAMETER

# SPARK_HOME = r"C:\spark\spark" # The spark location
# LOG_DIR = r"C:\spark\tmp\spark-event" # Eventlogging location

import findspark
findspark.init()

# Spark configuration
import pyspark
conf = pyspark.SparkConf()

# Enabling logging
conf.set("spark.eventLog.enabled", False)
conf.set("spark.eventLog.dir", r"\tmp")
conf.set("spark.eventLog.compress",True)

# Enabling all cores usage
conf.set("spark.num.executor", 1)
conf.set("spark.executor.memory", "8g")
conf.set("spark.executor.cores",2)

# setting the appname for the spark ui
conf.setAppName("Data Cleaning")

# Set parent
conf.set("spark.master", "local[2]")
conf.getAll()


sc = pyspark.SparkContext(conf=conf)
sc

In [75]:
sc.stop()

In [14]:
a = sc.parallelize(range(0,100), N_PARTITION).map(lambda x: clean_data(x))

In [None]:
#using the main thing 

In [15]:
a.collect()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, DESKTOP-6K10A22, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\spark\spark\python\lib\pyspark.zip\pyspark\worker.py", line 605, in main
  File "C:\spark\spark\python\lib\pyspark.zip\pyspark\worker.py", line 597, in process
  File "C:\spark\spark\python\lib\pyspark.zip\pyspark\serializers.py", line 271, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "C:\spark\spark\python\lib\pyspark.zip\pyspark\util.py", line 107, in wrapper
    return f(*args, **kwargs)
  File "C:\Users\PSALIS~1\AppData\Local\Temp/ipykernel_9052/3703805585.py", line 1, in <lambda>
  File "C:\Users\PSALIS~1\AppData\Local\Temp/ipykernel_9052/637842667.py", line 13, in clean_data
AttributeError: 'int' object has no attribute 'drop'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:503)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:638)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:621)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:456)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1004)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2154)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:463)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:466)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2114)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2135)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2154)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2179)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1004)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1003)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:168)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\spark\spark\python\lib\pyspark.zip\pyspark\worker.py", line 605, in main
  File "C:\spark\spark\python\lib\pyspark.zip\pyspark\worker.py", line 597, in process
  File "C:\spark\spark\python\lib\pyspark.zip\pyspark\serializers.py", line 271, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "C:\spark\spark\python\lib\pyspark.zip\pyspark\util.py", line 107, in wrapper
    return f(*args, **kwargs)
  File "C:\Users\PSALIS~1\AppData\Local\Temp/ipykernel_9052/3703805585.py", line 1, in <lambda>
  File "C:\Users\PSALIS~1\AppData\Local\Temp/ipykernel_9052/637842667.py", line 13, in clean_data
AttributeError: 'int' object has no attribute 'drop'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:503)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:638)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:621)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:456)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1004)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2154)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:463)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:466)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [17]:
SELECT FROM table *


SyntaxError: invalid syntax (Temp/ipykernel_9052/3213417746.py, line 1)

In [82]:
print(list(range(0,50)))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]


In [52]:
# Parallelizing the process
start = time.time()
sc.parallelize(list(range(0,100)),N_PARTITION).map(lambda x: clean_data(x)).collect()
sc.stop()
end = time.time()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, DESKTOP-6K10A22, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\spark\spark\python\lib\pyspark.zip\pyspark\worker.py", line 605, in main
  File "C:\spark\spark\python\lib\pyspark.zip\pyspark\worker.py", line 597, in process
  File "C:\spark\spark\python\lib\pyspark.zip\pyspark\serializers.py", line 271, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "C:\spark\spark\python\lib\pyspark.zip\pyspark\util.py", line 107, in wrapper
    return f(*args, **kwargs)
  File "C:\Users\PSALIS~1\AppData\Local\Temp/ipykernel_10492/1983187250.py", line 3, in <lambda>
  File "C:\Users\PSALIS~1\AppData\Local\Temp/ipykernel_10492/637842667.py", line 13, in clean_data
AttributeError: 'int' object has no attribute 'drop'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:503)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:638)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:621)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:456)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1004)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2154)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:463)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:466)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2114)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2135)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2154)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2179)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1004)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1003)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:168)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\spark\spark\python\lib\pyspark.zip\pyspark\worker.py", line 605, in main
  File "C:\spark\spark\python\lib\pyspark.zip\pyspark\worker.py", line 597, in process
  File "C:\spark\spark\python\lib\pyspark.zip\pyspark\serializers.py", line 271, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "C:\spark\spark\python\lib\pyspark.zip\pyspark\util.py", line 107, in wrapper
    return f(*args, **kwargs)
  File "C:\Users\PSALIS~1\AppData\Local\Temp/ipykernel_10492/1983187250.py", line 3, in <lambda>
  File "C:\Users\PSALIS~1\AppData\Local\Temp/ipykernel_10492/637842667.py", line 13, in clean_data
AttributeError: 'int' object has no attribute 'drop'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:503)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:638)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:621)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:456)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1004)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2154)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:463)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:466)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [61]:
used_data.head()

Unnamed: 0,vin,back_legroom,bed,bed_height,bed_length,body_type,cabin,city,city_fuel_economy,combine_fuel_economy,daysonmarket,dealer_zip,description,engine_cylinders,engine_displacement,engine_type,exterior_color,fleet,frame_damaged,franchise_dealer,franchise_make,front_legroom,fuel_tank_volume,fuel_type,has_accidents,height,highway_fuel_economy,horsepower,interior_color,isCab,is_certified,is_cpo,is_new,is_oemcpo,latitude,length,listed_date,listing_color,listing_id,longitude,main_picture_url,major_options,make_name,maximum_seating,mileage,model_name,owner_count,power,price,salvage,savings_amount,seller_rating,sp_id,sp_name,theft_title,torque,transmission,transmission_display,trimId,trim_name,vehicle_damage_category,wheel_system,wheel_system_display,wheelbase,width,year
0,1GNERGKW2JJ194328,38.4 in,,,,SUV / Crossover,,Bay Shore,18.0,,83,11706,"Here at Atlantic Chevrolet-Cadillac, 90% of ou...",V6,3600.0,V6,Satin Steel Metallic,False,False,True,Chevrolet,41 in,19.4 gal,Gasoline,False,70.7 in,27.0,310.0,Dark Atmosphere/Medium Ash Gray,False,,True,False,True,40.7333,204.3 in,2020-06-18,UNKNOWN,274409090,-73.2587,https://static.cargurus.com/images/forsale/202...,"['Driver Confidence Package', 'Power Package',...",Chevrolet,8 seats,29695.0,Traverse,1.0,"310 hp @ 6,800 RPM",26343.0,False,1685,3.447761,314501.0,Atlantic Chevrolet Cadillac,False,"266 lb-ft @ 2,800 RPM",A,9-Speed Automatic,t73662,LT Cloth FWD,,FWD,Front-Wheel Drive,120.9 in,78.6 in,2018
1,1C4RDJEG9HC611713,38.6 in,,,,SUV / Crossover,,Bronx,18.0,,66,10466,FACTORY CERTIFIED WARRANTY AVAILABLE THROUGH 2...,V6,3600.0,V6,White Knuckle Clearcoat,False,False,True,Dodge,40.3 in,24.6 gal,Gasoline,False,70.9 in,25.0,295.0,Sepia/Black,False,,True,False,True,40.8847,201.2 in,2020-07-05,WHITE,275894461,-73.8317,https://static.cargurus.com/images/forsale/202...,"['Leather Seats', 'Sunroof/Moonroof', 'Navigat...",Dodge,7 seats,65508.0,Durango,1.0,"360 hp @ 5,150 RPM",27200.0,False,965,2.8,62178.0,Eastchester Chrysler Jeep Dodge Ram,False,"390 lb-ft @ 4,250 RPM",A,8-Speed Automatic,t68694,Citadel AWD,,AWD,All-Wheel Drive,119.8 in,85.5 in,2017
2,1GNKRGKD8HJ199237,36.8 in,,,,SUV / Crossover,,Bay Shore,15.0,,18,11706,"Here at Atlantic Chevrolet-Cadillac, 90% of ou...",V6,3600.0,V6,Tungsten Metallic,False,False,True,Chevrolet,41.3 in,22 gal,Gasoline,False,69.9 in,22.0,281.0,Black (Ebony),False,,True,False,True,40.7333,203.7 in,2020-08-22,GRAY,280096673,-73.2587,https://static.cargurus.com/images/forsale/202...,"['Power Package', 'Navigation System', 'Prefer...",Chevrolet,8 seats,59216.0,Traverse,1.0,"281 hp @ 6,300 RPM",19629.0,False,1634,3.447761,314501.0,Atlantic Chevrolet Cadillac,False,"266 lb-ft @ 3,400 RPM",A,6-Speed Automatic,t67377,1LT FWD,,FWD,Front-Wheel Drive,118.9 in,78.5 in,2017
3,SALWR2RY1LA746050,37 in,,,,SUV / Crossover,,San Juan,,,5,922,"[!@@Additional Info@@!]360 Surround Camera,Ada...",I4,2000.0,I4,Carpathian Gray Premium,,,True,Land Rover,39.5 in,24.1 gal,Gasoline,,71 in,,398.0,Gray (Ebony/Ebony/Ebony/Ebony),,,,True,,18.4439,192.1 in,2020-09-04,GRAY,281290574,-66.0785,https://static.cargurus.com/images/forsale/202...,"['Adaptive Cruise Control', 'Backup Camera', '...",Land Rover,5 seats,5.0,Range Rover Sport,,"398 hp @ 5,500 RPM",93515.0,,0,3.0,389227.0,Land Rover San Juan,,"472 lb-ft @ 1,500 RPM",A,8-Speed Automatic Overdrive,t86692,Hybrid Plug-in HSE 4WD,,AWD,All-Wheel Drive,115.1 in,87.4 in,2020
4,JM1BPANM5K1139351,35.1 in,,,,Hatchback,,Bayamon,,,412,960,"[!@@Additional Info@@!]4-Wheel Disc Brakes,A/C...",I4,2500.0,I4,GRIS,,,True,Jeep,42.3 in,13.2 gal,Gasoline,,56.7 in,,186.0,Black,,,,True,,18.3988,175.6 in,2019-07-25,GRAY,247331160,-66.1582,https://static.cargurus.com/images/forsale/201...,"['Leather Seats', 'Sunroof/Moonroof', 'Adaptiv...",Mazda,5 seats,5.0,MAZDA3,,"186 hp @ 6,000 RPM",32595.0,,0,2.8,370599.0,Flagship Chrysler,,"186 lb-ft @ 4,000 RPM",,,t85263,Premium Hatchback FWD,,FWD,Front-Wheel Drive,107.3 in,70.7 in,2019
