In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pandas as pd
import hashlib
import os
import shutil

# --- Partition configuration ---
# The source CSV holds a little over 3 million rows; splitting it into
# 50 buckets leaves roughly 60,000 rows per bucket.
N_PARTITION = 50

# Working folders, given relative to the current working directory.
output_dir = "../data/external/output"
base_partitions_dir = "../data/external/partition"

# Remove any partition tree left behind by an earlier run so that every
# run starts from an empty layout.
if os.path.exists(base_partitions_dir):
    shutil.rmtree(base_partitions_dir)

# Hash the listing id so rows spread evenly across the partitions.
def hash_(listing_id):
    """Return the MD5 digest of ``str(listing_id)`` as a non-negative integer.

    The value is converted with ``str`` and encoded as UTF-8 before hashing,
    so ``123`` and ``"123"`` produce the same result.
    """
    digest = hashlib.md5(str(listing_id).encode("utf-8")).digest()
    # Big-endian interpretation of the 16 digest bytes equals the integer
    # value of the hexadecimal digest string.
    return int.from_bytes(digest, "big")

print("Creating empty folder list for partition")
# NOTE(review): the purpose of the "root" sub-folder check is not visible in
# this notebook; it is kept unchanged — confirm whether it is still needed.
if not os.path.exists(base_partitions_dir + "/root"):
    # Create one numbered sub-folder per partition: <base>/0 ... <base>/N-1.
    for i in range(N_PARTITION):
        # The original built the path from the plain string "/{i}", which is
        # never interpolated, so every iteration pointed at a folder literally
        # named "{i}". Build the path from the loop index instead.
        path = os.path.join(base_partitions_dir, str(i))
        print(path)
        # makedirs also creates the base folder, which may have been removed
        # by the cleanup step earlier in this cell; exist_ok makes re-running
        # the cell harmless.
        os.makedirs(path, exist_ok=True)
            
def create_blank_partition(columns=None, base_dir=None, n_partitions=None):
    """Write a header-only ``used_cars_data.csv`` into every partition folder.

    Each numbered folder ``<base_dir>/<i>`` (created if missing) receives a
    CSV file that contains only the comma-joined column names followed by a
    newline, so data rows can later be appended below the header.

    Args:
        columns: Iterable of column names for the header row. Defaults to the
            columns of the module-level ``used_cars_data`` frame, which must
            already be defined when the default is used.
        base_dir: Folder that holds the numbered partition sub-folders.
            Defaults to the module-level ``base_partitions_dir``.
        n_partitions: Number of partition folders to fill. Defaults to the
            module-level ``N_PARTITION``.

    Returns:
        str or None: Path of the last partition folder written, or ``None``
        when ``n_partitions`` is 0.
    """
    # Resolve defaults lazily so the module-level values are read at call time.
    if columns is None:
        columns = used_cars_data.columns
    if base_dir is None:
        base_dir = base_partitions_dir
    if n_partitions is None:
        n_partitions = N_PARTITION

    header = ",".join(list(columns))

    partition_dir = None
    for i in range(n_partitions):
        # The original used the un-interpolated string "/{i}/" and returned
        # from inside the loop, so at most one file (in a folder literally
        # named "{i}") was ever written.
        partition_dir = os.path.join(base_dir, str(i))
        os.makedirs(partition_dir, exist_ok=True)
        with open(os.path.join(partition_dir, "used_cars_data.csv"), "w") as f:
            # Trailing newline keeps appended rows off the header line.
            f.write(header + "\n")

    return partition_dir

def partition_by_hashing(df, name, progress=None):
    """Append the rows of ``df`` to per-partition CSV files chosen by hashed ``listing_id``.

    Every row is assigned to bucket ``hash_(listing_id) % N_PARTITION`` and
    appended to ``<base_partitions_dir>/<bucket>/<name>``. A header row is
    written only when the target file is missing or empty; otherwise rows are
    appended below the existing content.

    Args:
        df (pd.DataFrame): Rows to distribute; must contain a ``listing_id``
            column. The frame itself is not modified.
        name (str): File name used inside every partition folder.
            NOTE(review): the original body never used this argument and tried
            to write to the folder path itself; treating it as the file name
            is an assumption — confirm against callers.
        progress: Accepted for interface compatibility; not used by this
            function.

    Returns:
        None
    """
    # Compute bucket ids on the side rather than adding a "partition" column,
    # so the caller's frame is untouched and the column is not written to disk.
    bucket_ids = (
        df["listing_id"]
        .apply(lambda listing_id: hash_(listing_id) % N_PARTITION)
        .rename("partition")
    )

    for bucket, rows in df.groupby(bucket_ids):
        # The original used "\{partitions}" (a literal backslash plus an
        # un-interpolated placeholder) and then called f.write(route, data),
        # which is not a valid call; write the group with DataFrame.to_csv.
        bucket_dir = os.path.join(base_partitions_dir, str(bucket))
        os.makedirs(bucket_dir, exist_ok=True)
        target = os.path.join(bucket_dir, name)

        needs_header = not os.path.exists(target) or os.path.getsize(target) == 0
        rows.to_csv(target, mode="a", header=needs_header, index=False)
        
# Seed the partition folders with header-only CSV files and list the folder
# returned by the helper.
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept because other cells may refer to it.
dir = create_blank_partition()
os.listdir(dir)


# Start (or reuse) the Spark session used for preprocessing.
SPARK_SESSION = (
    SparkSession.builder
    .appName("Preprocessing with Spark")
    .getOrCreate()
)

# Read the full used-cars CSV. The path is assembled with os.path.join rather
# than a hard-coded backslash string so that it resolves on every operating
# system, in line with the forward-slash folder paths defined earlier.
df = SPARK_SESSION.read.csv(
    os.path.join("..", "data", "external", "used_cars_data.csv"),
    header=True,
    inferSchema=True,
)

# Show the column names and the types Spark inferred for them.
df.printSchema()
# Using the current apache spark

root
 |-- vin: string (nullable = true)
 |-- back_legroom: string (nullable = true)
 |-- bed: string (nullable = true)
 |-- bed_height: string (nullable = true)
 |-- bed_length: string (nullable = true)
 |-- body_type: string (nullable = true)
 |-- cabin: string (nullable = true)
 |-- city: string (nullable = true)
 |-- city_fuel_economy: string (nullable = true)
 |-- combine_fuel_economy: string (nullable = true)
 |-- daysonmarket: string (nullable = true)
 |-- dealer_zip: string (nullable = true)
 |-- description: string (nullable = true)
 |-- engine_cylinders: string (nullable = true)
 |-- engine_displacement: string (nullable = true)
 |-- engine_type: string (nullable = true)
 |-- exterior_color: string (nullable = true)
 |-- fleet: string (nullable = true)
 |-- frame_damaged: string (nullable = true)
 |-- franchise_dealer: string (nullable = true)
 |-- franchise_make: string (nullable = true)
 |-- front_legroom: string (nullable = true)
 |-- fuel_tank_volume: string (nullable = tr

In [6]:
# Number of rows in the Spark DataFrame `df`.
df.count()

3000507

In [7]:
# Number of columns in the Spark DataFrame `df`.
len(df.columns)

66

In [20]:
# Using Pandas and other libraries for the data cleaning
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import datetime

# pandas display settings: show every column, cap printed rows at 40.
_display_options = {
    "display.max_columns": None,
    "display.max_rows": 40,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

In [22]:
# Columns of vehicles.csv kept for the analysis; all other columns are
# skipped at read time through `usecols`.
cols = ["region","price","year","manufacturer","model",
            "condition","cylinders","fuel","odometer","transmission",
                "drive","size","type","state","lat","long","posting_date"]

# Read the file. The path is assembled with os.path.join rather than a
# hard-coded backslash string so that it resolves on every operating system.
data_f = pd.read_csv(
    os.path.join("..", "data", "external", "vehicles.csv"), sep=",", usecols=cols)

# Work on a copy so the frame as read from disk (`data_f`) stays untouched.
data = data_f.copy()
# Parse the posting timestamp into pandas datetime values.
data['posting_date'] = pd.to_datetime(data['posting_date'])



In [24]:
# Count the missing values in each column of `data`.
data.isnull().sum()

region               0
price                0
year              1205
manufacturer     17646
model             5277
condition       174104
cylinders       177678
fuel              3013
odometer          4400
transmission      2556
drive           130567
size            306361
type             92858
state                0
lat               6549
long              6549
posting_date        68
dtype: int64

In [28]:
def del_var(dataset, threshold=0.5):
    """Drop every column whose share of missing values exceeds ``threshold``.

    The name of each dropped column is printed as ``dropped <name>``.

    Args:
        dataset (pd.DataFrame): Frame to filter. It is not modified.
        threshold (float): Largest tolerated fraction of null values in a
            column. Columns strictly above this fraction are dropped.
            Defaults to 0.5.

    Returns:
        pd.DataFrame: ``dataset`` without the sparse columns. When no column
        exceeds the threshold, ``dataset`` itself is returned.
    """
    # Fraction of nulls per column, computed once for the whole frame.
    null_share = dataset.isnull().mean()
    sparse_columns = list(null_share.index[null_share > threshold])

    for feature in sparse_columns:
        print("dropped {}".format(feature))

    if not sparse_columns:
        return dataset

    # One drop call avoids copying the frame once per removed column.
    return dataset.drop(columns=sparse_columns)

# Remove the columns of `data` in which more than half of the values are missing.
data_ = del_var(data)


dropped size


In [26]:
# Preview the first rows of `data_`.
# NOTE(review): this cell's execution count (26) is lower than that of the
# cell assigning `data_` (28); re-run in order on a fresh kernel to confirm
# the displayed output.
data_.head()

Unnamed: 0,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,transmission,drive,type,state,lat,long,posting_date
0,prescott,6000,,,,,,,,,,,az,,,
1,fayetteville,11900,,,,,,,,,,,ar,,,
2,florida keys,21000,,,,,,,,,,,fl,,,
3,worcester / central MA,1500,,,,,,,,,,,ma,,,
4,greensboro,4900,,,,,,,,,,,nc,,,


In [13]:
# Collect the distinct values of the `type` column.
# NOTE(review): this cell's execution count (13) does not follow the order of
# the cells in this notebook, so the recorded output may come from a different
# `df` than the one loaded above — confirm with Restart & Run All.
# NOTE(review): recorded values such as '-121.7473' or ' used cars' look like
# data from other columns; presumably some rows were mis-parsed — TODO confirm.
df.select("type").distinct().collect()

[Row(type='van'),
 Row(type='mini-van'),
 Row(type='offroad'),
 Row(type='wagon'),
 Row(type=None),
 Row(type='coupe'),
 Row(type='bus'),
 Row(type='SUV'),
 Row(type='other'),
 Row(type='convertible'),
 Row(type='-121.7473'),
 Row(type='sedan'),
 Row(type='hatchback'),
 Row(type='truck'),
 Row(type='pickup'),
 Row(type=' used cars'),
 Row(type=' 645'),
 Row(type=' accuracy'),
 Row(type=' GMC '),
 Row(type=' Orlando Car Deals'),
 Row(type=' dually'),
 Row(type=' S550')]

In [None]:
# For the normal operation