# UC San Diego MicroMasters - Python for Data Science
# Craigslist Used Cars Dataset

In [None]:
# import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Import and read dataset

In [None]:
# First, look at downloaded file from https://www.kaggle.com/austinreese/craigslist-carstrucks-data/download.
# List the directory with the standard library instead of shelling out to `ls`,
# so the cell also runs where no `ls` executable exists (e.g. Windows).
from pathlib import Path

# like plain `ls`: sorted names, hidden (dot-prefixed) entries left out
visible_entries = sorted(
    entry.name for entry in Path("./data").iterdir() if not entry.name.startswith(".")
)
print("\n".join(visible_entries))

In [None]:
# approach taken from https://www.kaggle.com/mchirico/how-to-read-datasets
# unpack the downloaded archive next to it, inside ./data
import zipfile

Dataset = "craigslist-carstrucks-data"
archive_path = "./data/" + Dataset + ".zip"

# extract every member so the CSV can be read directly afterwards
with zipfile.ZipFile(archive_path, "r") as archive:
    archive.extractall("./data")

In [None]:
# load the extracted CSV into a pandas dataframe
cars_raw = pd.read_csv(filepath_or_buffer="./data/vehicles.csv")

In [None]:
# report the dimensions of the raw dataframe
n_rows, n_cols = cars_raw.shape
print("Number of rows:", n_rows)
print()
print("Number of columns", n_cols)

## Clean the dataset

In [None]:
# share of NON-null values per column, as a whole-number percentage of all rows
# (scale to percent before rounding; rounding first and multiplying afterwards
# leaves float artifacts such as 56.99999999999999 in the displayed values)
no_null_prc = (cars_raw.count() / len(cars_raw) * 100).round()
no_null_prc

In [None]:
# keep only the columns whose non-null percentage is at least 80
well_populated = no_null_prc.loc[no_null_prc >= 80]
cols_to_use = well_populated.index.tolist()
cols_to_use

In [None]:
# columns left out of the analysis: the first four columns of the raw frame
# plus the 'description' and 'image_url' columns
cols_not_needed = [*cars_raw.columns[:4], 'description', 'image_url']
cols_not_needed

In [None]:
# working frame: well-populated columns minus the excluded ones,
# restricted to rows without any missing value
selected_cols = [col for col in cols_to_use if col not in cols_not_needed]
cars = cars_raw[selected_cols].dropna()
cars.sample(10)

## Exploratory Data Analysis

In [None]:
# distribution of price : mean value is $19,500 ; max value is $4.3 billion!
# summary statistics with the 90th percentile added, one row per numeric column
summary = cars.describe(percentiles=[0.25, 0.5, 0.75, 0.9])
summary.T

In [None]:
# drop outliers: keep listings priced at most 30000 with at most 180000 on the odometer
price_ok = cars["price"].le(30000)
odometer_ok = cars["odometer"].le(180000)
cars = cars.loc[price_ok & odometer_ok]

In [None]:
# box plot of price
# pass the series as `x=` explicitly: a positional argument is interpreted as
# `data` by seaborn >= 0.12 and as `x` (with a FutureWarning) by seaborn 0.11,
# so the plot orientation would otherwise depend on the installed version
sns.boxplot(x=cars.price);

In [None]:
# there are 36189 vehicles with prices = 0. could impact modelling later on
is_free = cars["price"].eq(0)
int(is_free.sum())

In [None]:
# keep only listings with a strictly positive price
cars = cars.loc[cars["price"].gt(0)]

In [None]:
# box plot of odometer
# pass the series as `x=` explicitly: a positional argument is interpreted as
# `data` by seaborn >= 0.12 and as `x` (with a FutureWarning) by seaborn 0.11,
# so the plot orientation would otherwise depend on the installed version
sns.boxplot(x=cars.odometer);

In [None]:
# a large majority of values in transmission are labelled as "other"
cars["transmission"].value_counts()

In [None]:
# assume transmission labeled "other" is automatic
# `cars` is a filtered slice of an earlier frame, so assigning a column to it
# in place raises SettingWithCopyWarning; build a new frame with the recoded
# column instead
is_other = cars["transmission"] == "other"
cars = cars.assign(transmission=cars["transmission"].mask(is_other, "automatic"))

In [None]:
# most `title_status` values are one of "clean", "rebuilt", "salvage" or "lien"
cars["title_status"].value_counts()

In [None]:
# keep only the four common title statuses; this drops 'missing' and 'parts only'
keep = ['clean', 'rebuilt', 'salvage', 'lien']
has_kept_status = cars['title_status'].isin(keep)
cars = cars.loc[has_kept_status]

In [None]:
# price distribution for each fuel type
fuel_ax = sns.boxplot(data=cars, x='fuel', y='price')
fuel_ax.set_title('Used Cars Dataset: fuel type');

In [None]:
# vehicles with fuel type "other" can be removed from the analysis
# (this cell only counts the listings per fuel type)
cars["fuel"].value_counts()

In [None]:
# number of listings for each `year` value (no year filter is applied here)
# after count(), every column holds the per-year count of its non-null values,
# so the 'price' column is used as the number of listings
grouped_by_year = cars.groupby('year').count().reset_index()
listings_ax = sns.lineplot(x='year', y='price', data=grouped_by_year)
# the plotted values are counts, not prices: label the axis accordingly
listings_ax.set_ylabel('number of listings')
listings_ax.set_title('Used Cars Dataset : number of listings vs. time');

In [None]:
# restrict to listings whose year lies in 1999..2019 (both ends included)
in_year_range = cars["year"].between(1999, 2019)
cars = cars.loc[in_year_range]

In [None]:
# vehicles with an automatic transmission tend to be at slightly higher price
transmission_ax = sns.boxplot(data=cars, x='transmission', y='price')
transmission_ax.set_title('Used Cars Dataset: transmission type');

In [None]:
# interestingly, vehicles with a lien tend to be higher priced
title_ax = sns.boxplot(data=cars, x='title_status', y='price')
title_ax.set_title('Used Cars Dataset: status title');