In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pylab as plt
import seaborn as sns

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the Craigslist cars/trucks listings CSV into the raw DataFrame `df`.
# The path is relative to the notebook's working directory.
df = pd.read_csv('../input/craigslist-carstrucks-data/vehicles.csv')

In [None]:
# (rows, columns) of the raw frame as loaded.
df.shape

In [None]:
# Preview the first two rows of the raw data.
df.head(2)

In [None]:
# Column labels available in the raw data.
df.columns

In [None]:
# Number of distinct values in each column (axis=0 counts down each column).
df.nunique(axis=0)

In [None]:
# Summary statistics rendered as fixed-point strings instead of scientific notation.
df.describe().apply(lambda col: col.map('{:f}'.format))

In [None]:
# Distinct labels found in the 'condition' column before any reclassification.
df.condition.unique()


In [None]:
# Reclassify condition column

def clean_condition(row):
    """Collapse a listing's ``condition`` label into a coarser bucket.

    'good' and 'fair' both become 'good'; 'excellent' and 'like new' both
    become 'excellent'. Any other value is returned unchanged.

    Parameters
    ----------
    row : object
        Anything exposing a ``condition`` attribute, e.g. a row handed over
        by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    The reclassified label, or the original value when it is in neither group.
    """
    label = row.condition
    if label in ('good', 'fair'):
        return 'good'
    return 'excellent' if label in ('excellent', 'like new') else label


# Clean dataframe
def clean_df(playlist):
    """Return a copy of a listings frame with 'condition' reclassified.

    'fair' is folded into 'good' and 'like new' into 'excellent' (the same
    grouping ``clean_condition`` applies per row); every other value,
    including missing ones, is left as is. The input frame is not modified.

    Parameters
    ----------
    playlist : pandas.DataFrame
        Frame containing a 'condition' column. The parameter name is kept
        for backward compatibility; it is the frame to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame with the reclassified 'condition' column.
    """
    # Work on the frame that was passed in, not on the notebook-global `df`.
    df_cleaned = playlist.copy()
    # Vectorized relabel; avoids a Python-level call per row.
    df_cleaned['condition'] = df_cleaned['condition'].replace(
        {'fair': 'good', 'like new': 'excellent'}
    )
    return df_cleaned


# Get df with reclassified 'condition' column
# Apply the reclassification to the raw frame and list the resulting labels.
df_cleaned = clean_df(df)
print(df_cleaned['condition'].unique())

In [None]:
# Shape of the frame returned by clean_df.
df_cleaned.shape

## Removing Redundant Variables

In [None]:
# Drop the URL columns. drop() already returns a new frame, so no explicit
# copy is needed; errors='ignore' keeps this cell safe to re-run once the
# columns are gone.
df_cleaned = df_cleaned.drop(columns=['url', 'region_url', 'image_url'], errors='ignore')

In [None]:
# Count of missing values in each column.
df_cleaned.isna().sum()

In [None]:
# Total row count; na_filter divides each column's null count by this value.
df_cleaned.shape[0]

In [None]:
# Remove any columns in which 40% or more of the values are null.
# Start from the per-column count of missing values.
NA_val = df_cleaned.isna().sum()
def na_filter(na, threshold=.4, n_rows=None):
    """Select the columns whose fraction of missing values is below ``threshold``.

    Parameters
    ----------
    na : mapping or pandas.Series
        Column label -> number of missing values (e.g. ``df.isna().sum()``).
    threshold : float, default 0.4
        Exclusive upper bound on the allowed null fraction; a column at
        exactly ``threshold`` is rejected.
    n_rows : int, optional
        Number of rows the counts in ``na`` were computed over. When omitted,
        the row count of the notebook-global ``df_cleaned`` is used, which
        matches the previous behaviour.

    Returns
    -------
    list
        Labels of the columns that pass the threshold, in the order of ``na``.
    """
    if n_rows is None:
        n_rows = df_cleaned.shape[0]
    if n_rows == 0:
        # No rows means no evidence of missing data; keep every column rather
        # than dividing by zero.
        return list(na.keys())
    # only select variables that pass the threshold
    return [col for col in na.keys() if na[col] / n_rows < threshold]
# Keep only the columns that passed the null-fraction filter, then list them.
df_cleaned = df_cleaned.loc[:, na_filter(NA_val)]
df_cleaned.columns

In [None]:
# Shape after dropping the columns that failed the null-fraction filter.
df_cleaned.shape

**Removing Outliers**

In [None]:
# Peek at the boolean mask used by the year filter in the next cell
# (True where year > 1990).
(df_cleaned['year'] > 1990).head()

In [None]:
# Keep only rows that satisfy all three range checks at once:
# price within [999.99, 99999.00], year after 1990, odometer below 899999.
df_cleaned = df_cleaned.loc[
    lambda frame: frame['price'].between(999.99, 99999.00)
    & (frame['year'] > 1990)
    & (frame['odometer'] < 899999.00)
]
# Summary statistics of the filtered frame, shown in fixed-point notation.
df_cleaned.describe().apply(lambda col: col.map('{:f}'.format))

**Removing Rows with Null Values**

In [None]:
# Shape before dropping the rows that still contain nulls.
df_cleaned.shape

In [None]:
# Discard every row that has at least one missing value, then report the new shape.
df_cleaned = df_cleaned.dropna(axis='index', how='any')
df_cleaned.shape

**Analyzing relationships between variables**

In [None]:
# Pairwise correlations of the numeric columns only. pandas >= 2.0 raises on
# non-numeric columns instead of silently dropping them, so select the numeric
# columns explicitly (this form also works on older pandas).
df_cleaned.select_dtypes(include='number').corr()

In [None]:
# I used sns.heatmap()
# to plot a correlation matrix of all of the numeric variables in the used car dataset

# calculate correlation matrix on the numeric columns only: pandas >= 2.0
# raises on non-numeric columns instead of silently dropping them
corr = df_cleaned.select_dtypes(include='number').corr()
# plot the heatmap
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, cmap=sns.diverging_palette(220, 20, as_cmap=True))

We can see that there is a positive correlation between price and year and a negative correlation between price and odometer. This makes sense, as newer cars are generally more expensive and cars with more mileage are relatively cheaper. We can also see that there is a negative correlation between year and odometer — the newer a car is, the fewer miles it has on it.

In [None]:
# Scatter plot of price against odometer reading
df_cleaned.plot.scatter(x='odometer', y='price')

The scatter plot shows this relationship: price drops steeply at first, and the decline becomes less steep as more mileage is added. This is why people say that it’s not a good investment to buy a brand new car!

In [None]:
# Scatter plot of price against model year
df_cleaned.plot.scatter(x='year', y='price')

The scatterplot above shows the relationship between year and price — the newer the car is, the more expensive it’s likely to be.

In [None]:
# sns.pairplot() is a great way to create scatterplots
# between all of your variables
# NOTE(review): this is called on the whole of df_cleaned; it may be slow on
# the full dataset — consider plotting a sample if it is. TODO confirm.

sns.pairplot(df_cleaned)

**Histogram**

Use a histogram when you only want to explore a single variable by itself.

In [None]:
# Distribution of odometer readings in 50 bins
df_cleaned['odometer'].plot.hist(bins=50, figsize=(12, 6), facecolor='grey', edgecolor='black')


In [None]:
# Distribution of model years in 20 bins
df_cleaned['year'].plot.hist(bins=20, figsize=(12, 6), facecolor='grey', edgecolor='black')

**Boxplot**

Another way to visualize the distribution of a variable is a boxplot.

In [None]:
# Box plot of the price column
df_cleaned.boxplot(column='price')

 Immediately, you can see that there are a number of outliers for price in the upper range and that most of the prices fall between 0 and $40,000.