# Used Cars Data Analysis and Visualization (EDA)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# visualization tools
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import folium
from folium.plugins import HeatMap
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

### Reading Data

In [None]:
# Load the vehicles CSV from the Kaggle input directory into a DataFrame.
df=pd.read_csv("/kaggle/input/craigslist-carstrucks-data/vehicles.csv")

In [None]:
df.sample(5)

In [None]:
df.info()

In [None]:
df.isnull().sum()

# Missingno - Missing Data

# Data Cleaning
### Remove unnecessary columns

In [None]:
df.columns

In [None]:
# Drop the URL, image URL and VIN columns in place; the notebook treats them
# as unnecessary.
unused_columns = ['url', 'image_url', 'VIN']
df.drop(columns=unused_columns, inplace=True)

In [None]:
df.isnull().sum()

In [None]:
# We will only look at these columns in the analysis.
# Take an explicit copy: `cars` gets new/overwritten columns below and in
# later cells, and assigning into a frame obtained by slicing `df` makes
# pandas emit SettingWithCopyWarning. With .copy() the frame is unambiguously
# independent of `df`.
cars = df[['id', 'price', 'year', 'manufacturer', 'model', 'odometer', 'type', 'lat', 'long']].copy()
cars['age'] = 2020 - cars['year']  # add an age column (years, relative to 2020)
cars.columns

In [None]:
cars.info()

In [None]:
def get_missing_info(df):
    """Summarise missing values and the most frequent value of each column.

    Prints the overall percentage of empty cells and the names of the columns
    whose most frequent value covers more than 97% of the rows, then returns
    a per-column summary table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``num_missing``,
        ``percent_missing``, ``mode`` and ``percent_mode``. A column without
        any non-null value gets ``NaN`` as ``mode`` and ``0.0`` as
        ``percent_mode``.
    """
    num_rows = len(df)
    num_entries = df.shape[0] * df.shape[1]
    num_missing = df.isna().sum()
    null_entries = num_missing.sum()

    # Guard the ratios so an empty frame reports 0% instead of dividing by 0.
    percent_empty = null_entries / num_entries * 100 if num_entries else 0.0
    percent_missing = num_missing / num_rows * 100 if num_rows else num_missing * 0.0

    # Compute the modes once and reuse them below. Row 0 holds the first mode
    # of every column; a column with no non-null values holds NaN there, and
    # the mode table has no rows at all when no column has a mode.
    modes = df.mode()
    if len(modes):
        col_modes = modes.iloc[0]
    else:
        col_modes = pd.Series(float('nan'), index=df.columns, dtype=object)

    percent_mode = []
    for col in df:
        top_value = col_modes[col]
        if num_rows == 0 or pd.isna(top_value):
            # No mode exists; isin([NaN]) would count the missing cells as
            # matches, so report 0% explicitly.
            percent_mode.append(0.0)
        else:
            percent_mode.append(df[col].isin([top_value]).sum() / num_rows * 100)

    missing_value_df = pd.DataFrame({'num_missing': num_missing,
                                     'percent_missing': percent_missing,
                                     'mode': col_modes,
                                     'percent_mode': percent_mode})
    print('total empty percent:', percent_empty, '%')
    print('columns that are more than 97% mode:', missing_value_df.loc[missing_value_df['percent_mode']>97].index.values)
    return missing_value_df
get_missing_info(cars)

In [None]:
import math
def modef(x):
    """Return the most frequent value of ``x`` as a single scalar.

    Used as the aggregation function for a groupby, so it must reduce each
    group to one value.

    Parameters
    ----------
    x : pandas.Series
        Values of one group.

    Returns
    -------
    object
        The first of the modes when there are one or more of them, or the
        string ``'unknown'`` when ``x`` has no non-null values.
    """
    modes = pd.Series.mode(x)
    if modes.empty:
        return 'unknown'
    # Always hand back a scalar; returning the one-element Series itself made
    # the return type depend on how many modes the group happened to have.
    return modes.iloc[0]

def isnan(x):
    """Return True when ``x`` converts to a float NaN, otherwise False.

    Values that cannot be converted to ``float`` (for example ``None`` or
    non-numeric strings) are reported as not-NaN.
    """
    try:
        return math.isnan(float(x))
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a NaN"; a bare except would also
        # swallow KeyboardInterrupt and SystemExit.
        return False

def fill_type(x):
    """Return the ``type`` for one row, filling it from the row's model if missing.

    Relies on the module-level ``isnan`` helper and the ``model_types``
    lookup (indexed by model) defined elsewhere in the notebook.

    Parameters
    ----------
    x : mapping-like row
        Must provide the keys ``'type'`` and ``'model'``.

    Returns
    -------
    object
        The row's own ``type`` when it is not NaN; otherwise the entry of
        ``model_types`` for the row's model, or ``'unknown'`` when that
        lookup fails.
    """
    if not isnan(x['type']):
        return x['type']
    try:
        return model_types[x['model']]
    except (LookupError, TypeError):
        # The model is missing from the lookup (or is not a usable key).
        # Catch only lookup failures so that interrupting a long row-wise
        # apply is not silently swallowed.
        return 'unknown'

# Per-model lookup: aggregate the `type` values of each model with modef.
# fill_type reads this global, so it must be defined before the apply below.
model_types = cars.groupby(['model'])['type'].agg(modef)
# Recompute `type` row by row; fill_type keeps existing values and fills
# missing ones from model_types.
cars['type'] = cars.apply(fill_type, axis=1)

In [None]:
cars.info()

In [None]:
cars.year.drop_duplicates()

In [None]:
cars.columns.tolist()

In [None]:
# Fill the remaining missing values of every column with that column's
# most frequent value.
filled_mode = cars.columns.tolist()
for column_name in filled_mode:
    most_common = cars[column_name].mode()[0]
    cars[column_name] = cars[column_name].fillna(most_common)

In [None]:
cars.isnull().sum()

# Seaborn - Bar Plot

In [None]:
# cat_type = ['manufacturer', 'price', 'model', 'type']
# fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(18, 10)) 

# for i, var in enumerate(cat_type):
#     row = i//2
#     pos = i % 2    
#     plot = sns.countplot(x=var, data=cars, order = cars[var].value_counts().index, ax=axs[row][pos])
#     var = plot.set_xticklabels(plot.get_xticklabels(), rotation=90)
# fig.tight_layout(pad=2.0)

In [None]:
# Bar plot of odometer by manufacturer, with rows ordered by descending odometer.
df = df.sort_values(by='odometer', ascending=False)
plt.figure(figsize=(25, 15))
sns.barplot(data=df, x='manufacturer', y='odometer')
plt.xticks(rotation=90)
plt.xlabel('Manufacturer')
plt.ylabel('Odometer')
plt.show()

# Scatter Plot

In [None]:
# Plot the raw data: odometer on x, age on y, points coloured by price.
fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(12, 10))
xs, ys, zs = cars['odometer'], cars['age'], cars['price']
axs.scatter(xs, ys, c=zs)
axs.set_xlim(left=0, right=6e5)
axs.set_title('raw data')


# Box Plot

In [None]:
# Box plot of the price column, drawn without outlier points.
plt.figure(figsize=(3, 6))
sns.boxplot(data=df, y='price', showfliers=False);

# Heatmap

In [None]:
# Count listings per manufacturer for each year after 2000, then pivot the
# counts into a year x manufacturer table for the heatmap.
after_2000 = cars['year'] > 2000
manf_ser = cars[after_2000].groupby('year')['manufacturer'].value_counts()
manf_ser_df = pd.DataFrame(manf_ser.unstack())

plt.subplots(figsize=(12, 7))
sns.heatmap(manf_ser_df, cmap='Blues', linecolor='white', linewidth=1)