# Exploratory Data Analysis

In [None]:
import os
import warnings
from collections import Counter
from itertools import islice

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy.stats as stats

import matplotlib.pyplot as plt
import seaborn as sns
import shap
import eli5

# Suppress every warning for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings from the plotting libraries.
warnings.filterwarnings('ignore')
# Apply matplotlib's 'fivethirtyeight' style sheet to all figures below.
plt.style.use('fivethirtyeight')
# Render matplotlib figures inline in the cell output.
%matplotlib inline

# Load SHAP's JavaScript so its interactive plots can be displayed in the notebook.
shap.initjs()

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Read the cars catalogue CSV into the DataFrame used by every cell below.
df = pd.read_csv('/kaggle/input/usedcarscatalog/cars.csv')

In [None]:
# (rows, columns) of the loaded catalogue.
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

## odometer_value
One of the very important parameters for the used car is the odometer state which shows how many kilometers the car has been driven over the years in total. There is a feature for each car called odometer_value which shows exactly that.

In [None]:
# Histogram + KDE of the odometer readings over (almost) the whole observed range.
feature = 'odometer_value'
odometer = df[feature]

plt.figure(figsize=(16, 10))
# Pad the x-axis by 10,000 on both sides of the observed min/max.
plt.xlim(odometer.min() - 10000, odometer.max() + 10000)
# Only readings below 1,200,000 are drawn.
sns.distplot(
    odometer[odometer < 1200000],
    hist=True,
    kde=True,
    hist_kws={'rwidth': 0.9},
    kde_kws={'shade': True, 'linewidth': 3},
    color='green',
    label=feature,
    axlabel=feature,
)
plt.title(f"{feature.capitalize()}")
plt.show()

# Summary statistics are computed on the unfiltered column.
print(f'Mean value for {feature}: {odometer.mean()}')
print(f'Median value for {feature}: {odometer.median()}')
print(f'Min value for {feature}: {odometer.min()}')
print(f'Max value for {feature}: {odometer.max()}')

In [None]:
# Same odometer distribution, with the x-axis cut off at 710,000 to zoom in on the bulk.
feature = 'odometer_value'
odometer = df[feature]

plt.figure(figsize=(16, 10))
plt.xlim(odometer.min() - 10000, 710000)
sns.distplot(
    odometer[odometer < 1200000],
    hist=True,
    kde=True,
    hist_kws={'rwidth': 0.9},
    kde_kws={'shade': True, 'linewidth': 3},
    color='green',
    label=feature,
    axlabel=feature,
)
plt.title(f"{feature.capitalize()}")
plt.show()

We love nice numbers: 200k, 250k, 300k, 350k, 400k, 450k, 500k! There are also plenty of cars with 0 odometer state (brand new).

## year_produced


In [None]:
# Number of listings per production year, bars ordered chronologically.
feature = 'year_produced'
plt.figure(figsize=(12, 8))
# sort_index() orders the counts by year. No `xticks` are passed: on a bar plot
# the bars sit at positions 0..n-1, so using the year values themselves as tick
# positions would put the ticks in the wrong place (or outside the axis).
df[feature].value_counts().sort_index().plot(kind='bar')
plt.xlabel(feature)
plt.ylabel('Number of cars')
plt.show()

## price
`price_usd` is the target variable for the model-building stage.

In [None]:
# Histogram + KDE of the listing price over the whole observed range.
feature = 'price_usd'
price = df[feature]

plt.figure(figsize=(10, 7))
plt.xlim(-1000, price.max())
# Only prices below 200,000 are drawn, in 90 bins.
sns.distplot(
    price[price < 200000],
    hist=True,
    kde=True,
    bins=90,
    hist_kws={'rwidth': 0.9},
    kde_kws={'shade': True, 'linewidth': 3},
    color='green',
    label=feature,
    axlabel=feature,
)
plt.title(f"{feature.capitalize()}")
plt.show()

# Summary statistics are computed on the unfiltered column.
print(f'Mean value for {feature}: {price.mean()}')
print(f'Median value for {feature}: {price.median()}')
print(f'Min value for {feature}: {price.min()}')
print(f'Max value for {feature}: {price.max()}')

In [None]:
# Price distribution restricted to listings below 21,000, with default binning.
feature = 'price_usd'
price = df[feature]

plt.figure(figsize=(10, 7))
plt.xlim(-1000, 21000)
sns.distplot(
    price[price < 21000],
    hist=True,
    kde=True,
    hist_kws={'rwidth': 0.9},
    kde_kws={'shade': True, 'linewidth': 3},
    color='green',
    label=feature,
    axlabel=feature,
)
plt.title(f"{feature.capitalize()}")
plt.show()

## drivetrain

In [None]:
# Number of listings per drivetrain type.
feature = 'drivetrain'
plt.figure(figsize=(5, 5))
drivetrain_counts = df[feature].value_counts()
drivetrain_counts.plot.bar()
plt.show()

In [None]:
# Price distribution per drivetrain type.
# `feature` is set here explicitly so the cell does not depend on the previous
# cell having been run last; the `order` list below only fits the drivetrain column.
feature = 'drivetrain'
plt.figure(figsize=(4.5, 7))
# Clip the y-axis so the boxes stay readable despite high-price outliers.
plt.ylim(-1000, 40000)
ax = sns.boxplot(x=feature,
                 y='price_usd',
                 data=df,
                 linewidth=1.5,
                 order=["front", "rear", "all"])

## engine_type

In [None]:
# Column pair reused by the following engine_type cells.
cat_feature, numerical_feature = 'engine_type', 'price_usd'

# Count / unique / top / freq summary of the categorical column.
df[cat_feature].describe()

In [None]:
# Price distribution per engine type, in a fixed category order.
engine_type_order = ["gasoline", "diesel", "electric"]
plt.figure(figsize=(4.5, 8))
ax = sns.boxplot(data=df,
                 x=cat_feature,
                 y=numerical_feature,
                 order=engine_type_order,
                 linewidth=1.5)

In [None]:
# Number of listings per engine type.
# Only one figure is created; a second plt.figure() call before plotting would
# leave an extra, empty figure behind.
plt.figure(figsize=(4.5, 6))
df[cat_feature].value_counts()\
                .plot(kind='bar')
plt.show()

In [None]:
# Show a few electric cars with their manufacturer and price.
electric_cars = df.loc[df[cat_feature] == 'electric',
                       ['manufacturer_name', 'price_usd']]
# Cap n at the number of matching rows (sample() raises when n exceeds it) and
# fix the seed so a re-run shows the same rows.
electric_cars.sample(min(5, len(electric_cars)), random_state=42)

## engine_capacity

In [None]:
# Summary statistics of the engine_capacity column.
feature = 'engine_capacity'
df[feature].describe()

In [None]:
# Histogram + KDE of engine capacity.
feature = 'engine_capacity'
plt.figure(figsize=(10, 7))
plt.xlim(0.5, 8)
# Only capacities below 10 are drawn.
sns.distplot(df.loc[df[feature] < 10, feature],
             hist=True, kde=True,
             kde_kws={'shade': True, 'linewidth': 3},
             hist_kws={'rwidth': 0.9},
             label=feature,
             color='green',
             axlabel=feature)
plt.title(f"{feature.capitalize()}")
plt.show()

# Statistics are computed on the unfiltered column ("falue" typo fixed).
print(f'Mean value for {feature}: {df[feature].mean()}')
print(f'Median value for {feature}: {df[feature].median()}')

## is_exchangeable
The `is_exchangeable` parameter indicates whether the seller is willing to exchange the vehicle for another one.

In [None]:
# Point cat_feature at this section's column before describing it; otherwise it
# would still hold the value assigned in the engine_type section.
cat_feature = 'is_exchangeable'
df[cat_feature].describe()

In [None]:
# Number of listings whose is_exchangeable value equals True. The column is
# named explicitly so the count does not depend on the current cat_feature.
df.loc[df['is_exchangeable'] == True].shape[0]

In [None]:
# Price distribution split by the is_exchangeable value.
cat_feature, numerical_feature = 'is_exchangeable', 'price_usd'

plt.figure(figsize=(4.5, 8))
ax = sns.boxplot(data=df,
                 x=cat_feature,
                 y=numerical_feature,
                 linewidth=1.5)

In [None]:
# Number of listings per value of the current categorical column.
# Only one figure is created; a second plt.figure() call before plotting would
# leave an extra, empty figure behind.
plt.figure(figsize=(4.5, 6))
df[cat_feature].value_counts()\
                .plot(kind='bar')
plt.show()

## up_counter
Indicates how many times the listing has been bumped ("upped") in the catalog to raise its position.

In [None]:
# Summary statistics of the up_counter column.
feature = 'up_counter'
df[feature].describe()

The distribution is highly skewed to the right.

In [None]:
# Linear-scale histogram of up_counter in 20 bins.
feature = 'up_counter'
plt.figure(figsize=(10, 5))
# `bins` (the bin edges) is reused by the log-scale histogram in the next cell.
hist, bins, _ = plt.hist(df[feature], bins=20, rwidth=0.9)
plt.ylabel('Number of cars')
plt.xlabel(feature)
plt.show()

# "falue" typo fixed in both messages.
print(f'Mean value for {feature}: {df[feature].mean()}')
print(f'Median value for {feature}: {df[feature].median()}')

Plot using logarithmic scale.

In [None]:
# Histogram of up_counter with logarithmically spaced bins.
# Uses `bins` and `feature` from the previous cell.
# log10(0) is -inf, which would make logspace return unusable edges, so the
# lowest edge is clamped to 1. Values below 1, if any, then fall outside the bins.
lower_edge = max(bins[0], 1)
logbins = np.logspace(np.log10(lower_edge), np.log10(bins[-1]), 20)
plt.figure(figsize=(10, 5))
plt.hist(df[feature], bins=logbins, rwidth=0.9)
plt.xscale('log')
plt.ylabel('Number of cars')
# The ticks show raw up_counter values on a log-scaled axis, not log(up_counter).
plt.xlabel('up_counter (log scale)')
plt.show()

## manufacturer_name

In [None]:
# Add a brand_count column: for every row, the number of rows that share its
# manufacturer_name. NOTE: this adds a column to df in place, so it is visible
# to every later cell.
feature = 'manufacturer_name'
df['brand_count'] = df.groupby(feature)[feature].transform('count')

In [None]:
# Listing counts for manufacturers with more than 200 listings.
plt.figure(figsize=(8, 5))
popular_brands = df.loc[df['brand_count'] > 200, 'manufacturer_name']
popular_brands.value_counts().plot.bar()
plt.xlabel(feature)
plt.ylabel('Number of cars')
plt.show()

## transmission

In [None]:
# Price distribution per transmission type.
cat_feature, numerical_feature = 'transmission', 'price_usd'

plt.figure(figsize=(4.5, 8))
# Clip the y-axis at 60,000.
plt.ylim(-1000, 60000)
ax = sns.boxplot(data=df,
                 x=cat_feature,
                 y=numerical_feature,
                 order=['mechanical', 'automatic'],
                 linewidth=1.5)

In [None]:
# Number of listings per value of the current categorical column.
# Only one figure is created; a second plt.figure() call before plotting would
# leave an extra, empty figure behind.
plt.figure(figsize=(4.5, 6))
df[cat_feature].value_counts()\
                .plot(kind='bar')
plt.show()

## model_name

In [None]:
# Count listings per model; value_counts() sorts by count, most frequent first,
# and the dict keeps that order.
models_dict = df['model_name'].value_counts().to_dict()
print(len(models_dict), 'models in total!\n\n')

# Print the 20 most frequent models. islice is imported in the notebook's
# import cell; iterating items() lazily avoids building a list and looking
# every key up a second time.
for model, n_cars in islice(models_dict.items(), 20):
    print(model, '---->', n_cars, 'cars')

The most popular model is VW Passat, as expected!

## color

In [None]:
# Summary of the color column.
df['color'].describe()

In [None]:
# Price distribution per colour, in a fixed colour order.
cat_feature, numerical_feature = 'color', 'price_usd'
color_order = ['black', 'silver', 'blue', 'white',
               'grey', 'red', 'green', 'other',
               'brown', 'violet', 'yellow', 'orange']

plt.figure(figsize=(10, 8))
plt.ylim(-1000, 60000)
ax = sns.boxplot(data=df,
                 x=cat_feature,
                 y=numerical_feature,
                 order=color_order,
                 linewidth=1.5)

In [None]:
# Number of listings per value of the current categorical column.
plt.figure(figsize=(8, 6))
category_counts = df[cat_feature].value_counts()
category_counts.plot.bar()
plt.show()

## engine_fuel

In [None]:
# Price distribution per fuel type.
cat_feature, numerical_feature = 'engine_fuel', 'price_usd'

plt.figure(figsize=(8, 6))
plt.ylim(-1000, 60000)
ax = sns.boxplot(data=df,
                 x=cat_feature,
                 y=numerical_feature,
                 linewidth=1.5)

In [None]:
# All rows whose current categorical column equals 'electric', reduced to four
# descriptive columns.
is_electric = df[cat_feature] == 'electric'
shown_columns = ['manufacturer_name', 'model_name', 'price_usd', 'year_produced']
df.loc[is_electric][shown_columns]

## engine_has_gas
*engine_has_gas == True* signifies that the car has been modified so that it can also run on propane: an additional propane reservoir is installed, usually in the trunk, along with extra tubing that delivers the gas to the engine.

In [None]:
# Price distribution split by the engine_has_gas value.
cat_feature, numerical_feature = 'engine_has_gas', 'price_usd'

plt.figure(figsize=(8, 6))
plt.ylim(-1000, 60000)
ax = sns.boxplot(data=df,
                 x=cat_feature,
                 y=numerical_feature,
                 linewidth=1.5)

In [None]:
# Number of listings per value of the current categorical column.
plt.figure(figsize=(5, 4))
category_counts = df[cat_feature].value_counts()
category_counts.plot.bar()
plt.show()

In [None]:
# Count of rows where engine_has_gas equals True.
has_gas = df['engine_has_gas']
has_gas[has_gas == True].value_counts()

In [None]:
# Count of rows where engine_has_gas equals False.
has_gas = df['engine_has_gas']
has_gas[has_gas == False].value_counts()

Less than 4% of the cars have gas equipment installed.

## body_type

In [None]:
# Price distribution per body type, in a fixed category order.
cat_feature, numerical_feature = 'body_type', 'price_usd'
body_type_order = ['sedan', 'hatchback', 'universal',
                   'suv', 'minivan', 'minibus', 'van',
                   'coupe', 'liftback', 'pickup',
                   'cabriolet', 'limousine']

plt.figure(figsize=(10, 6))
plt.ylim(-1000, 40000)
ax = sns.boxplot(data=df,
                 x=cat_feature,
                 y=numerical_feature,
                 order=body_type_order,
                 linewidth=1.5)

In [None]:
# Number of listings per value of the current categorical column.
plt.figure(figsize=(10, 5))
category_counts = df[cat_feature].value_counts()
category_counts.plot.bar()
plt.show()

## has_warranty

In [None]:
# Price distribution split by the has_warranty value.
cat_feature, numerical_feature = 'has_warranty', 'price_usd'

plt.figure(figsize=(4, 6))
plt.ylim(-1000, 100000)
ax = sns.boxplot(data=df,
                 x=cat_feature,
                 y=numerical_feature,
                 linewidth=1.5)

In [None]:
# Number of listings per value of the current categorical column.
plt.figure(figsize=(4, 5))
category_counts = df[cat_feature].value_counts()
category_counts.plot.bar()
plt.show()

In [None]:
# Show a few rows where the current categorical column equals True.
flagged_cars = df.loc[df[cat_feature] == True,
                      ['manufacturer_name', 'model_name', 'year_produced', 'price_usd']]
# Cap n at the number of matching rows (sample() raises when n exceeds it) and
# fix the seed so a re-run shows the same rows.
flagged_cars.sample(min(7, len(flagged_cars)), random_state=42)

## state

In [None]:
# Price distribution per value of the state column.
cat_feature, numerical_feature = 'state', 'price_usd'

plt.figure(figsize=(4, 6))
plt.ylim(-1000, 100000)
ax = sns.boxplot(data=df,
                 x=cat_feature,
                 y=numerical_feature,
                 linewidth=1.5)

In [None]:
# Number of listings per value of the current categorical column.
plt.figure(figsize=(4, 5))
category_counts = df[cat_feature].value_counts()
category_counts.plot.bar()
plt.show()

In [None]:
# Print the listing count for every value of the current categorical column.
# The dict keeps its original name, brands_dict, in case other cells read it.
brands_dict = df[cat_feature].value_counts().to_dict()
for key, n_cars in brands_dict.items():
    print(key, '----->', n_cars, 'cars')

## location_region

In [None]:
# Price distribution per region, in a fixed region order.
cat_feature, numerical_feature = 'location_region', 'price_usd'
region_order = ['Минская обл.', 'Гомельская обл.',
                'Витебская обл.', 'Брестская обл.',
                'Могилевская обл.', 'Гродненская обл.']

plt.figure(figsize=(10, 8))
plt.ylim(-1000, 25000)
ax = sns.boxplot(data=df,
                 x=cat_feature,
                 y=numerical_feature,
                 order=region_order,
                 linewidth=1.5)

In [None]:
# Number of listings per value of the current categorical column.
plt.figure(figsize=(10, 5))
category_counts = df[cat_feature].value_counts()
category_counts.plot.bar()
plt.show()

## number_of_photos
Expensive cars presumably have more photos because there is more to show.

In [None]:
# Number of listings per photo count, bars ordered by photo count.
feature = 'number_of_photos'
plt.figure(figsize=(12, 8))
# sort_index() orders the counts by number of photos. No `xticks` are passed:
# on a bar plot the bars sit at positions 0..n-1, so using the photo counts
# themselves as tick positions would misalign ticks and bars.
df[feature].value_counts().sort_index().plot(kind='bar')
plt.xlabel(feature)
plt.ylabel('Number of cars')
plt.show()

If we make a scatter plot to check the correlation between number of photos and price_usd we get this:

In [None]:
# Scatter of photo count (y) against price (x), with small markers.
plt.figure(figsize=(8, 8))
plt.scatter(x=df['price_usd'], y=df['number_of_photos'], s=3)
plt.xlim([0, 75000])
plt.ylim([0, 60])
plt.xlabel('price_usd')
plt.ylabel('number_of_photos')
plt.show()

The scatter plot gives a misleading view of the data. A density plot shows the region that holds the vast majority of the samples.

In [None]:
# 2-D histogram (300 x 300 bins) of price against photo count.
plt.figure(figsize=(10, 10))
plt.hist2d(df['price_usd'],
           df['number_of_photos'],
           bins=(300, 300),
           cmap=plt.cm.jet)
plt.colorbar()
# The binning covers the full data range; the axes are then cropped.
plt.xlim([0, 50000])
plt.ylim([0, 60])
plt.show()

In [None]:
# Finer 2-D histogram (600 x 600 bins), cropped to the densest region.
plt.figure(figsize=(10, 10))
plt.hist2d(df['price_usd'],
           df['number_of_photos'],
           bins=(600, 600),
           cmap=plt.cm.jet)
plt.colorbar()
plt.xlim([0, 20000])
plt.ylim([1, 20])
plt.show()

## duration_listed

In [None]:
# Histogram of duration_listed over its full range, 100 bins.
feature = 'duration_listed'
duration = df[feature]

plt.figure(figsize=(12, 10))
plt.xlim(duration.min(), duration.max())
duration.plot.hist(bins=100, rwidth=0.92, color='red')
plt.title(f"{feature.capitalize()}")
plt.tight_layout()
plt.show()

In [None]:
# Histogram of duration_listed restricted to values below 250, 80 bins.
feature = 'duration_listed'
duration = df[feature]

plt.figure(figsize=(12, 10))
plt.xlim(duration.min(), 250)
duration[duration < 250].plot.hist(bins=80, rwidth=0.9, color='red')
plt.title(f"{feature.capitalize()}")
plt.tight_layout()
plt.show()

## feature_0 - feature_9
These columns indicate whether a particular feature is present in the car, such as an air conditioner, alloy wheels, xenon lamps, etc.

In [None]:
# Describe feature_0 .. feature_9 side by side in a single table.
# This replaces nine copy-pasted per-column cells that skipped feature_2.
# Each column is described on its own and the results are joined column-wise,
# so every column gets the same summary that Series.describe() gives.
feature_columns = [f'feature_{i}' for i in range(10)]
pd.concat({col: df[col].describe() for col in feature_columns}, axis=1)

## Conclusion
That was a pretty basic exploratory analysis of the dataset. 