In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import os
import warnings
# Notebook-wide: hide every warning raised by the cells below.
warnings.filterwarnings('ignore')

# List every file available under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
# Location of the raw listings file inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/usa-cers-dataset/USA_cars_datasets.csv'

# Load the listings and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Print a concise summary of the frame: columns, dtypes and non-null counts.
df.info()

In [None]:
# Display descriptive statistics for the frame (pandas' default column
# selection applies, since no include/exclude arguments are passed).
df.describe()

In [None]:
# Remove columns the analysis below never reads.
# `errors='ignore'` keeps this cell re-runnable: executing it a second time no
# longer raises KeyError because the columns are already gone.
df = df.drop(columns=['Unnamed: 0', 'lot', 'vin', 'condition'], errors='ignore')

# Discard listings priced at 100 or less with a single vectorised filter
# instead of dropping one row per loop iteration. The mask is negated (rather
# than testing `> 100`) so rows with a missing price are kept, exactly as the
# row-by-row version did. Original index labels are preserved.
df = df.loc[~df['price'].le(100)].copy()

In [None]:
# Horizontal bar chart: number of listings per brand, with the combined share
# of the five most common brands called out next to the bars.

# `rename_axis(...).reset_index(name=...)` yields the columns ('brand', 'count')
# on every pandas version. Renaming {'index': 'brand', 'brand': 'count'} only
# works on pandas < 2.0; from 2.0 on, `value_counts()` already names its result
# 'count', so that rename produces two 'count' columns and no 'brand' column.
brand_count = df['brand'].value_counts().rename_axis('brand').reset_index(name='count')
brand_count['percentages'] = brand_count['count'] / brand_count['count'].sum() * 100

# value_counts() sorts descending, so the first five rows are the top brands.
top5_share = round(brand_count['percentages'].head(5).sum(), 2)

fig = plt.figure(figsize=(10, 8))
axes = fig.add_axes([0, 0, 1, 1])

sns.barplot(y=brand_count['brand'], x=brand_count['count'], color='#50c878', ax=axes)

# Style through the Axes object rather than the pyplot state machine.
axes.tick_params(axis='y', labelcolor='black', labelsize=14)
axes.set_xticks([])
axes.set_xlabel('')
axes.set_ylabel('')
axes.spines[['right', 'top', 'bottom']].set_visible(False)

# Write each bar's count just past its right-hand end.
for bar in axes.patches:
    bar_length = bar.get_width()
    axes.annotate('{:.0f}'.format(bar_length),
                  (50 + bar_length, bar.get_y() + bar.get_height() * 0.8),
                  ha='center', fontsize=16, color='black')

# Dotted separator below the fifth bar. `xmin`/`xmax` of axhline are fractions
# of the axes width (0-1), not data coordinates, so the full width is xmax=1.
axes.axhline(4.51, linestyle=':', color='goldenrod', xmin=0, xmax=1, lw=4)

axes.text(700, 2.5,
          f'Top 5 brands by count - \n{top5_share}% of the total',
          fontweight='bold', color='black', fontsize=16)
axes.text(-300, -2.2, 'Number of cars by brand', color='black', fontsize=24, fontweight='bold')

# plt.show() renders on the inline backend; Figure.show() only emits a
# "non-GUI backend" warning there.
plt.show()

In [None]:
# Violin + swarm plot of listing prices, annotated with the share of cars in
# the 5k-25k band and the details of the most / least expensive listing.

# Share of listings priced between 5000 and 25000 (both bounds inclusive),
# computed from one boolean mask instead of chained frame indexing.
in_mid_range = df['price'].between(5000, 25000)
mid_range_share = round(int(in_mid_range.sum()) / len(df) * 100)

# Sort once per direction and keep the first row; previously the whole frame
# was re-sorted for every single field interpolated into the annotations.
priciest = df.sort_values(by='price', ascending=False).iloc[0]
cheapest = df.sort_values(by='price').iloc[0]

fig = plt.figure(figsize=(15, 5))
axes = fig.add_axes([0, 0, 1, 1])

sns.violinplot(x=df['price'], color='#50c878', ax=axes)
sns.swarmplot(x=df['price'], color='black', ax=axes)

axes.spines[['right', 'top', 'left']].set_visible(False)
axes.set_xlabel('Price', color='black', fontsize=16)
axes.set_xticks(range(0, 100000, 5000))

axes.text(65000, -0.4, f"~ {mid_range_share}% cars cost in range 5k - 25k",
          fontsize=16, color='black', fontweight='bold')

axes.text(65000, -0.3, 'Most expensive car is:', fontsize=16, color='black', fontweight='bold')
axes.text(65000, -0.1,
          f'''{priciest['brand']} {priciest['model']},
registration year - {priciest['year']},
color - {priciest['color']}, mileage - {priciest['mileage']},
price - {priciest['price']}.''',
          fontsize=16, color='black')

axes.text(65000, 0.1, 'Least expensive car is:', fontsize=16, color='black', fontweight='bold')
axes.text(65000, 0.3,
          f'''{cheapest['brand']} {cheapest['model']},
registration year - {cheapest['year']},
color - {cheapest['color']}, mileage - {cheapest['mileage']},
price - {cheapest['price']}''',
          fontsize=16, color='black')

axes.text(-10000, -0.5, 'Price distribution', color='black', fontsize=24, fontweight='bold')

# plt.show() renders on the inline backend; Figure.show() only emits a
# "non-GUI backend" warning there.
plt.show()

In [None]:
# Horizontal bar chart of the highest listing price per brand, with the three
# brands that own the most expensive cars listed beside the bars.

# One row per brand, ordered from the highest maximum price downwards.
max_price_brand = (
    df.groupby('brand')['price'].max()
    .to_frame().reset_index()
    .sort_values(by='price', ascending=False)
)
# Not used in this cell; kept because other cells may rely on it.
mean_price_brand = (
    df.groupby('brand')['price'].mean()
    .to_frame().reset_index()
    .sort_values(by='price', ascending=False)
)

# The frame is already sorted, so the leading rows are the priciest brands.
# Taking them with head() avoids re-sorting per bullet and also copes with a
# dataset that has fewer than three brands.
top_brands = max_price_brand['brand'].head(3).tolist()
top_brands_text = (
    '          Brands of the 3 most expensive cars:\n'
    + ';\n'.join(f'          - {brand}' for brand in top_brands)
    + '.'
)

fig = plt.figure(figsize=(15, 8))
axes = fig.add_axes([0, 0, 1, 1])

sns.barplot(y=max_price_brand['brand'], x=max_price_brand['price'], color='#50c878', ax=axes)

# Style through the Axes object rather than the pyplot state machine.
axes.tick_params(axis='y', labelcolor='black', labelsize=14)
axes.set_xticks([])
axes.set_xlabel('')
axes.set_ylabel('')
axes.spines[['right', 'top', 'bottom']].set_visible(False)

# Write each bar's price just past its right-hand end.
for bar in axes.patches:
    bar_length = bar.get_width()
    axes.annotate('{:.0f}'.format(bar_length),
                  (3500 + bar_length, bar.get_y() + bar.get_height() * 0.8),
                  ha='center', fontsize=16, color='black')

axes.text(77000, 6, top_brands_text, color='black', fontsize=16, fontweight='bold')
axes.text(-10000, -2.2, 'Maximum price by brands', color='black', fontsize=24, fontweight='bold')

# plt.show() renders on the inline backend; Figure.show() only emits a
# "non-GUI backend" warning there.
plt.show()

In [None]:
# Horizontal bar chart of the lowest listing price per brand, with the three
# brands that own the cheapest cars listed beside the bars.

# One row per brand, ordered from the highest minimum price downwards.
min_price_brand = (
    df.groupby('brand')['price'].min()
    .to_frame().reset_index()
    .sort_values(by='price', ascending=False)
)

# The cheapest brands sit at the end of the descending frame; read them from
# the tail (cheapest first) instead of hard-coding positions 27/26/25, which
# silently assumed the data contains exactly 28 brands.
cheapest_brands = min_price_brand['brand'].tail(3).tolist()[::-1]
cheapest_brands_text = (
    '          Brands of the 3 least expensive cars:\n'
    + ';\n'.join(f'          - {brand}' for brand in cheapest_brands)
    + '.'
)

fig = plt.figure(figsize=(15, 8))
axes = fig.add_axes([0, 0, 1, 1])

sns.barplot(y=min_price_brand['brand'], x=min_price_brand['price'], color='#50c878', ax=axes)

# Style through the Axes object rather than the pyplot state machine.
axes.tick_params(axis='y', labelcolor='black', labelsize=14)
axes.set_xticks([])
axes.set_xlabel('')
axes.set_ylabel('')
axes.spines[['right', 'top', 'bottom']].set_visible(False)

# Write each bar's price just past its right-hand end.
for bar in axes.patches:
    bar_length = bar.get_width()
    axes.annotate('{:.0f}'.format(bar_length),
                  (1500 + bar_length, bar.get_y() + bar.get_height() * 0.8),
                  ha='center', fontsize=14, color='black')

axes.text(40000, 5, cheapest_brands_text, color='black', fontsize=16, fontweight='bold')
axes.text(-10000, -2.2, 'Minimum price by brands', color='black', fontsize=24, fontweight='bold')

# plt.show() renders on the inline backend; Figure.show() only emits a
# "non-GUI backend" warning there.
plt.show()


In [None]:
# Strip plot: every individual listing price, grouped by brand.
fig = plt.figure(figsize=(15, 5))
axes = fig.add_axes([0, 0, 1, 1])

sns.stripplot(x=df['brand'], y=df['price'], ax=axes,
              color='#50c878', size=6, linewidth=1, edgecolor='black')

axes.spines[['left', 'top', 'right']].set_visible(False)

axes.set_xlabel('Brand', color='black', fontsize=16)
axes.set_ylabel('Price', color='black', fontsize=16)

# Tick styling goes through the Axes object, so it does not depend on which
# axes pyplot currently considers "active".
axes.tick_params(axis='x', labelrotation=55, labelsize=14, labelcolor='black')
axes.tick_params(axis='y', labelsize=14, labelcolor='black')

# Title placed in data coordinates, above the plotted points.
axes.text(-3, 105000, 'Price by brands', color='black', fontsize=24, fontweight='bold')

# plt.show() renders on the inline backend; Figure.show() only emits a
# "non-GUI backend" warning there.
plt.show()

In [None]:
# Density histogram of mileage with a KDE overlay, annotated with the share
# of cars that have at most 50k on the odometer.

# Share of listings with mileage <= 50000, from a single boolean mask.
low_mileage_share = round(int((df['mileage'] <= 50000).sum()) / len(df) * 100)

fig = plt.figure(figsize=(15, 5))
axes = fig.add_axes([0, 0, 1, 1])

# `sns.distplot` is deprecated and scheduled for removal; histplot + kdeplot
# are its documented replacements. stat='density' puts the bars on the same
# scale as the KDE curve, which is what distplot drew when a KDE was shown.
# Both calls receive `ax=axes` explicitly instead of relying on pyplot's
# current axes.
sns.histplot(x=df['mileage'], bins=80, stat='density', color='#50c878',
             alpha=0.9, edgecolor='black', linewidth=3, ax=axes)
sns.kdeplot(x=df['mileage'], color='goldenrod', linewidth=3, ax=axes)

axes.set_xlabel('Mileage', color='black', fontsize=16)
axes.set_ylabel('')

axes.text(280000, 0.000015,
          f'''~ {low_mileage_share} % cars have mileage in range 0 - 50k''',
          fontsize=16, color='black', fontweight='bold')

axes.set_xticks(range(0, 1100000, 50000))
axes.set_yticks([])

axes.text(-110000, 0.000023, 'Mileage', color='black', fontsize=24, fontweight='bold')

axes.spines[['right', 'top', 'left']].set_visible(False)

# plt.show() renders on the inline backend; Figure.show() only emits a
# "non-GUI backend" warning there.
plt.show()

In [None]:
# Scatter plot of price against mileage.
fig = plt.figure(figsize=(15, 5))
axes = fig.add_axes([0, 0, 1, 1])

# Pass the target axes explicitly rather than relying on pyplot's notion of
# the current axes.
sns.scatterplot(x=df['mileage'], y=df['price'], color='#50c878', ax=axes)

axes.spines[['right', 'top', 'left']].set_visible(False)

axes.set_xticks(range(0, 1100000, 50000))
axes.set_yticks(range(0, 100000, 10000))

axes.set_xlabel('Mileage', color='black', fontsize=16)
axes.set_ylabel('Price', color='black', fontsize=16)

axes.text(300000, 80000, 'We observe not strong inverse relationship of price and mileage.',
          fontsize=16, color='black', fontweight='bold')
axes.text(-110000, 105000, 'Price by mileage', color='black', fontsize=24, fontweight='bold')

# plt.show() renders on the inline backend; Figure.show() only emits a
# "non-GUI backend" warning there.
plt.show()

In [None]:
# Line chart of the number of listings per registration year, annotated with
# the 2017-2019 share and the details of the oldest listed car.

# `rename_axis(...).reset_index(name=...)` yields the columns ('year', 'count')
# on every pandas version. Renaming {'index': 'year', 'year': 'count'} only
# works on pandas < 2.0; from 2.0 on, `value_counts()` already names its result
# 'count', so that rename produces two 'count' columns and no 'year' column.
year_count = df['year'].value_counts().rename_axis('year').reset_index(name='count')
year_count['percentages'] = year_count['count'] / year_count['count'].sum() * 100

# The annotation talks about 2017-2019, so sum exactly those years instead of
# assuming they happen to be the three most frequent ones.
is_recent = year_count['year'].between(2017, 2019)
recent_share = round(float(year_count.loc[is_recent, 'percentages'].sum()))

# Sort once and keep the first row; previously the whole frame was re-sorted
# for every field interpolated into the annotation.
oldest = df.sort_values(by='year').iloc[0]
first_year = int(year_count['year'].min())
last_year = int(year_count['year'].max())

fig = plt.figure(figsize=(15, 5))
axes = fig.add_axes([0, 0, 1, 1])

sns.lineplot(x=year_count['year'], y=year_count['count'], ax=axes, color='#50c878', lw=3)
axes.scatter(x=year_count['year'], y=year_count['count'], color='black', lw=6)

axes.set_xlabel('Year', color='black', fontsize=16)
axes.set_ylabel('Count', color='black', fontsize=16)

axes.spines[['right', 'top']].set_visible(False)

# range() excludes its stop value, so add 1 to give the latest year a tick.
axes.set_xticks(range(first_year, last_year + 1))
axes.tick_params(axis='x', labelrotation=45)
axes.set_yticks(range(0, int(year_count['count'].max()), 50))

axes.text(1990, 770,
          f"~ {recent_share} % cars were registered in the period 2017-2019\n"
          "          \n"
          f"First car was registred in {df['year'].min()}. This car is\n"
          f"{oldest['color']} {oldest['brand']} {oldest['model']} from {oldest['state']}\n"
          f"and it's priced {oldest['price']}.",
          fontsize=16, color='black', fontweight='bold')

axes.text(1966, 1050, 'Number of cars by year', color='black', fontsize=24, fontweight='bold')

# plt.show() renders on the inline backend; Figure.show() only emits a
# "non-GUI backend" warning there.
plt.show()

In [None]:
# Scatter plot of price against registration year.
fig = plt.figure(figsize=(15, 5))
axes = fig.add_axes([0, 0, 1, 1])

# Pass the target axes explicitly rather than relying on pyplot's notion of
# the current axes.
sns.scatterplot(x=df['year'], y=df['price'], color='#50c878', ax=axes)

axes.spines[['right', 'top', 'left']].set_visible(False)

# Derive the tick range from `df` itself so this cell does not depend on
# `year_count` having been built by another cell. range() excludes its stop
# value, so add 1 to give the latest year a tick as well.
first_year = int(df['year'].min())
last_year = int(df['year'].max())
axes.set_xticks(range(first_year, last_year + 1))
axes.tick_params(axis='x', labelrotation=45)
axes.set_yticks(range(0, 100000, 10000))

axes.set_xlabel('Year', color='black', fontsize=16)
axes.set_ylabel('Price', color='black', fontsize=16)

axes.text(1985, 80000, 'We observe not strong relationship of price and year.',
          fontsize=16, color='black', fontweight='bold')
axes.text(1966, 105000, 'Price by year', color='black', fontsize=24, fontweight='bold')

# plt.show() renders on the inline backend; Figure.show() only emits a
# "non-GUI backend" warning there.
plt.show()