# Loading Data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))
        
        
import warnings
# Silence every warning category for the rest of the session. Note that this
# hides all warnings emitted by later cells, including library deprecation notices.
warnings.filterwarnings("ignore")
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.gridspec as grid_spec
import seaborn as sns

# Handle Null data

Many null values in:
* 'Vehicle_version', 
* 'Vehicle_generation', 
* 'CO2_emissions', 
* 'Drive',
* 'Origin_country', 
* 'First_owner',
* 'First_registration_date'

These columns will be dropped before prediction.

Some null values in:
* 'Mileage_km', 
* 'Power_HP', 
* 'Displacement_cm3', 
* 'Transmission', 
* 'Doors_number'

Rows that contain a null value will be deleted for the EDA.


In [None]:
# Load the advert CSV from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/poland-cars-for-sale-dataset/Car_sale_ads.csv')

# Heatmap of the boolean null mask (rows x columns of df) to show where values are missing.
plt.figure(dpi=150, figsize=(9, 3))
sns.heatmap(data=df.isna())

In [None]:
# Discard the columns with many nulls, then every row that still contains a null.
df_car = (
    df
    .drop(columns=['Vehicle_version', 'Vehicle_generation', 'CO2_emissions', 'Drive',
                   'Origin_country', 'First_owner', 'First_registration_date'])
    .dropna(axis=0)
)
# Sanity check: total number of nulls remaining in the cleaned frame.
print("Count of null : ", df_car.isnull().sum().sum())

In [None]:
# Add a parsed timestamp column (object -> datetime64, day-first input) and a
# 'YYYY-MM' key derived from it.
df_car = df_car.assign(
    date=pd.to_datetime(df_car['Offer_publication_date'], dayfirst=True),
    year_month=lambda frame: frame['date'].dt.strftime('%Y-%m'),
)

# Report the time span covered by the adverts.
first_offer, last_offer = df_car['date'].min(), df_car['date'].max()
print('Offer publication date from :', first_offer, "// to :", last_offer, "// format : yyyy-mm-dd hh:mm:ss")

# Currency (PLN & EUR) to USD

246 offers are priced in euros (EUR); the rest are priced in Polish złoty (PLN).

In [None]:
# Currency codes other than PLN, and the number of offers that use them.
non_pln_currency = df_car.loc[df_car['Currency'] != 'PLN', 'Currency']
print(non_pln_currency.unique(), non_pln_currency.count())

Average currency rates (as of May 17, 2021):

USD per 1 Polish Zloty (PLN)
* Mar 0.258995 – 31 days
* Apr 0.261873 – 30 days
* May 0.265475 – 17 days

USD per 1 Euro (EUR)
* Mar 1.191048 – 31 days
* Apr 1.195110 – 30 days
* May 1.209037 – 17 days



In [None]:
# Conversion rates to USD, one (currency, month, rate) record per row.
Curr_rate = pd.DataFrame(
    [
        ('PLN', '2021-03', 0.258995),
        ('PLN', '2021-04', 0.261873),
        ('PLN', '2021-05', 0.265475),
        ('EUR', '2021-03', 1.191048),
        ('EUR', '2021-04', 1.195110),
        ('EUR', '2021-05', 1.209037),
    ],
    columns=['Currency', 'year_month', 'To_USD'],
)

# Attach the matching rate to every offer; the left join keeps offers that have no rate row.
df_car_USD = df_car.merge(Curr_rate, on=['Currency', 'year_month'], how='left')
# Listed price converted to USD, rounded to two decimals.
df_car_USD['Price_USD'] = (df_car_USD['Price'] * df_car_USD['To_USD']).round(2)

# Top 20 Brands

In [None]:
# Advert counts for the 20 most frequent brands, re-sorted smallest-first.
brand_counts = df_car_USD['Vehicle_brand'].value_counts()
many_sell = brand_counts.head(20).sort_values()

In [None]:
# Horizontal bar chart of the brand counts in `many_sell`, with each brand name
# written inside its bar and the ten largest bars highlighted in two colour tiers.
fig = plt.figure(figsize=(6, 6), dpi=150)
gs = fig.add_gridspec(1, 1)
gs.update(wspace=0.05, hspace=0.27)
ax0 = fig.add_subplot(gs[0, 0])

background_color = "#f7f5f7"  # hex colour code - https://www.hexcolortool.com/
high_c = '#133b81'
mid_c = '#57a2ff'
plt.rcParams["font.family"] = "monospace"

fig.patch.set_facecolor(background_color)  # figure background color
ax0.set_facecolor(background_color)

data = many_sell

# Start with all 20 bars in the same neutral grey, then recolour the top tiers.
# `data` is sorted ascending, so positions 15-19 are the five largest counts.
color_map = ['#e7e9e7' for _ in range(20)]
color_map[15:20] = [high_c] * 5  # five largest bars
color_map[10:15] = [mid_c] * 5   # next five bars

ax0.barh(data.index, data,
         edgecolor='darkgray', color=color_map)

# Label styling per tier: (bar positions, font weight, font size, text colour).
label_styles = [
    (range(0, 10), 'light', 6, 'gray'),
    (range(10, 15), 'light', 7, 'black'),
    (range(15, 20), 'bold', 8, 'white'),
]
for positions, weight, size, text_color in label_styles:
    for i in positions:
        # Positional access via .iloc: the Series is indexed by brand name, so
        # data[i] would depend on the deprecated integer-key fallback.
        value = data.iloc[i]
        ax0.annotate(data.index[i],  # label text is the brand name
                     xy=(value - (value * 0.01), i),  # 1% left of the bar end, at bar position i
                     va='center', ha='right', fontweight=weight, fontfamily='monospace',
                     fontsize=size, color=text_color, rotation=0)

plt.xticks(fontsize=5)
ax0.axes.get_yaxis().set_ticks([])  # brand names are drawn inside the bars instead

for s in ['top', 'bottom', 'right']:
    ax0.spines[s].set_visible(False)

# Title and subtitle sit above the bars, which occupy y positions 0..19.
ax0.text(0, 21, 'The most advertised brand in the used car market', fontfamily='sans-serif', fontsize=12, fontweight='bold', color='#323232')
ax0.text(0, 20.45, 'Poland, from Mar 26 to May 5 2021', fontfamily='sans-serif', fontsize=8, fontweight='light', color='#303030')

plt.show()

In [None]:
# Reverse the index of many_sell and keep its first ten brand names.
Brand_top10 = many_sell.index[::-1][:10].tolist()

In [None]:
# Keep only the adverts whose brand is in Brand_top10. The explicit copy gives an
# independent frame, so adding a column below is not an assignment on a slice of df_car_USD.
Price_dist = df_car_USD[df_car_USD['Vehicle_brand'].isin(Brand_top10)].copy()
# Ordered categorical that follows the order of Brand_top10, so the sort below
# arranges rows in that order instead of alphabetically.
Price_dist['Vehicle_brand_cat'] = pd.Categorical(Price_dist['Vehicle_brand'], categories=Brand_top10, ordered=True)
Price_dist = Price_dist.sort_values('Vehicle_brand_cat')

# Price Distribution of the Top 10 Brands

In [None]:
sns.set_theme(style="ticks")

# Single axes with a logarithmic price scale.
f, ax = plt.subplots(figsize=(6, 5), dpi=150)
ax.set_xscale("log")

# One horizontal box per brand; whis=[0, 100] stretches the whiskers to the 0th and 100th percentiles.
sns.boxplot(data=Price_dist, x="Price_USD", y="Vehicle_brand",
            whis=[0, 100], width=0.5, palette="vlag", ax=ax)

# Overlay every individual observation as a tiny grey point.
sns.stripplot(data=Price_dist, x="Price_USD", y="Vehicle_brand",
              size=0.3, color="0.5", linewidth=0, ax=ax)

# Cosmetics: smaller tick labels, no axis titles, vertical grid lines, trimmed spines.
plt.xticks(fontsize=7)
ax.set_xlabel("")
ax.set_ylabel("")
ax.xaxis.grid(True)
sns.despine(trim=True, left=True)

# Correlation

Price is correlated with:
1. Horse power (which is itself highly correlated with Displacement_cm3)
2. Production year

In [None]:
sns.set_theme(style="white")

# Compute the correlation matrix over numeric/boolean columns only. Price_dist still
# carries string, datetime and categorical columns, and DataFrame.corr() in recent
# pandas versions no longer drops those silently, so they are filtered out explicitly.
numeric_cols = (
    Price_dist
    .drop(['Price', 'Index', 'To_USD'], axis=1)
    .select_dtypes(include=['number', 'bool'])
)
corr = numeric_cols.corr()

# Generate a mask for the upper triangle (diagonal included) so each pair is shown once
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
# NOTE(review): this palette is built but never passed on; the heatmap below uses "vlag".
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap="vlag", vmax=1, center=0,
            annot=True, square=True, linewidths=0.5, cbar_kws={"shrink": .5}, ax=ax)

# Plot : Horse Power vs. Price

In [None]:
sns.set_theme(style="ticks")

# One panel per brand (two panels per row), each with its own linear regression
# line and no confidence band.
plot = sns.lmplot(
    data=Price_dist,
    x="Power_HP", y="Price_USD",
    col="Vehicle_brand", hue="Vehicle_brand",
    col_wrap=2, height=4, palette="muted", ci=None,
    scatter_kws=dict(s=50, alpha=1),
)

# Limit the visible range: y to hide the BMW outlier, x to hide the Volkswagen outlier.
# This only crops the view; the regression above was fitted on all rows of Price_dist.
plot.set(ylim=(0, 600000), xlim=(0, 800))

plt.show()

# Plot : Production Year vs. Price

In [None]:
sns.set_theme(style="ticks")

# One panel per brand (two panels per row), each with its own linear regression
# line and no confidence band.
plot = sns.lmplot(
    data=Price_dist,
    x="Production_year", y="Price_USD",
    col="Vehicle_brand", hue="Vehicle_brand",
    col_wrap=2, height=4, palette="muted", ci=None,
    scatter_kws=dict(s=50, alpha=1),
)

# Limit the visible price range to hide the BMW and Mercedes-Benz outliers.
# This only crops the view; the regression above was fitted on all rows of Price_dist.
plot.set(ylim=(0, 500000))

plt.show()