In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats
# Suppress every warning for the rest of the session. This is a blanket filter:
# it also hides deprecation/runtime warnings raised by pandas, seaborn and scipy.
warnings.filterwarnings("ignore")
# Use matplotlib's "bmh" style sheet for all figures created after this point.
plt.style.use("bmh")

In [None]:
# Read the four per-brand listing files. The tuple order below must match the
# order of the names on the left-hand side of the unpacking assignment.
brand_csv_paths = (
    "/kaggle/input/used-car-dataset-ford-and-mercedes/ford.csv",
    "/kaggle/input/used-car-dataset-ford-and-mercedes/toyota.csv",
    "/kaggle/input/used-car-dataset-ford-and-mercedes/vw.csv",
    "/kaggle/input/used-car-dataset-ford-and-mercedes/bmw.csv",
)
df_ford, df_toyota, df_vw, df_bmw = map(pd.read_csv, brand_csv_paths)

In [None]:
# Normalise the "model" labels by trimming surrounding whitespace (the raw values
# presumably start with a single space, which the later `model == "Focus"` /
# `"Fiesta"` comparisons need removed).
# `.str.strip()` is used instead of slicing off the first character so that the
# cell is idempotent: re-running it no longer eats a real letter, and labels
# without a leading space (or missing values) are left intact.
for frame in (df_ford, df_toyota, df_vw, df_bmw):
    frame["model"] = frame["model"].str.strip()

In [None]:
# Hard-coded GBP -> EUR multiplier; the notebook does not record its date or source.
GBP_TO_EUR_RATE = 1.17


def gbp_to_eur(x):
    """Convert an amount from GBP to EUR by multiplying with GBP_TO_EUR_RATE.

    Works on plain numbers and, element-wise, on pandas Series.
    """
    return x * GBP_TO_EUR_RATE


# Vectorised multiply on the whole column instead of a per-row ``apply``.
# The "price" column is overwritten in place, so running this cell a second
# time would apply the conversion again.
for frame in (df_ford, df_toyota, df_vw, df_bmw):
    frame["price"] = gbp_to_eur(frame["price"])

In [None]:
# Kilometres per mile, as used by the original conversion.
KM_PER_MILE = 1.60934


def miles_to_km(x):
    """Convert a distance from miles to kilometres.

    Works on plain numbers and, element-wise, on pandas Series.
    """
    return x * KM_PER_MILE


# Vectorised multiply on the whole column instead of a per-row ``apply``.
# The column keeps the name "mileage" here even though it now holds kilometres;
# it is overwritten in place, so re-running this cell would convert twice.
for frame in (df_ford, df_toyota, df_vw, df_bmw):
    frame["mileage"] = miles_to_km(frame["mileage"])

In [None]:
# Relabel the distance column on every brand frame so its name matches its unit.
distance_rename = {"mileage": "kilometers"}
for brand_frame in (df_ford, df_toyota, df_vw, df_bmw):
    brand_frame.rename(columns=distance_rename, inplace=True)

In [None]:
# Kilometres-per-litre equivalent of one mile per gallon, as used originally.
# NOTE(review): 0.425144 is the factor for a *US* gallon. Prices in this notebook
# are treated as GBP, so the mpg column is probably per *imperial* gallon, whose
# factor is 0.354006 -- confirm against the dataset description. The value is
# deliberately left unchanged here because later cells filter on absolute kmpl
# thresholds that would have to be adjusted together with it.
KMPL_PER_MPG = 0.425144


def mpg_to_kmpl(x):
    """Convert fuel economy from miles per gallon to kilometres per litre.

    Works on plain numbers and, element-wise, on pandas Series.
    """
    return x * KMPL_PER_MPG


# Vectorised multiply on the whole column instead of a per-row ``apply``.
# The column keeps the name "mpg" here; it is overwritten in place, so
# re-running this cell would convert twice.
for frame in (df_ford, df_toyota, df_vw, df_bmw):
    frame["mpg"] = mpg_to_kmpl(frame["mpg"])

In [None]:
# Relabel the fuel-economy column on every brand frame so its name matches its unit.
economy_rename = {"mpg": "kmpl"}
for brand_frame in (df_ford, df_toyota, df_vw, df_bmw):
    brand_frame.rename(columns=economy_rename, inplace=True)

In [None]:
# Show the first five Ford rows as a quick check of the transformed columns.
df_ford.head(n=5)

In [None]:
# List the distinct fuel-type labels that occur in the Ford listings.
df_ford["fuelType"].unique()

In [None]:
# Preview the Ford frame with its two categorical columns one-hot encoded.
# The encoded frame is only displayed; it is not assigned to any variable.
categorical_columns = ["transmission", "fuelType"]
pd.get_dummies(df_ford, columns=categorical_columns)

In [None]:
# Spearman rank correlation between price and the numeric listing attributes,
# rendered as an annotated heatmap on a fresh 7x6 figure.
numeric_columns = ["price", "kilometers", "tax", "kmpl", "engineSize", "year"]
spearman_corr = df_ford[numeric_columns].corr(method="spearman")

plt.figure(figsize=(7, 6))
sns.heatmap(
    spearman_corr,
    annot=True,
    cmap=plt.cm.RdBu,
    linewidths=0.1,
);

In [None]:
# Count the listings per Ford model once and derive both lists from that single
# result (the counts used to be computed twice).
ford_model_counts = df_ford.model.value_counts()
models_ford_names = ford_model_counts.index.to_list()
models_ford_occurences = ford_model_counts.to_list()

# One-column frame: model name as index, number of listings as "occurences".
df_models_ford = pd.DataFrame({"occurences": models_ford_occurences}, index=models_ford_names)

In [None]:
# Horizontal bar chart of listings per Ford model. Inverting the y-axis puts the
# first row of df_models_ford at the top of the chart instead of the bottom.
barh_options = {"kind": "barh", "figsize": (5, 8), "title": "Ford"}
ax = df_models_ford.plot(**barh_options)
ax.invert_yaxis()

In [None]:
# Histogram (50 bins) of the prices of all Ford Focus listings.
focus_prices = df_ford[df_ford.model == "Focus"]["price"]
focus_prices.plot(
    kind="hist",
    figsize=(12, 4),
    bins=50,
    title="Distribution of Focus prices",
    rwidth=0.8,
);

In [None]:
# Probability plot of the Focus prices against probplot's default theoretical
# distribution, drawn through pyplot. The return value is discarded.
focus_price_sample = df_ford[df_ford.model == "Focus"]["price"]
_ = stats.probplot(focus_price_sample, plot=plt)

In [None]:
# One scatter panel per attribute, each plotted against price (x-axis) for Ford
# Focus listings registered before 2021.
# The row filter does not depend on the loop variable, so it is evaluated once
# here instead of once per panel.
focus_listings = df_ford[(df_ford.year < 2021) & (df_ford.model == "Focus")]
panel_columns = ["kilometers", "tax", "kmpl", "engineSize", "year"]

fig, axs = plt.subplots(len(panel_columns), 1, figsize=(12, 18))
for i, column in enumerate(panel_columns):
    sns.scatterplot(data=focus_listings, x="price", y=column, ax=axs[i])

In [None]:
# Price against registration year for Fiestas with an engine of at most 1.5 and
# between 30,000 and 50,000 km (both bounds exclusive).
fiesta_mid_km = (
    (df_ford.model == "Fiesta")
    & (df_ford.kilometers > 30000)
    & (df_ford.kilometers < 50000)
    & (df_ford.engineSize <= 1.5)
)
df_ford[fiesta_mid_km].plot(x="year", y="price", kind="scatter");

In [None]:
# Price against fuel economy (kmpl) for Fiestas with an engine of at most 1.5 and
# between 30,000 and 50,000 km (both bounds exclusive).
fiesta_mid_km = (
    (df_ford.model == "Fiesta")
    & (df_ford.kilometers > 30000)
    & (df_ford.kilometers < 50000)
    & (df_ford.engineSize <= 1.5)
)
df_ford[fiesta_mid_km].plot(x="kmpl", y="price", kind="scatter");

In [None]:
# Registration year against odometer reading for every Ford listing; the low
# alpha makes overlapping points show up as darker regions.
km_year_scatter = {
    "x": "kilometers",
    "y": "year",
    "figsize": (12, 12),
    "kind": "scatter",
    "alpha": 0.2,
}
df_ford.plot(**km_year_scatter);

In [None]:
# Price against kilometres for two model years of otherwise identical Fiestas
# (manual, petrol, engine size 1.0, fuel economy strictly between 24 and 26 kmpl),
# overlaid on one axes.
plt.figure(figsize=(15, 5))
ax = plt.axes()

# Conditions shared by both series, evaluated once instead of once per series.
fiesta_1l_petrol_manual = (
    (df_ford.model == "Fiesta")
    & (df_ford.transmission == "Manual")
    & (df_ford.fuelType == "Petrol")
    & (df_ford.engineSize == 1.0)
    & (df_ford.kmpl > 24)
    & (df_ford.kmpl < 26)
)

# 2018 in blue, 2019 in red; the label gives each colour a legend entry so the
# figure can be read without looking at the code.
for year, colour in ((2018, "b"), (2019, "r")):
    df_ford[fiesta_1l_petrol_manual & (df_ford.year == year)].plot(
        x="kilometers",
        y="price",
        kind="scatter",
        ax=ax,
        c=colour,
        alpha=0.5,
        label=str(year),
    )

In [None]:
# Manual, petrol, 1.0-engine Fiestas split by model year.
# The four shared conditions are evaluated once; only the year comparison
# differs between the frames.
fiesta_1l_petrol_manual = (
    (df_ford.model == "Fiesta")
    & (df_ford.transmission == "Manual")
    & (df_ford.fuelType == "Petrol")
    & (df_ford.engineSize == 1.0)
)

# df1 -> 2019, df2 -> 2018, df3 -> 2017, df4 -> 2016 (same mapping as before).
df1, df2, df3, df4 = (
    df_ford[fiesta_1l_petrol_manual & (df_ford.year == model_year)]
    for model_year in (2019, 2018, 2017, 2016)
)

In [None]:
# Linear fit of price against kilometres for each model-year subset, overlaid on
# one axes and cropped to the 0-60,000 km / 8,000-20,000 price window.
plt.figure(figsize=(15, 10))
ax = plt.axes()

# df1..df4 are the 2019, 2018, 2017 and 2016 subsets respectively (per their
# definitions in this notebook). Labelling each fit makes the four otherwise
# indistinguishable colour series identifiable in the legend.
yearly_subsets = (("2019", df1), ("2018", df2), ("2017", df3), ("2016", df4))
for year_label, yearly_listings in yearly_subsets:
    sns.regplot(data=yearly_listings, x="kilometers", y="price", ax=ax, label=year_label)

ax.set_xlim(0, 60000)
ax.set_ylim(8000, 20000)
ax.legend(title="year");