In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

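For this work, we use the Volkswagen (`vw.csv`) and BMW (`bmw.csv`) used-car listings to explore the data and build a model that predicts a car's price.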

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme so the figures produced below share one consistent style.
sns.set_theme()

## Loading the dataset and doing basic EDA

#### Loading the BMW and VW datasets

In [None]:
# Load the two per-brand CSV files from the Kaggle input directory.
vm = pd.read_csv("/kaggle/input/used-car-dataset-ford-and-mercedes/vw.csv")
bmw = pd.read_csv("/kaggle/input/used-car-dataset-ford-and-mercedes/bmw.csv")

# Tag every row with its brand so the rows can still be told apart after stacking.
# NOTE(review): the label "VM" (the file is vw.csv) is kept unchanged because
# later plots and titles display it.
vm["Car_class"] = "VM"
bmw["Car_class"] = "BMW"

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each frame keeps its own row labels, so the labels overlap and label-based
# operations (.loc, reindexing, index alignment) become ambiguous or fail.
dataset = pd.concat([vm, bmw], ignore_index=True)

# Age in years relative to 2020 -- presumably the year the listings were
# collected; TODO confirm against the dataset description.
dataset["car_age"] = 2020 - dataset["year"]

In [None]:
# Preview the first five rows of the combined frame.
dataset.head(n=5)

#### Check whether the dataset contains null values

In [None]:
# True if at least one cell anywhere in the frame is missing.
dataset.isna().to_numpy().any()

In [None]:
# Number of missing cells in each column.
dataset.isna().sum(axis=0)

The dataset doesn't contain any null values.

#### Data Exploration

In [None]:
# Number of cars per age, coloured by transmission, with one panel per brand.
sns.catplot(
    data=dataset,
    kind="count",
    x="car_age",
    hue="transmission",
    col="Car_class",
    height=6,
    aspect=2,
)

In [None]:
# Horizontal bar chart: number of listings for each model in the `vm` frame.
sns.countplot(data=vm, y="model").set_title("VM cars by Model")

In [None]:
# Horizontal bar chart: number of listings for each model in the `bmw` frame.
sns.countplot(data=bmw, y="model").set_title("BMW cars by Model")

In [None]:
# Pairwise relationships between the numeric columns, coloured by brand.
sns.pairplot(data=dataset, hue="Car_class")

In [None]:
# Correlation heatmap; "year" is dropped because car_age is derived from it.
data = dataset.drop(columns=["year"])

# Restrict to numeric columns before computing correlations. The frame still
# holds string columns (model, transmission, fuelType, Car_class), and since
# pandas 2.0 DataFrame.corr() no longer drops them silently -- it raises
# "ValueError: could not convert string to float".
sns.heatmap(data.select_dtypes(include="number").corr(), annot=True)

#### Explore relationship between Price and Mileage given transmission and fuel type

From the relational chart shown below, it can be seen that there is a negative relationship between price and mileage. As mileage increases, price reduces.

In [None]:
# Price against mileage for the `vm` frame: one column of panels per
# transmission, one row per fuel type, points coloured by model.
sns.relplot(
    data=vm,
    x="mileage",
    y="price",
    hue="model",
    row="fuelType",
    col="transmission",
)

In [None]:
# Price against mileage for the `bmw` frame: one column of panels per
# transmission, one row per fuel type, points coloured by model.
sns.relplot(
    data=bmw,
    x="mileage",
    y="price",
    hue="model",
    row="fuelType",
    col="transmission",
)

In [None]:
# Price against mileage for both brands together, coloured by car age.
plt.figure(figsize=(15, 10))
sns.scatterplot(x="mileage", y="price", hue="car_age", data=dataset).set(title="Mileage per price")

## Data Modelling
#### Data Preparation

Convert the categorical columns to numeric codes using `LabelEncoder`, then split the data into training and test sets.

In [None]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from time import time
from sklearn import preprocessing

# Label-encode only the non-numeric (categorical) columns. Applying the encoder
# to every column would also replace numeric values such as price, mileage and
# car_age with ordinal codes, so the model would be fit on category indices
# instead of real values, and car_age could no longer be mapped back to a year.
le = preprocessing.LabelEncoder()
dataset_trans = dataset.copy()
for column in dataset.select_dtypes(exclude="number").columns:
    # The encoder is re-fitted for each column; fit_transform returns a plain
    # array, so no index alignment is involved in the assignment.
    dataset_trans[column] = le.fit_transform(dataset[column])

print(dataset_trans.head())

# "year" is excluded from the features because car_age is computed from it;
# "price" is the regression target.
features = dataset_trans.drop(columns=["year", "price"])
target = dataset_trans["price"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)
print(f"feature_train: {feature_train.shape} \n target_train:{target_train.shape} \n feature_test:{feature_test.shape} \n target_test:{target_test.shape}  ")


#### Ordinary Least Squares

In [None]:
# Fit an ordinary least squares model, timing training and prediction separately.
ols = linear_model.LinearRegression()

olsfit = time()
ols.fit(feature_train, target_train)
olsfitT = round(time() - olsfit, 3)

olspred = time()
ols_price_pred = ols.predict(feature_test)
olspredT = round(time() - olspred, 3)

# Evaluate on the held-out split. LinearRegression.score also reports R^2,
# so ols_score is expected to match ols_r2.
ols_coef = ols.coef_
ols_score = ols.score(feature_test, target_test)
ols_r2 = r2_score(target_test, ols_price_pred)
ols_mse = mean_squared_error(target_test, ols_price_pred)

print(
    f"Mean Squared Error:{ols_mse}, R squared error: {ols_r2}, Score: {ols_score}, "
    f"Train time: {olsfitT}, Prediction time: {olspredT}"
)




In [None]:
# Collect the actual and predicted prices of the test rows next to the model
# year recovered from car_age (car_age was computed as 2020 - year).
ols_rst = pd.DataFrame({"year": 2020 - feature_test["car_age"]}).assign(
    price=target_test,
    predicted=ols_price_pred,
)

# NOTE(review): the title below contains a typo ("Ordinarl"); it is reproduced unchanged here.
ols_rst.plot(
    x="year",
    y=["price", "predicted"],
    figsize=(25, 15),
    grid=True,
    title="Predicted vs Actual using Ordinarl Least square",
)
