In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Car Auto Price

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from statsmodels.api import Logit
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load the auto-price CSV from the Kaggle input directory and preview the first rows.
csv_path = "/kaggle/input/autoprice/dataset_2193_autoPrice.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Print the frame summary (column dtypes and non-null counts) to check for missing values.
data.info()

The dataset is complete.

## Correlations

In [None]:
# Annotated heatmap of the pairwise correlations between every column of the
# raw dataset (predictors and target alike), on a fixed [-1, 1] colour scale.
plt.figure(figsize=(10, 6))

full_corr = data.corr()
heatmap = sns.heatmap(full_corr, annot=True, cmap='viridis', vmin=-1, vmax=1)
heatmap.set_title('Correlation Heatmap', pad=12, fontdict={'fontsize': 12})

plt.show()

Several attributes are correlated with one another, and some are correlated with the target `class`. Let's check whether it is worthwhile to apply PCA to the predictors.

In [None]:
# Predictors: every column except the last one. Target: the 'class' column,
# kept as a single-column DataFrame.
feature_columns = list(data.columns)[:-1]
Predictors = data[feature_columns]
Target = data[['class']]

In [None]:
# Eigen-analysis of the predictors' correlation matrix: each eigenvalue is the
# variance carried by one principal component of the standardised predictors.
corr_mat = Predictors.corr()

# A correlation matrix is symmetric, so use the symmetric solver. It returns
# real eigenvalues in ascending order, whereas the general `eig` solver gives
# no ordering guarantee at all.
eig_vals, eig_vectors = np.linalg.eigh(corr_mat.to_numpy())

# Put the components in descending order of variance; the cumulative curve
# below is only meaningful when the largest components come first.
order = np.argsort(eig_vals)[::-1]
eig_vals = eig_vals[order]
eig_vectors = eig_vectors[:, order]

total_sum = eig_vals.sum()
var_exp = [(value / total_sum) * 100 for value in eig_vals]
cum_var_exp = np.cumsum(var_exp)

# Derive the number of bars from the data instead of hard-coding it.
n_components = len(eig_vals)

with plt.style.context('dark_background'):
    plt.figure(figsize=(16,9))
    plt.bar(range(n_components), cum_var_exp, label='Cummulative Variance Explained for components')
    plt.xlabel('Principal Components')
    plt.ylabel('Cummulative Variance Explained')
    plt.legend(loc=(0,0.9))
    # Write the cumulative percentage just above each bar.
    for i in range(n_components):
        plt.text(x = i-0.5 , y = cum_var_exp[i]+1, s = '{:.2f}'.format(cum_var_exp[i]), size = 12)


We will apply PCA to the predictors to reduce the dimensionality from 15 to 9 components while keeping 96.25% of the information (variance).

In [None]:
# Standardise every predictor (zero mean, unit variance) before projecting.
# The choice of 9 components was made from the eigenvalues of the correlation
# matrix, which describes standardised features. scikit-learn's PCA only
# centres its input, so fitting it on the raw columns would instead be driven
# by whichever predictors have the largest numeric scale.
Predictors_scaled = (Predictors - Predictors.mean()) / Predictors.std(ddof=0)

acp = PCA(n_components=9)
PCA_Predictors = acp.fit_transform(Predictors_scaled)

# NOTE(review): the scaling statistics and the projection are computed from
# every row, including rows that are later held out as the test split. Fit
# both on the training rows only if an unbiased test score is required.
# Re-run the downstream cells after this change: their metrics depend on the
# projection.

In [None]:
# Wrap the PCA output in a DataFrame and preview the first five rows.
PCA_Predictors = pd.DataFrame(data=PCA_Predictors)
PCA_Predictors.head(n=5)

This is the dataset that will be studied.

In [None]:
# Annotated heatmap of the pairwise correlations between the PCA components,
# on a fixed [-1, 1] colour scale.
plt.figure(figsize=(10, 6))

pca_corr = PCA_Predictors.corr()
heatmap = sns.heatmap(pca_corr, annot=True, cmap='viridis', vmin=-1, vmax=1)
heatmap.set_title('Correlation Heatmap', pad=12, fontdict={'fontsize': 12})

plt.show()

## Regression

### Train Test Split

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the shuffled
# split reproducible.
X_tr, X_tst, Y_tr, Y_tst = train_test_split(
    PCA_Predictors,
    Target,
    test_size=0.25,
    random_state=0,
    shuffle=True,
)

### K Nearest Neighbours

In [None]:
# K-nearest-neighbours regressor with scikit-learn's default settings.
KNN_m = KNeighborsRegressor()
KNN_m.fit(X_tr, Y_tr)

# R^2 on the held-out split.
knn_r2 = KNN_m.score(X_tst, Y_tst)
print('The accuracy given by R^2 has been: {:.4f}'.format(knn_r2))

# Root-mean-squared error of the held-out predictions
# (arguments in the documented y_true, y_pred order; the value is symmetric).
Y_pred = KNN_m.predict(X_tst)
rmse = np.sqrt(mean_squared_error(Y_tst, Y_pred))
print('The model has a rmse of: {:.2f}'.format(rmse))

# RMSE relative to the mean held-out target value, reported as a percentage.
Price_mean = Y_tst["class"].mean()
error = rmse / Price_mean
print('The error is: {:.2f}%'.format(error * 100))

### Decision Tree Regressor

In [None]:
# Single decision tree; the fixed seed makes the fitted tree reproducible.
DTR = DecisionTreeRegressor(random_state=42)
results = DTR.fit(X_tr, Y_tr)

# R^2 on the held-out split, reported alongside the tree's split criterion.
tree_r2 = results.score(X_tst, Y_tst)
print('The accuracy by the criterion', results.criterion, 'has been: {:.4f}'.format(tree_r2))

# Root-mean-squared error of the held-out predictions
# (arguments in the documented y_true, y_pred order; the value is symmetric).
Y_pred = results.predict(X_tst)
rmse = np.sqrt(mean_squared_error(Y_tst, Y_pred))
print('The model has a rmse of: {:.2f}'.format(rmse))

# RMSE relative to the mean held-out target value, reported as a percentage.
Price_mean = Y_tst["class"].mean()
error = rmse / Price_mean
print('The error is: {:.2f}%'.format(error * 100))

### Random Forest Regressor

In [None]:
# Random forest with out-of-bag (OOB) scoring enabled; the fixed seed makes the
# fitted forest reproducible.
RFR = RandomForestRegressor(oob_score=True, random_state=42)

# The single-column target frame is flattened to a 1-D array for fitting.
results = RFR.fit(X_tr, np.array(Y_tr).ravel())

# `oob_score_` (trailing underscore) is the estimate computed during fitting.
# `oob_score` without the underscore is only the constructor flag (True), which
# would always be printed as 1.00.
print('The Out Of Bag score is: {:.2f}'.format(results.oob_score_))
print('The accuracy by the criterion',results.criterion,'has been: {:.2f}%'.format(results.score(X_tst,Y_tst)*100))

# Root-mean-squared error of the held-out predictions
# (arguments in the documented y_true, y_pred order; the value is symmetric).
Y_pred = results.predict(X_tst)
rmse = np.sqrt(mean_squared_error(Y_tst, Y_pred))
print('The model has a rmse of: {:.2f}'.format(rmse))

# RMSE relative to the mean held-out target value, reported as a percentage.
Price_mean = np.mean(Y_tst["class"])
error=rmse/Price_mean
print('The error is: {:.2f}%'.format(error*100))

### The best regressor was the Random Forest Regressor, with an error (RMSE relative to the mean test price) of 19.05%