In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme so every later matplotlib figure uses it.
sns.set()

## Loading data

In [None]:
# Read the preprocessed car-price table and display it as a first sanity check.
csv_path = '../input/car-price-preprocessed/Car_price_preprocessed.csv'
data = pd.read_csv(csv_path)
data

## Exploring the PDFs

Let's first take a quick look at the descriptive statistics of our features. This will give us a first idea of whether there are significant outliers.

In [None]:
# Lift pandas' display limits (for the rest of the session) so that no column
# or row of the summary below is elided, then show the descriptive statistics.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
data.describe()

From the table above we can say that there are not many outliers in our numerical variables, but to confirm this hypothesis let's take a look at the PDFs.

In [None]:
# Estimated PDF (histogram + KDE) of the normalized-losses feature.
sns.distplot(data.loc[:, 'normalized-losses'])
plt.show()

Let's get rid of the outliers that we see on the right side of the PDF by keeping only the observations below the 99th percentile.

In [None]:
# Keep only rows strictly below the 99th percentile of normalized-losses,
# then re-plot the PDF to confirm the right tail is gone.
# NOTE: this cell rebinds `data`, so re-running it trims the data again.
q = data['normalized-losses'].quantile(0.99)
data = data.loc[data['normalized-losses'].lt(q)]
sns.distplot(data.loc[:, 'normalized-losses'])
plt.show()

In [None]:
# Estimated PDF of each body-dimension, weight and engine-size feature.
# A single loop replaces six copy-pasted cells; every feature still gets its
# own figure, drawn in the same order as before.
for feature in ['wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size']:
    sns.distplot(data[feature])
    plt.show()

Repeating the same process for engine size.

In [None]:
# Keep only rows strictly below the 99th percentile of engine-size,
# then re-plot the PDF to check the effect of the trim.
# NOTE: this cell rebinds `data`, so re-running it trims the data again.
q = data['engine-size'].quantile(0.99)
data = data.loc[data['engine-size'].lt(q)]
sns.distplot(data.loc[:, 'engine-size'])
plt.show()

In [None]:
# Estimated PDF (histogram + KDE) of the bore feature.
sns.distplot(data.loc[:, 'bore'])
plt.show()

In this case the outliers are on the left side, so we cut at the 1st percentile instead; the logic of the process is otherwise the same.

In [None]:
# Keep only rows strictly above the 1st percentile of bore (left-tail trim),
# then re-plot the PDF to check the effect of the trim.
# NOTE: this cell rebinds `data`, so re-running it trims the data again.
q = data['bore'].quantile(0.01)
data = data.loc[data['bore'].gt(q)]
sns.distplot(data.loc[:, 'bore'])
plt.show()

In [None]:
# Estimated PDFs of stroke and compression-ratio.
# One loop replaces two copy-pasted cells; each feature still gets its own figure.
for feature in ['stroke', 'compression-ratio']:
    sns.distplot(data[feature])
    plt.show()

At first sight it seems like we have some outliers in the compression ratio feature, but a quick Google search shows us that the common range for the compression ratio is 7-13 for gas engines and 16-24 for diesel engines, so let's check whether this explains what we see in the PDF.

In [None]:
# Rows whose compression ratio is 20 or more (the right-hand cluster of the PDF).
data.loc[data['compression-ratio'].ge(20)]

In [None]:
# Rows whose compression ratio is 13 or less (the left-hand cluster of the PDF).
data.loc[data['compression-ratio'].le(13)]

Just as expected, the two groups we saw in the compression ratio PDF were separated based on fuel type.

In [None]:
# Estimated PDF of each performance / fuel-economy feature.
# A single loop replaces four copy-pasted cells; every feature still gets its
# own figure, drawn in the same order as before.
for feature in ['horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg']:
    sns.distplot(data[feature])
    plt.show()

## Checking the OLS assumptions

Let's plot the numerical values against the price to see what kind of relationship we can observe.

In [None]:
# List the column names currently in `data`, to pick the numerical features from.
data.columns.values

In [None]:
# Numerical columns that will be plotted against the price.
num_features = [
    'symboling',
    'normalized-losses',
    'wheel-base',
    'length',
    'width',
    'height',
    'curb-weight',
    'num-of-cylinders',
    'engine-size',
    'bore',
    'stroke',
    'compression-ratio',
    'horsepower',
    'peak-rpm',
    'city-mpg',
    'highway-mpg',
]

In [None]:
# Number of entries in num_features.
len(num_features)

In [None]:
# Scatter each numerical feature against the raw price.
# Iterating over num_features directly (instead of a hard-coded range(16))
# keeps the loop correct if the feature list is ever edited.
for feature in num_features:
    plt.scatter(data[feature], data['price'])
    plt.title('Price and {}'.format(feature))
    plt.show()

In [None]:
# Scatter each numerical feature against the log of the price.
# Iterating over num_features directly (instead of a hard-coded range(16))
# keeps the loop correct if the feature list is ever edited.
for feature in num_features:
    plt.scatter(data[feature], np.log(data['price']))
    plt.title('Log Price and {}'.format(feature))
    plt.show()

In [None]:
# Scatter the log of each numerical feature against the log of the price.
# Iterating over num_features directly (instead of a hard-coded range(16))
# keeps the loop correct if the feature list is ever edited.
for feature in num_features:
    # The logarithm is only defined for strictly positive values; features that
    # contain zeros or negatives would otherwise yield -inf/NaN points and
    # runtime warnings, so only the positive observations are plotted.
    positive = data[feature] > 0
    plt.scatter(np.log(data.loc[positive, feature]), np.log(data.loc[positive, 'price']))
    plt.title('Log Price and Log {}'.format(feature))
    plt.show()

Based on the previous plots we can say that a log transformation of the dependent variable makes its relationship with most of our variables closer to linear, which helps satisfy the linearity assumption.

In [None]:
# Replace the raw price with its natural log: `pop` removes the 'price' column
# and hands it to np.log, and the result is stored as the new last column.
data['Log-price'] = np.log(data.pop('price'))
data.head()

We can also see that some features aren't helpful to represent the behavior of the price, so we can just drop them.

In [None]:
# Remove, in a single call, the three features judged uninformative for the price.
data.drop(columns=['compression-ratio', 'stroke', 'peak-rpm'], inplace=True)
data.head()

The plots also show that the number of cylinders only makes a difference in the price in two cases: when there are 4 cylinders and when there are more than four. Based on this we can turn it into a binary variable.

In [None]:
# Distinct values currently present in the num-of-cylinders column.
data['num-of-cylinders'].unique()

In [None]:
# Encode "more than four cylinders" as 1 and everything else as 0.
# A threshold comparison is used instead of a {4:0, 5:1, 6:1} lookup so that a
# cylinder count missing from the lookup cannot silently become NaN, which
# would later break the scaler and the regression.
data['num-of-cylinders'] = (data['num-of-cylinders'] > 4).astype(int)
data.head()

In [None]:
# Give the binarized column a name that reflects its new meaning.
data = data.rename({'num-of-cylinders': '+4 cylinders'}, axis='columns')
data.head()

We can also take the logarithm of city-mpg and highway-mpg, but we can expect these variables to be highly correlated, so let's keep just one of them.

In [None]:
# Keep only highway-mpg (city-mpg is dropped), log-transform it in place and
# rename the column so the transformation is visible in its name.
data.drop(columns='city-mpg', inplace=True)
data['highway-mpg'] = np.log(data['highway-mpg'])
data.rename({'highway-mpg': 'Log highway-mpg'}, axis='columns', inplace=True)
data.head()

In [None]:
# Show the first rows of the current state of `data`.
data.head()

Let's apply a special transformation to the symboling feature to try to make its relationship with the price more linear.

In [None]:
# Distance of the symboling rating from 1, computed once and reused for both
# the diagnostic plot and the column update.
symb_distance = (data['symboling'] - 1).abs()

# Diagnostic scatter of the transformed feature against the square root of
# Log-price (the square root is used for this plot only; it is not stored).
plt.scatter(symb_distance, data['Log-price'] ** 0.5)
plt.show()

# Overwrite the feature with its transformed version and rename it accordingly.
data['symboling'] = symb_distance
data.rename({'symboling': 'abs(symb-1)'}, axis='columns', inplace=True)
data.head()

In [None]:
# Snapshot of the frame after the linearity transformations; the deep copy
# keeps it independent of the later in-place edits to `data`.
linear_data = data.copy(deep=True)

## Let's now check for multicollinearity with VIF

In [None]:
# List the column names currently in `data`, to choose the VIF candidates from.
data.columns.values

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Continuous features whose mutual collinearity we want to measure.
variables = data[['abs(symb-1)', 'normalized-losses',
       'wheel-base', 'length', 'width', 'height', 'curb-weight',
       'engine-size', 'bore', 'horsepower', 'Log highway-mpg']]

# variance_inflation_factor regresses each column on all the others *without*
# adding an intercept itself, so the design matrix must carry an explicit
# constant; otherwise the reported VIFs are uncentered and strongly inflated.
# The constant is appended as the last column, so indices 0..k-1 still map to
# the features in `variables`.
design = variables.assign(const=1.0)

vif = pd.DataFrame()
vif['Features'] = variables.columns
vif['VIF'] = [variance_inflation_factor(design.values, i) for i in range(variables.shape[1])]

In [None]:
# Display the VIF table built in the previous cell.
vif

We eliminate the highly correlated features one at a time until we are left with just the following:

In [None]:
# Features that remain after removing the highly collinear ones.
variables = data[['abs(symb-1)','normalized-losses']]

# variance_inflation_factor does not add an intercept itself, so an explicit
# constant column is appended (last position) to obtain centered VIFs; indices
# 0..k-1 still map to the features in `variables`.
design = variables.assign(const=1.0)

vif = pd.DataFrame()
vif['Features'] = variables.columns
vif['VIF'] = [variance_inflation_factor(design.values, i) for i in range(variables.shape[1])]

In [None]:
# Display the VIF table for the reduced feature set.
vif

In [None]:
# Remove all features discarded by the multicollinearity check in one call.
collinear_features = ['wheel-base', 'length', 'width', 'height', 'curb-weight',
                      'engine-size', 'bore', 'horsepower', 'Log highway-mpg']
data.drop(columns=collinear_features, inplace=True)
data.head()

## Linear Regression Model

In [None]:
# Separate the regressors (every column except Log-price) from the target.
inputs = data.drop(columns=['Log-price'])
targets = data['Log-price']

Let's now scale our input data.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every input column to zero mean and unit variance; fit_transform
# learns the column statistics and applies them in a single step.
scaler = StandardScaler()
scaled_inputs = scaler.fit_transform(inputs)

In [None]:
# Show the standardized inputs produced by the scaler.
scaled_inputs

Now we should split our data into training and testing.

In [None]:
from sklearn.model_selection import train_test_split

# Split the *unscaled* inputs first. With the same random_state and the same
# number of rows this selects exactly the same train/test rows as splitting a
# scaled copy would.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(inputs, targets, test_size=0.2, random_state=97)

# Learn the scaling statistics from the training rows only and apply them to
# both subsets. Fitting the scaler on the full dataset would leak the test
# set's mean and variance into training and bias the test metrics.
scaler = StandardScaler().fit(x_train_raw)
x_train = scaler.transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

## Creating the regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares fit on the training split; `fit` returns the
# estimator itself, which is echoed as the cell output.
reg = LinearRegression().fit(x_train, y_train)
reg

In [None]:
# In-sample predictions: the fitted model applied to the rows it was trained on.
y_hat = reg.predict(x_train)

In [None]:
# Targets vs. in-sample predictions: for a good fit the cloud of points
# concentrates around the 45-degree diagonal.
plt.scatter(y_train, y_hat)
plt.xlabel('Targets (y_train)', fontsize=18)
plt.ylabel('Predictions (y_hat)', fontsize=18)
plt.show()

In [None]:
# Another useful check of our model is a residual plot
# We can plot the PDF of the residuals and check for anomalies
# Estimated PDF of the training residuals (target minus prediction), used to
# check for skew or heavy tails that would point to a systematic error.
sns.distplot(y_train - y_hat)
plt.title("Residuals PDF", size=18)
# Render the figure explicitly, like every other plotting cell, so the cell
# does not print the Text object returned by plt.title.
plt.show()

Let's now find the R-Squared of our model.

In [None]:
# R-squared of the fitted model on the training split.
reg.score(x_train, y_train)

## Finding the weights and bias

In [None]:
# Fitted intercept (bias) of the regression.
reg.intercept_

In [None]:
# Pair every input column with its fitted coefficient in one table.
reg_summary = pd.DataFrame({
    'Features': inputs.columns.values,
    'Weights': reg.coef_,
})
reg_summary

## Testing the model

In [None]:
# Predictions of the fitted model for the held-out test rows.
y_hat_test = reg.predict(x_test)

In [None]:
# Targets vs. predictions on the test split, drawn on identical axis ranges so
# that the 45-degree diagonal corresponds to a perfect prediction.
log_price_range = (8.5, 10.5)
plt.scatter(y_test, y_hat_test)
plt.xlabel('Targets (y_test)', fontsize=18)
plt.ylabel('Predictions (y_hat_test)', fontsize=18)
plt.xlim(*log_price_range)
plt.ylim(*log_price_range)
plt.show()

In [None]:
# R-squared of the fitted model on the held-out test split.
reg.score(x_test, y_test)

In [None]:
# Performance table: exponentiate the predicted log-prices to get prices back.
df_pf = pd.DataFrame({'Prediction': np.exp(y_hat_test)})
df_pf.head()

In [None]:
# Discard the original row labels so y_test is indexed 0..n-1 like df_pf.
# The column assignment below aligns on the index, so mismatched labels would
# produce NaN targets.
y_test = y_test.reset_index(drop=True)
# Undo the log transform so the targets are prices again, like the predictions.
df_pf['Target'] = np.exp(y_test)

In [None]:
# Display the predictions side by side with the true prices.
df_pf

In [None]:
# Error of every test prediction, in price units and as an absolute percentage
# of the true price.
residual = df_pf['Target'] - df_pf['Prediction']
df_pf['Residual'] = residual
df_pf['Difference%'] = (residual / df_pf['Target'] * 100).abs()
df_pf

In [None]:
# Summary statistics of predictions, targets, residuals and percentage errors.
df_pf.describe()

We can see that our average Difference% is about 12.4% and the test R-squared is about 85.3%, so the predictive analysis has been successful.