In [None]:
import numpy as np, pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the BMW used-car listings from the Kaggle input directory into a DataFrame.
bmw= pd.read_csv('../input/used-car-dataset-ford-and-mercedes/bmw.csv')
# Preview the first rows to see the raw columns.
bmw.head()

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
bmw.info()

#### We convert the object-type columns to the category type and replace them with their integer category codes, so that these features can also be included in the linear regression model

In [None]:
# Replace the text column 'model' with its integer category codes, stored in
# a new 'Model' column appended at the end of the frame.
bmw['Model'] = bmw['model'].astype('category').cat.codes
bmw = bmw.drop(columns='model')

In [None]:
# Replace 'year' with its integer category codes, stored in a new 'Year'
# column appended at the end of the frame.
bmw['Year'] = bmw['year'].astype('category').cat.codes
bmw = bmw.drop(columns='year')

In [None]:
# Replace 'transmission' with its integer category codes, stored in a new
# 'Transmission' column appended at the end of the frame.
bmw['Transmission'] = bmw['transmission'].astype('category').cat.codes
bmw = bmw.drop(columns='transmission')

In [None]:
# Replace 'fuelType' with its integer category codes, stored in a new 'Fuel'
# column appended at the end of the frame.
bmw['Fuel'] = bmw['fuelType'].astype('category').cat.codes
bmw = bmw.drop(columns='fuelType')

In [None]:
# Preview the frame after encoding: model, year, transmission and fuelType
# have been replaced by the integer-coded Model, Year, Transmission and Fuel.
bmw.head()

## Model Building

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
bmw_train, bmw_test = train_test_split(bmw, test_size=0.3, random_state=100)

# Both parts are modified in place later (scaled columns are assigned back,
# columns are dropped/popped). Take explicit copies so those writes act on
# independent frames; writing to frames derived from `bmw` is what pandas may
# flag as chained assignment, and the global warnings filter would hide it.
bmw_train = bmw_train.copy()
bmw_test = bmw_test.copy()

In [None]:
# The model is built on bmw_train only; bmw_test is held back for evaluation

In [None]:
# List the training columns (used to write out the columns to scale below).
bmw_train.columns

In [None]:
# Columns to be scaled on the training set (the target 'price' is included).
var = [
    'price',
    'mileage',
    'tax',
    'mpg',
    'engineSize',
    'Model',
    'Year',
    'Transmission',
    'Fuel',
]

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scaler with default settings; it is fitted on the training columns
# below and reused unchanged to transform the test set.
scaler=MinMaxScaler()

In [None]:
# Fit the scaler on the training rows only, then overwrite the listed columns
# (target included) with their scaled values.
bmw_train[var] = scaler.fit(bmw_train[var]).transform(bmw_train[var])

In [None]:
# Preview the training frame after scaling: the columns listed in `var` now
# hold the scaler's output instead of the raw values.
bmw_train.head()

#### Our target variable is price (y)

In [None]:
# Split the (scaled) target off the training frame: y keeps 'price',
# bmw_train keeps only the predictors.
y = bmw_train['price']
bmw_train = bmw_train.drop(columns='price')

In [None]:
import statsmodels.api as sm

In [None]:
# Design matrix for the first model: all training predictors passed through
# statsmodels' add_constant so the regression has an intercept term.
x= sm.add_constant(bmw_train)

In [None]:
# First model: ordinary least squares of price on every predictor.
lr01 = sm.OLS(endog=y, exog=x).fit()
# The summary table lists coefficients and p-values used for feature selection.
print(lr01.summary())

##### The p-value of 'mpg' is above 0.05, so it is not statistically significant at the 5% level; we therefore drop 'mpg' from our dataset

In [None]:
# Remove the 'mpg' predictor from the training frame.
bmw_train = bmw_train.drop(columns='mpg')

In [None]:
# Rebuild the design matrix (with intercept) from the remaining
# features/columns/independent variables.
X= sm.add_constant(bmw_train)

In [None]:
# Second model: OLS refit on the design matrix that no longer contains 'mpg'.
lr02 = sm.OLS(endog=y, exog=X).fit()
print(lr02.summary())

In [None]:
# Every feature is now significant.
# Next we check the multicollinearity among the features/columns/independent variables using VIF (Variance Inflation Factor)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Variance Inflation Factor of every remaining predictor, rounded to two
# decimals and listed from highest to lowest.
# NOTE(review): the matrix given to variance_inflation_factor is bmw_train
# itself, which has no intercept column (the add_constant frame is not used
# here) — confirm against the statsmodels docs whether a constant should be
# included before relying on the absolute VIF values.
vif = (
    pd.DataFrame({
        'Features': bmw_train.columns,
        'VIF': [variance_inflation_factor(bmw_train.values, idx)
                for idx in range(len(bmw_train.columns))],
    })
    .round({'VIF': 2})
    .sort_values(by='VIF', ascending=False)
)
vif

##### A VIF above 10 indicates high collinearity, so we first drop 'engineSize'

In [None]:
# Remove the 'engineSize' predictor from the training frame.
bmw_train = bmw_train.drop(columns='engineSize')

In [None]:
# Rebuild the design matrix (with intercept) from the remaining independent
# variables/features; this rebinds `x`, which the final model is fitted on.
x=sm.add_constant(bmw_train)

In [None]:
# Third model: OLS refit after 'mpg' and 'engineSize' have been removed.
lr03 = sm.OLS(endog=y, exog=x).fit()
print(lr03.summary())

##### All the features are significant; next we look at their VIF values

In [None]:
# Recompute the Variance Inflation Factors for the reduced predictor set,
# rounded to two decimals and listed from highest to lowest.
# NOTE(review): as before, bmw_train is passed without an intercept column —
# confirm against the statsmodels docs whether a constant should be included.
vif = (
    pd.DataFrame({
        'Features': bmw_train.columns,
        'VIF': [variance_inflation_factor(bmw_train.values, idx)
                for idx in range(len(bmw_train.columns))],
    })
    .round({'VIF': 2})
    .sort_values(by='VIF', ascending=False)
)
vif

##### All the features now have a VIF below 10, so **lr03** will be our final model

### Residual Analysis of train dataset(bmw_train)

In [None]:
# In-sample predictions of the final model on its own training design matrix.
y_pred = lr03.predict(exog=x)

In [None]:
# Training residuals: observed (scaled) price minus fitted value.
residual = y.sub(y_pred)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Distribution of the training residuals: density histogram with a KDE overlay.
# sns.distplot is deprecated in seaborn; histplot with these arguments is the
# documented replacement that reproduces distplot's default look.
sns.histplot(residual, kde=True, stat='density', kde_kws=dict(cut=3))

##### Our residuals are approximately normally distributed, with a mean value close to 0

## Test Dataset

In [None]:
# Preview the held-out test frame (still unscaled at this point).
bmw_test.head()

In [None]:
# List the test columns (used to write out the columns to scale below).
bmw_test.columns

In [None]:
# Columns to be scaled on the test set; same names and order as used when the
# scaler was fitted on the training set.
var01 = [
    'price',
    'mileage',
    'tax',
    'mpg',
    'engineSize',
    'Model',
    'Year',
    'Transmission',
    'Fuel',
]

In [None]:
# Scale the test columns with the scaler already fitted on the training set
# (transform only — the scaler is not refitted on test data).
bmw_test[var01] = scaler.transform(bmw_test.loc[:, var01])

In [None]:
# Preview the test frame after scaling: the columns listed in `var01` now
# hold the scaler's output instead of the raw values.
bmw_test.head()

In [None]:
# Remove the two predictors that were eliminated during training ('mpg' and
# 'engineSize') so the test frame matches the final model's features.
bmw_test = bmw_test.drop(columns=['mpg', 'engineSize'])

In [None]:
# Separate the (scaled) target from the test frame: y_test keeps 'price',
# bmw_test keeps only the predictors.
y_test = bmw_test['price']
bmw_test = bmw_test.drop(columns='price')

In [None]:
# Test design matrix: remaining test predictors passed through add_constant,
# mirroring how the training design matrix for lr03 was built.
x_test=sm.add_constant(bmw_test)

In [None]:
# Predict on the test set with lr03, the model fitted on the training set.
y_test_pred = lr03.predict(exog=x_test)

#### Residual Analysis

In [None]:
# Test residuals: observed (scaled) price minus predicted value.
residual01 = y_test.sub(y_test_pred)

In [None]:
# Distribution of the test residuals: density histogram with a KDE overlay.
# sns.distplot is deprecated in seaborn; histplot with these arguments is the
# documented replacement that reproduces distplot's default look.
sns.histplot(residual01, kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Actual vs. predicted (scaled) price on the test set, drawn on the current axes.
ax = plt.gca()
ax.scatter(y_test, y_test_pred)
ax.set_xlabel('y_test', fontsize=10)
ax.set_ylabel('y_test_pred', fontsize=10)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# Root mean squared error on the test set. Both inputs are in scaled price
# units, so the result is too.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_test_pred))

In [None]:
# Coefficient of determination (R²) of lr03's predictions on the test set.
R_squared = r2_score(y_true=y_test, y_pred=y_test_pred)
R_squared

### Final Equation

In [None]:
# Print the final model's summary again; the fitted coefficients shown here
# are the ones used to write out the final equation.
print(lr03.summary())

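#### Price = -0.1862 - 0.1599*mileage + 0.1083*tax + 0.1339*Model + 0.3704*Year + 0.0091*Transmission - 0.0040*Fuel (all variables, including Price, are the min-max scaled values; Model, Year, Transmission and Fuel are the integer category codes before scaling)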