In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

## Understanding the dataset

In [None]:
# Read bmw.csv from the used-car dataset directory and preview the first rows
df = pd.read_csv(filepath_or_buffer="../input/used-car-dataset-ford-and-mercedes/bmw.csv")
df.head(n=5)

In [None]:
# Summary statistics of the numeric columns (quartiles spelled out explicitly)
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# One-hot encode the categorical columns: each listed column is replaced by
# one indicator column per category level.
categorical_columns = ['model', 'transmission', 'fuelType']
df = pd.get_dummies(df, columns=categorical_columns)
df.head(n=5)

In [None]:
# Features are every column except the target; the target is the price column
target_column = 'price'
X = df.drop(columns=[target_column])
y = df[target_column]

## Check for missing values

In [None]:
# Number of missing entries in each column
df.isna().sum()

There are no missing values in the dataset.

## Checking the normality assumption

In [None]:
# Shapiro-Wilk is a one-sample test: SciPy flattens whatever array it is given,
# so passing the whole frame would pool every column (0/1 indicator columns
# included) into a single meaningless sample. Test each non-binary feature on
# its own and label the results instead.
# Note: for samples larger than 5000 SciPy warns that the p-value may be
# inaccurate (the W statistic itself remains valid).
shapiro_rows = []
for feature in X.columns:
    if X[feature].nunique() <= 2:
        # Indicator (dummy) columns cannot be normal by construction; skip them.
        continue
    w_statistic, p_value = stats.shapiro(X[feature].astype(float))
    shapiro_rows.append(
        {"feature": feature, "statistic": w_statistic, "pvalue": p_value}
    )

pd.DataFrame(shapiro_rows, columns=["feature", "statistic", "pvalue"]).set_index("feature")

In [None]:
# D'Agostino-Pearson omnibus normality test, evaluated down the rows so that
# each column gets its own statistic and p-value (axis=0 is SciPy's default).
stats.normaltest(X, axis=0)

The data are not normally distributed: the p-value is less than alpha, so the null hypothesis that the data are normally distributed is rejected.

## Checking VIF for multicollinearity

In [None]:
# Import library for VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

def get_vif(X):
    """Compute the variance inflation factor (VIF) of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; every column must be convertible to float
        (numeric or boolean indicator columns).

    Returns
    -------
    pandas.DataFrame
        Two columns: ``"variables"`` (the column names of ``X``) and
        ``"VIF"`` (the corresponding variance inflation factor), in the
        same order as ``X.columns``.

    Notes
    -----
    The columns are passed to statsmodels exactly as given; no intercept
    column is added here.
    """
    # Convert once, outside the loop, and force a float array: a frame mixing
    # boolean indicator columns with numeric ones would otherwise yield an
    # object-dtype array that the underlying least-squares fit cannot handle.
    design = X.values.astype(float)

    # Calculating VIF
    vif = pd.DataFrame({
        "variables": X.columns,
        "VIF": [
            variance_inflation_factor(design, position)
            for position in range(design.shape[1])
        ],
    })

    return vif

In [None]:
# VIF table for the full encoded feature matrix
get_vif(X=X)

In [None]:
# Columns removed from the feature matrix before the VIF is recomputed
columns_to_drop = ['transmission_Semi-Auto', 'model_ X1', 'fuelType_Petrol', 'year']
X_new = X.drop(columns=columns_to_drop)

In [None]:
# VIF table for the reduced feature matrix
get_vif(X=X_new)

## Standardization of X and y

In [None]:
# Standardize the reduced feature matrix column by column
scaler = StandardScaler()
X_scaled = scaler.fit(X_new).transform(X_new)

# z-score the target using its own mean and standard deviation
y = y.sub(y.mean()).div(y.std())

## Build the model

In [None]:
# Split the *unscaled* features first and only then fit a scaler on the
# training rows. Scaling the full matrix before splitting lets the test rows'
# means and variances leak into the transformation applied to the training
# data. The seed and test fraction are unchanged, so the same rows land in
# each split as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_new, y, test_size=0.25, random_state=42
)

# Fit on the training rows only; reuse those statistics for the test rows.
train_scaler = StandardScaler().fit(X_train_raw)
X_train = train_scaler.transform(X_train_raw)
X_test = train_scaler.transform(X_test_raw)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Ordinary least-squares baseline
model_1 = LinearRegression()
model_1.fit(X_train, y_train)

# k-nearest-neighbours regressor with the library's default settings
model_2 = KNeighborsRegressor()
model_2.fit(X_train, y_train)

# Seeded so the forest (bootstrap samples, feature draws) is identical on
# every Restart & Run All.
model_3 = RandomForestRegressor(n_estimators=50, random_state=42)
model_3.fit(X_train, y_train)

# 'reg:linear' is the deprecated alias of the squared-error objective; use
# the current name so the cell runs without warnings on recent XGBoost.
model_4 = XGBRegressor(objective='reg:squarederror')
model_4.fit(X_train, y_train)

In [None]:
# RMSE on the train and test splits
def rmse(model):
    """Print the root-mean-squared error of ``model`` on both splits.

    Predictions are made on the notebook-level ``X_train`` and ``X_test``
    and compared with ``y_train`` and ``y_test``; each RMSE is rounded to
    three decimals. Nothing is returned.
    """
    rounded_scores = []
    for features, target in ((X_train, y_train), (X_test, y_test)):
        predicted = model.predict(features)
        error = np.sqrt(mean_squared_error(target, predicted))
        rounded_scores.append(round(error, 3))

    rmse_train, rmse_test = rounded_scores
    print("train: ", rmse_train, "test: ", rmse_test)

In [None]:
# `.score` on a regressor returns the coefficient of determination (R^2),
# not a classification accuracy, so the messages are labelled accordingly.
# The models are reported in the same order as before.
fitted_models = {
    "Linear Regression": model_1,
    "Random Forest Regression": model_3,
    "KNeighborsRegression": model_2,
    "XGBoostRegression": model_4,
}

for label, fitted in fitted_models.items():
    print("R^2 on training set using {}: {:.2f}".format(label, fitted.score(X_train, y_train)))
    print("R^2 on testing set using {}: {:.2f}".format(label, fitted.score(X_test, y_test)))


In [None]:
# Train/test RMSE of the XGBoost model
rmse(model=model_4)