In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # os.walk yields bare file names, so join each with its directory.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **IMPORT ESSENTIAL LIBRARIES**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Load the raw car-price table from the Kaggle input directory into a DataFrame.
dataset = pd.read_csv(
    filepath_or_buffer='../input/car-price-prediction/CarPrice_Assignment.csv'
)

# **Separate Dependent and Independent Variables** 
- We extract the matrix of independent features (X) and dependent values (y) from the dataset
- We also delete the redundant features from the X matrix, this includes - 

    1. Car id : has no impact on predicted value
    2. Car Name: can be inferred from symboling
    3. Engine Location: column has constant value of front in every row


In [None]:

# Feature matrix: remove columns 0, 2 and 8 (the redundant features listed in the
# notes above) and column 25 (the dependent variable) from the raw values.
X = np.delete(dataset.values, [0, 2, 8, 25], axis=1)
# Target as an (n_samples, 1) column. It must be column 25 -- the same column that
# is removed from X above and used for y_bwd below. The previous slice 24:25
# selected column 24, which is still present in X as a feature.
y = dataset.iloc[:, 25:26].values
# The same target as a flat 1-D array, used by the backward-elimination OLS fit.
y_bwd = dataset.iloc[:, 25].values

#  **Avoid dummy variable trap**

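For the categorical independent variables, we apply one-hot encoding to turn each category into a binary column. For a feature with *n* categories we keep only *n-1* binary columns: the dropped category is implied when all the others are 0, and keeping all *n* columns would make them perfectly collinear with the intercept.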

In [None]:
# Introduce dummy variables
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode column 1 of the current X (fuel type, per the encoder name).
# All other columns are passed through unchanged after the encoded ones.
columnTransformer = ColumnTransformer(
    transformers=[('fuelType_encoder', OneHotEncoder(), [1])],
    remainder='passthrough',
)
# Drop the first dummy column so one category serves as the baseline.
X = columnTransformer.fit_transform(X)[:, 1:]


In [None]:
# One-hot encode column 2 of the current X (aspiration, per the encoder name).
# All other columns are passed through unchanged after the encoded ones.
columnTransformer = ColumnTransformer(
    transformers=[('aspiration_encoder', OneHotEncoder(), [2])],
    remainder='passthrough',
)
# Drop the first dummy column so one category serves as the baseline.
X = columnTransformer.fit_transform(X)[:, 1:]

In [None]:
# One-hot encode column 3 of the current X (door number, per the encoder name).
# All other columns are passed through unchanged after the encoded ones.
columnTransformer = ColumnTransformer(
    transformers=[('doorNumber_encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
)
# Drop the first dummy column so one category serves as the baseline.
X = columnTransformer.fit_transform(X)[:, 1:]

In [None]:
# One-hot encode column 4 of the current X (car body, per the encoder name).
# All other columns are passed through unchanged after the encoded ones.
columnTransformer = ColumnTransformer(
    transformers=[('carBody_encoder', OneHotEncoder(), [4])],
    remainder='passthrough',
)
# Drop the first dummy column so one body category serves as the baseline.
X = columnTransformer.fit_transform(X)[:, 1:]

In [None]:
# One-hot encode column 8 of the current X (drive wheel, per the encoder name).
# The index accounts for the dummy columns that earlier steps put in front.
columnTransformer = ColumnTransformer(
    transformers=[('driveWheel_encoder', OneHotEncoder(), [8])],
    remainder='passthrough',
)
# Drop the first dummy column so one drive-wheel category serves as the baseline.
X = columnTransformer.fit_transform(X)[:, 1:]

In [None]:
# One-hot encode column 15 of the current X (engine type, per the encoder name).
# The index accounts for the dummy columns that earlier steps put in front.
columnTransformer = ColumnTransformer(
    transformers=[('engineType_encoder', OneHotEncoder(), [15])],
    remainder='passthrough',
)
# Drop the first dummy column so one engine-type category serves as the baseline.
X = columnTransformer.fit_transform(X)[:, 1:]

In [None]:
# One-hot encode column 21 of the current X (cylinder number, per the encoder name).
# The index accounts for the dummy columns that earlier steps put in front.
columnTransformer = ColumnTransformer(
    transformers=[('cylinderNumber_encoder', OneHotEncoder(), [21])],
    remainder='passthrough',
)
# Drop the first dummy column so one cylinder-number category serves as the baseline.
X = columnTransformer.fit_transform(X)[:, 1:]

In [None]:
# One-hot encode column 28 of the current X (fuel system, per the encoder name).
# NOTE(review): every earlier encoding step shifts the remaining columns, so confirm
# that index 28 still lands on the fuel-system column. A string column left
# unencoded here would make the later float conversion of X fail.
columnTransformer = ColumnTransformer(
    transformers=[('fuelSystem_encoder', OneHotEncoder(), [28])],
    remainder='passthrough',
)
# Drop the first dummy column so one category serves as the baseline.
X = columnTransformer.fit_transform(X)[:, 1:]

# **Backward Elimination**
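We perform backward elimination on the matrix of independent variables (X).
We fit an ordinary least squares (OLS) model and repeatedly remove the column with the highest p-value, refitting each time, for as long as that p-value exceeds the significance level.
The significance level is SL = 0.05.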

In [None]:
# Backward Elimination
import statsmodels.api as sm
# Prepend an intercept column of ones, because sm.OLS does not add a constant itself.
# The row count is taken from X rather than hard-coded, so this keeps working
# if the data set size changes.
X = np.append(arr=np.ones((X.shape[0], 1), dtype=int), values=X, axis=1)

In [None]:
def backwardElimination(x, sl, y=None):
    """Prune predictors by backward elimination on OLS p-values.

    Fits ``sm.OLS(y, x)`` and, while the largest p-value exceeds ``sl``,
    removes that single column and refits. Stops as soon as no p-value
    exceeds ``sl``, or when no columns remain.

    Compared with a fixed-count loop that scans the p-values after deleting,
    exactly one column is removed per fit. Tied p-values are therefore never
    deleted at stale, shifted indices, and no refits happen once every
    remaining column is significant.

    Parameters
    ----------
    x : 2-D array-like of float, shape (n_samples, n_features)
        Design matrix. It is not modified; a reduced copy is returned.
    sl : float
        Significance level. A column is dropped only while its p-value is
        the largest one and is strictly greater than ``sl``.
    y : 1-D array-like, optional
        Response vector. Defaults to the notebook-level ``y_bwd`` so that
        existing two-argument calls behave as before.

    Returns
    -------
    numpy.ndarray
        ``x`` with the eliminated columns removed.
    """
    if y is None:
        # Backward-compatible fallback to the notebook-level response vector.
        y = y_bwd
    x = np.asarray(x)
    while x.shape[1] > 0:
        pvalues = np.asarray(sm.OLS(y, x).fit().pvalues, dtype=float)
        # NaN p-values cannot be compared with sl; ignore them when ranking.
        if np.isnan(pvalues).all():
            break
        worst = int(np.nanargmax(pvalues))
        if not pvalues[worst] > sl:
            break
        x = np.delete(x, worst, axis=1)
    return x

In [None]:
# Significance level used as the p-value cut-off during backward elimination.
SL = 0.05
# The encoded matrix holds mixed Python objects; cast it to a float array first.
X = np.array(object=X, dtype=np.float64)
# Keep only the columns that survive the p-value based elimination.
X_bwd_elm = backwardElimination(x=X, sl=SL)


# **Test-Train Split**
We split the given X & y matrices into test and train data

In [None]:
#Test Train split
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_bwd_elm,
    y,
    test_size=0.2,
    random_state=0,
)


# **Data Scaling**
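The train and test splits are now standardized.
This puts every independent variable on a comparable scale, so that no feature dominates the model merely because of its units.
We scale both X and y, fitting each scaler on the training split only and reusing it on the test split.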

In [None]:
#Scaling of data
from sklearn.preprocessing import StandardScaler
# Learn the feature scaling from the training split only, then apply it to both splits.
sc_X = StandardScaler().fit(X_train)
X_train, X_test = sc_X.transform(X_train), sc_X.transform(X_test)
# Same procedure for the target: fit on the training targets, transform both splits.
sc_Y = StandardScaler().fit(y_train)
y_train, y_test = sc_Y.transform(y_train), sc_Y.transform(y_test)


# **SVR Model created**

In [None]:
#SVR
from sklearn.svm import SVR
# Linear-kernel support vector regressor trained on the standardized training split.
svr_regressor = SVR(kernel='linear')
# y_train is an (n_samples, 1) column after scaling, but SVR.fit expects a 1-D target.
# Flatten it to avoid scikit-learn's column-vector DataConversionWarning.
svr_regressor.fit(X_train, y_train.ravel())

# Predictions come back as a 1-D array in the standardized target units.
y_pred = svr_regressor.predict(X_test)


# **Error Calculated**

In [None]:
from sklearn.metrics import mean_absolute_error
# Both y_test and y_pred are in standardized units (the target was scaled by sc_Y),
# so the reported error is in standard deviations of the target, not in raw units.
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))