In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# INTRODUCTION

This notebook contains an analysis and predictions of car prices from the dataset provided.
The notebook is divided into 3 parts, each of which builds a different model. The intention of each model is specified, and the results obtained from each model are tabulated. Finally, a comparison is made at the end.

In [None]:
import pandas as pd

# Location of the car-price CSV, relative to the notebook's working directory.
file_path = '../input/car-price-prediction/CarPrice_Assignment.csv'

## EXPLORING THE AVAILABLE DATA

# Read the raw dataset; the cell ends with a preview of its first five rows.
car_data = pd.read_csv(filepath_or_buffer=file_path)
car_data.head(n=5)

In [None]:
car_data.columns

## CONVERTING CATEGORICAL DATA TO NUMERIC FOR EASE OF ANALYSIS
## KEY:
## CARBODY---- HATCHBACK:0 SEDAN:1 HARDTOP:2 WAGON:3 CONVERTIBLE:4
## FUELTYPE---- GAS:1 DIESEL:-1
## DOORNUMBER--- FOUR:4 TWO:2
## ASPIRATION--- STANDARD:0 TURBO:1
## DRIVEWHEEL--- RWD:0 FWD:1 4WD:2
## CYLINDERNUMBER---- TWO to TWELVE:2-12
## FUELSYSTEM---- MPFI:0 1BBL:1 2BBL:2 4BBL:3 MFI:4 SPFI:5 IDI:6 SPDI:7

# One lookup table per categorical column (the KEY above, in executable form).
CATEGORY_CODES = {
    'carbody': {'hatchback': 0, 'sedan': 1, 'hardtop': 2, 'wagon': 3, 'convertible': 4},
    'fueltype': {'gas': 1, 'diesel': -1},
    'doornumber': {'four': 4, 'two': 2},
    'aspiration': {'std': 0, 'turbo': 1},
    'drivewheel': {'rwd': 0, 'fwd': 1, '4wd': 2},
    'cylindernumber': {'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7,
                       'eight': 8, 'nine': 9, 'ten': 10, 'eleven': 11, 'twelve': 12},
    'fuelsystem': {'mpfi': 0, '1bbl': 1, '2bbl': 2, '4bbl': 3, 'mfi': 4, 'spfi': 5,
                   'idi': 6, 'spdi': 7},
}

# Work on a copy so the raw frame `car_data` stays untouched.
data = car_data.copy()
for _column, _codes in CATEGORY_CODES.items():
    _encoded = data[_column].map(_codes)
    # Series.map turns any label missing from the lookup into NaN without warning.
    # Fail here with the offending labels instead of letting NaN reach the models.
    _unmapped = data.loc[_encoded.isna() & data[_column].notna(), _column].unique()
    if len(_unmapped) > 0:
        raise ValueError(
            f"Column '{_column}' contains categories with no numeric code: {list(_unmapped)}"
        )
    data[_column] = _encoded




In [None]:
## PREDICTION TARGET
y = data['price']

## TASK 1 FEATURES
task1_features = ['carbody', 'enginesize', 'peakrpm']
X = data[task1_features]

## MOST OF THE FEATURES
broad_features = [
    'symboling', 'doornumber', 'aspiration', 'carbody', 'enginesize',
    'fuelsystem', 'peakrpm', 'fueltype', 'cylindernumber', 'drivewheel',
    'carlength', 'carwidth', 'carheight', 'wheelbase', 'boreratio',
    'stroke', 'compressionratio', 'curbweight', 'horsepower',
    'citympg', 'highwaympg',
]
Z = data[broad_features]


In [None]:
## ADJUSTED R-SQ FUNCTION
def adj_r2(x, y, estimator=None):
    """Return the adjusted R-squared of a fitted regressor on ``(x, y)``.

    Adjusted R-squared is ``1 - (1 - R2) * (n - 1) / (n - p - 1)``, where
    ``n`` is the number of rows of ``x`` and ``p`` its number of columns.

    Parameters
    ----------
    x : two-dimensional array-like with a ``shape`` attribute
        Feature matrix the estimator is scored on.
    y : array-like
        Target values matching the rows of ``x``.
    estimator : object with a ``score(x, y)`` method, optional
        Fitted regressor to evaluate. When omitted, the notebook-level
        variable ``model`` is used, which keeps existing two-argument
        calls working.

    Returns
    -------
    float
        The adjusted R-squared.

    Raises
    ------
    ValueError
        If ``n - p - 1 <= 0``, where the statistic is undefined.
    """
    if estimator is None:
        # Backward-compatible default: whichever regressor the notebook
        # most recently bound to the global name `model`.
        estimator = model
    n, p = x.shape[0], x.shape[1]
    dof = n - p - 1
    if dof <= 0:
        raise ValueError(
            f"adjusted R-squared needs more rows than predictors + 1 (n={n}, p={p})"
        )
    r2 = estimator.score(x, y)
    return 1 - (1 - r2) * (n - 1) / dof

# REGRESSION 1 (ONLY THE THREE VARIABLES MENTIONED IN TASK 1)

Multiple Regression is performed on the dataset. This regression pertains to Task 1 only. Car price predictions are made based on:
* Car Body
* Engine Size
* Peak Rpm


In [None]:
## SPLITTING DATASET INTO TRAINING DATA AND VALIDATION DATA
from sklearn.model_selection import train_test_split

# random_state is pinned so the same rows land in each part on every run.
task1_split = train_test_split(X, y, random_state=1)
train_X, val_X, train_y, val_y = task1_split

In [None]:
## PERFORMING REGRESSION
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the Task 1 training features.
model = LinearRegression()
model.fit(train_X, train_y)

# Predict prices for the held-out validation rows.
predicted = model.predict(val_X)

# Build the comparison table as a NEW frame instead of adding columns to val_X.
# Mutating val_X would make this cell fail on a re-run (predict would receive the
# extra columns) and writes into a frame derived from the split.
val_X_results = val_X.assign(actual_price=val_y, predictions=predicted)
val_X_results



# RESULTS

In [None]:
## R-SQUARED
# Score the fitted model on the training split.
r2 = model.score(X=train_X, y=train_y)
r2


In [None]:
## ADJUSTED R-SQUARED
# Training-split R-squared corrected for the number of feature columns.
ar2 = adj_r2(x=train_X, y=train_y)
ar2

In [None]:
## MEAN ABSOLUTE ERROR
from sklearn.metrics import mean_absolute_error

# Average absolute gap between the validation prices and the predictions.
mae = mean_absolute_error(y_true=val_y, y_pred=predicted)
mae

In [None]:
# One-row summary table of the Regression 1 metrics.
report = pd.DataFrame({
    'R-SQUARED': [r2],
    'ADJUSTED R-SQ': [ar2],
    'MEAN ABS. ERROR': [mae],
})
report

**CONCLUSION**

The model gives a reasonable prediction about the car prices based on
* Car Body
* Engine Size
* Peak Rpm

**WHAT NEXT**

However it remains to be seen if there are other sets of features which can give a better model for this prediction. Our main aim is to increase the R-Squared and Adjusted R-Squared but reduce the mean absolute error.
To fulfill our purpose we first perform a regression taking all the variables in the dataset. We then check the result of this regression and record it.

# REGRESSION 2 (CONSIDERING MANY MORE VARIABLES AND GRADUALLY NARROWING DOWN TO 5 MOST ESSENTIAL)

In [None]:
## REGRESSION WITH ALL VARIABLES-21 (EXCEPT CAR_ID CAR_COMPANY AND ENGINE_TYPE; DUE TO ANOMALY IN DATA)
from sklearn.model_selection import train_test_split

## SPLITTING DATASET INTO TRAINING AND VALIDATION PARTS
# random_state is pinned so the same rows land in each part on every run.
broad_split = train_test_split(Z, y, random_state=1)
train_Z, val_Z, train_y, val_y = broad_split

In [None]:
## PERFORMING REGRESSION
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the 21-feature training set.
model = LinearRegression()
model.fit(train_Z, train_y)

# Predict prices for the held-out validation rows.
predicted2 = model.predict(val_Z)

# Build the comparison table as a NEW frame instead of adding columns to val_Z.
# Mutating val_Z would make this cell fail on a re-run (predict would receive the
# extra columns) and writes into a frame derived from the split.
val_Z_results = val_Z.assign(actual_price=val_y, predictions=predicted2)
val_Z_results

# RESULTS

In [None]:
## R-SQUARED
# Score the fitted 21-feature model on the training split.
r2 = model.score(X=train_Z, y=train_y)


In [None]:
## ADJUSTED R-SQUARED
# Training-split R-squared corrected for the number of feature columns.
ar2 = adj_r2(x=train_Z, y=train_y)

In [None]:
## MEAN ABSOLUTE ERROR
from sklearn.metrics import mean_absolute_error

# Average absolute gap between the validation prices and the predictions.
mae2 = mean_absolute_error(y_true=val_y, y_pred=predicted2)
mae2

In [None]:
# One-row summary table of the Regression 2 metrics.
report = pd.DataFrame({
    'R-SQUARED': [r2],
    'ADJUSTED R-SQ': [ar2],
    'MEAN ABS. ERROR': [mae2],
})
report

**CONCLUSION**

Taking all the variables into account, it is clearly visible that we have managed to improve our model. R-Squared and Adjusted R-Squared values have increased significantly and Mean Absolute Error has decreased by roughly 400.

**WHAT NEXT**

However taking 21 features into consideration will be a big task. Our next goal is to determine the 5 most significant features that can be used to create an accurate model to predict the price of the cars. To achieve this, we will calculate the p-values of each feature separately with respect to the car price.

Features having p-values > 0.05 can easily be deemed insignificant.

We then collect the features for which the p-values are very small (features of greater significance).

The p-values are then compared to a greater number of decimal places to determine the 5 smallest p-values and, correspondingly, the 5 most significant features affecting the car prices.

We then construct a final regression model with these 5 features only.


# UNIVARIATE P VALUES FOR DETERMINING SIGNIFICANCE OF VARIABLES

In [None]:
## USING F_REGRESSION FOR CALCULATING P-VALUES
from sklearn.feature_selection import f_regression

# f_regression returns the F-statistics and the p-values together, so run it
# once and unpack both instead of calling it twice and discarding one result.
f_statistics, p_values = f_regression(train_Z, train_y)
p_values.round(27)

In [None]:
# Pair every training feature name with its rounded univariate p-value.
cars = pd.DataFrame({
    'Features': train_Z.columns.values,
    'p-values': p_values.round(27),
})
cars

# RESULTS

**FROM THIS TABLE, WE CONCLUDE THE 5 MOST SIGNIFICANT FEATURES TO BE**
* Engine Size(enginesize)
* Number of Cylinders(cylindernumber)
* Type of Drivewheel(drivewheel)
* Weight of Car without Occupants(curbweight)
* Horsepower(horsepower)

# REGRESSION 3 (CONSIDERING ONLY THE SIGNIFICANT NUMERIC VARIABLES)

Regression 3 is performed with only the 5 significant features.

In [None]:
## PREDICTION TARGET
y = data['price']

## FEATURES
significant_features = ['enginesize', 'cylindernumber', 'drivewheel', 'curbweight', 'horsepower']
A = data[significant_features]

In [None]:
## SPLITTING THE DATASET INTO TRAINING AND VALIDATION PARTS
from sklearn.model_selection import train_test_split

# random_state is pinned so the same rows land in each part on every run.
significant_split = train_test_split(A, y, random_state=1)
train_A, val_A, train_y, val_y = significant_split

In [None]:
## PERFORMING REGRESSION
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the five selected training features.
model = LinearRegression()
model.fit(train_A, train_y)

# Predict prices for the held-out validation rows.
predicted3 = model.predict(val_A)

# Build the comparison table as a NEW frame instead of adding columns to val_A.
# Mutating val_A would make this cell fail on a re-run (predict would receive the
# extra columns) and writes into a frame derived from the split.
val_A_results = val_A.assign(actual_price=val_y, predictions=predicted3)
val_A_results

# RESULTS

In [None]:
## R-SQUARED
# Score the fitted five-feature model on the training split.
r2 = model.score(X=train_A, y=train_y)


In [None]:
## ADJUSTED R-SQUARED
# Training-split R-squared corrected for the number of feature columns.
ar2 = adj_r2(x=train_A, y=train_y)

In [None]:
## MEAN ABSOLUTE ERROR
from sklearn.metrics import mean_absolute_error

# Average absolute gap between the validation prices and the predictions.
mae3 = mean_absolute_error(y_true=val_y, y_pred=predicted3)
mae3

In [None]:
# One-row summary table of the Regression 3 metrics.
report = pd.DataFrame({
    'R-SQUARED': [r2],
    'ADJUSTED R-SQ': [ar2],
    'MEAN ABS. ERROR': [mae3],
})
report

**FINAL CONCLUSION**

Reducing the number of features is bound to reduce how well the model fits the training data. This is evident from the decrease in the values of R-Squared and Adjusted R-Squared, both of which are computed on the training split. However, the reduced values of these statistics are still more than 80%, which is reasonably good and also better than the values obtained with the three variables considered in Task 1.

It is also observed that Mean Absolute Error has reduced further as compared to Regression 2, thus the final model (Regression 3) can be considered to be a good model for predicting the price of cars without considering all the 26 features. The predictions are reasonably accurate.

Also the motor company should focus on these 5 main features for predicting the price of the cars:
* Engine Size(enginesize)
* Number of Cylinders(cylindernumber)
* Type of Drivewheel(drivewheel)
* Weight of Car without Occupants(curbweight)
* Horsepower(horsepower)