# <center><u> CarPrice Data Set</u></center>

# 1. Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode(connected = True)
import plotly.graph_objs as go
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import plotly.figure_factory as ff
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score,recall_score
import warnings
warnings.filterwarnings("ignore")

## 2. Data Loading

In [None]:
carprice = pd.read_csv('../input/car-price-prediction/CarPrice_Assignment.csv')

## 3. Data Description

In [None]:
carprice.head()

In [None]:
# Quick structural summary of the raw frame: size, column names and total missing values.
n_rows, n_cols = carprice.shape
feature_names = carprice.columns.tolist()
total_nulls = carprice.isnull().sum().values.sum()

print('\033[1mRows :\033[0m', n_rows)
print('\033[1m\nColumns :\033[0m', n_cols)
print('\033[1m\nFeatures :\033[0m', feature_names)
print('\033[1m\nNull Values :\033[0m', total_nulls)

In [None]:
carprice.info()

In [None]:
carprice.describe()

In [None]:
import scipy.stats as stats

## 4. Bivariate Analysis (for Feature Selection)

### 4.1. Distribution of Price for Each Categorical Attribute

In [None]:
# Box plot of price for every categorical (object-dtype) column.
# A new figure is opened inside the loop so that every plot gets the 8x8 size;
# a single plt.figure() before the loop would only size the first plot.
for column in carprice.columns:
    if carprice[column].dtype == 'O':
        plt.figure(figsize=(8, 8))
        # Column names + data= is the form seaborn documents for long-form data.
        sns.boxplot(x=column, y='price', data=carprice)
        plt.show()
            

From the above box plots we can see that there is not much difference in price when the door numbers are two or four. Therefore, assuming a very low correlation, we will drop the door number column. For the car name column we can't clearly see the labels, but the plot somewhat depicts a high correlation with price.

To visualize this better lets split the column and also rename the values in it.

In [None]:
carprice.columns

In [None]:
carprice['CarName']

Looking at the values we can see spelling mistakes in the company names as well as car model names. We will therefore split the column into company name and car model name.

In [None]:
x = carprice['CarName'].str.split(" ", expand=True)

In [None]:
carprice['Company'] = x[0].values

In [None]:
carprice['Company'].value_counts()

In [None]:
# Map misspelled or abbreviated manufacturer names to one canonical spelling.
company_fixes = {
    'toyouta': 'Toyota',
    'vw': 'Volkswagen',
    'vokswagen': 'Volkswagen',
    'maxda': 'Mazda',
    'porcshce': 'Porsche',
}
carprice['Company'] = carprice['Company'].replace(company_fixes)

In [None]:
carprice['Company'] = carprice['Company'].str.title()

In [None]:
# Price spread per manufacturer.
# x and y are given by keyword as column names: in recent seaborn releases every
# argument after `data` is keyword-only, so a positional Series combined with
# data= is rejected there.
plt.figure(figsize=(10, 8))
sns.boxplot(x='Company', y='price', data=carprice)
plt.xticks(rotation=60)
plt.show()

From the above visualizations we can clearly see how widely the prices vary from company to company. We can therefore use this feature to train our model and predict price based on company names rather than car names (models).

In [None]:
carprice.columns

In [None]:
df =carprice.copy()

## 5. Feature Selection

### 5.1. Pearson Correlation

In [None]:
# Pearson correlation between the numeric columns only.
# The frame still holds text columns at this point (e.g. CarName, Company);
# newer pandas raises on those instead of silently skipping them, so the
# numeric columns are selected explicitly before calling corr().
correlation = carprice.select_dtypes(include=[np.number]).corr()

In [None]:
# Heatmap of absolute correlations, so strong negative and strong positive
# relationships are coloured alike.
plt.figure(figsize=(15, 8))
abs_correlation = correlation.abs()
sns.heatmap(abs_correlation, annot=True, cmap='coolwarm')

In [None]:
correlation.price

From the correlation matrix we found that carheight, stroke, compressionratio and peakrpm have no noticeable effect on the price of the cars; therefore, we will drop these columns. The car_ID column is also irrelevant for the prediction of car price. From the box plot visualizations above we saw that the CarName and doornumber attributes can also be dropped.

In [None]:
# Remove the identifier, the numeric columns judged uninformative from the
# correlation matrix, and the categorical columns ruled out by the box plots.
columns_to_drop = [
    'car_ID',
    'carheight',
    'stroke',
    'compressionratio',
    'peakrpm',
    'CarName',
    'doornumber',
]
carprice = carprice.drop(columns=columns_to_drop)

In [None]:
# Keep an independent copy of the price column as it is right now (before the
# normalisation step below); the Random Forest Classifier section thresholds it
# to build class labels.
labeel_for_DS = carprice['price'].copy(deep=True)

## 6. Normalization of Numerical Features

In [None]:
from scipy.stats import zscore

# Replace every numeric column with its z-score, computed column by column.
# NOTE(review): the selection includes the target `price`, and the scaling runs
# on the full frame before the train/test split, so test rows contribute to the
# scaling statistics — confirm this is intended.
numeric_cols = carprice.select_dtypes(include=[np.number]).columns
standardised = carprice[numeric_cols].apply(zscore)
carprice[numeric_cols] = standardised

In [None]:
carprice.head()

In [None]:
carprice.info()

## 7. Feature Encoding

In [None]:
# Show the distinct values of each remaining text (object-dtype) column.
object_columns = [name for name in carprice.columns if carprice[name].dtype == 'O']
for name in object_columns:
    print(f'{name} : ', carprice[name].unique())

### 7.1. Manual Encoding

For categorical features that have two unique values, we will  manually encode them.

In [None]:
# Binary encoding of fuel type: gas -> 1, diesel -> 0.
fuel_codes = {'gas': 1, 'diesel': 0}
carprice['fueltype'] = carprice['fueltype'].map(fuel_codes)

In [None]:
# Binary encoding of aspiration: std -> 1, turbo -> 0.
aspiration_codes = {'std': 1, 'turbo': 0}
carprice['aspiration'] = carprice['aspiration'].map(aspiration_codes)

In [None]:
# Binary encoding of engine location: front -> 1, rear -> 0.
location_codes = {'front': 1, 'rear': 0}
carprice['enginelocation'] = carprice['enginelocation'].map(location_codes)

In [None]:
carprice.head()

### 7.2. One Hot Encoding

For categorical variables having more than two unique values, we will one-hot encode those features.

In [None]:
# One-hot encode every remaining text column that has more than two categories.
# The dummy columns are held in a local variable: writing them to
# `carprice.Dummies` would create a plain attribute on the DataFrame object
# (pandas warns about this) rather than a column.
multi_category_cols = [
    col for col in carprice.columns
    if carprice[col].dtype == 'O' and carprice[col].nunique() > 2
]
for col in multi_category_cols:
    dummies = pd.get_dummies(carprice[col])
    # Append the indicator columns, then remove the source text column.
    carprice = pd.concat([carprice, dummies], axis=1).drop(columns=[col])
        

In [None]:
carprice.head()

In [None]:
df2=carprice.copy()

## 8. Machine Learning

In [None]:
# Separate the frame into the regression target and the predictor columns.
target_column = 'price'

# Target variable
y = carprice[target_column]

# Features: everything except the target
x = carprice.drop(columns=target_column)

### 8.1 Transforming Features and Target Variables into Arrays

We need to convert both x and y into arrays before applying the Linear Regression Model

In [None]:
from sklearn.feature_extraction import DictVectorizer

# Turn each row into a {column: value} record, then let DictVectorizer build a
# dense feature matrix from those records.
row_records = x.to_dict(orient='records')
vec = DictVectorizer()
x = vec.fit_transform(row_records).toarray()

# Target as a plain NumPy array.
y = np.asarray(y)


<b> Before splitting the data, it's worth mentioning here that we haven't removed the outliers. I think the outliers here may represent a real picture, e.g. prices of some cars may in the real world be very high. Therefore I am of the opinion that removing outliers from the data set at hand would not be a wise thing to do.</b>

### 8.2 Train Test Split

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(x,y, test_size=0.2, random_state=42)

In [None]:
print("xtrain shape : ", xtrain.shape," :: xtest shape  : ", xtest.shape," :: ytrain shape : ", ytrain.shape," :: ytest shape  : ", ytest.shape) 

## 8.3 Linear Regression Model

In [None]:
regressor = LinearRegression()

### 8.3.1 Fitting Linear Regression Model

In [None]:
# Train on the training split, then predict the held-out split.
# fit() returns the estimator itself, so both steps are chained.
y_pred_linear = regressor.fit(xtrain, ytrain).predict(xtest)

### 8.3.2 Linear Regression Evaluation

In [None]:
# Scatter of true vs. predicted target values on the test split.
plt.style.use('ggplot')
fig, ax = plt.subplots()
ax.scatter(ytest, y_pred_linear, c='blue')
ax.set_xlabel("Expected")
ax.set_ylabel("Predicted value")
ax.set_title("True value vs predicted value : Linear Regression")
plt.show()

In [None]:
# Test-set error metrics for the linear model; the MSE is computed once and
# reused for the RMSE.
linear_mse = metrics.mean_squared_error(ytest, y_pred_linear)
linear_mae = metrics.mean_absolute_error(ytest, y_pred_linear)

print('\033[1mMean Squared Error is:\033[0m', linear_mse)
print('\033[1mMean Absolute Error is:\033[0m', linear_mae)
print('\033[1mRoot Mean Squared Error is:\033[0m', np.sqrt(linear_mse))

In [None]:
regressor.coef_

#### 8.3.2.1 Visualization of Actual vs Predicted Values

In [None]:
df = pd.DataFrame({'Actual': ytest.flatten(), 'Predicted': y_pred_linear.flatten()})

In [None]:
df.head(10)

In [None]:
# Side-by-side bars of actual vs. predicted values for the first 25 test rows.
df = df.head(25)
ax = df.plot(kind='bar', figsize=(10, 5))
ax.grid(which='major', linestyle=':', linewidth='0.99', color='black')
ax.set_xlabel("No. of Records")
ax.set_ylabel("Values")
plt.show()

## 8.4 Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

### 8.4.1 Fitting Random Forest Regressor

In [None]:
# 1000-tree random forest with a fixed seed: train on the training split and
# predict the held-out split (fit() returns the estimator, so the calls chain).
rfr = RandomForestRegressor(n_estimators=1000, random_state=42)
y_pred_rfr = rfr.fit(xtrain, ytrain).predict(xtest)

### 8.4.2 Random Forest Regressor Evaluation

In [None]:
# Test-set error metrics for the random forest; the MSE is computed once and
# reused for the RMSE.
rfr_mae = metrics.mean_absolute_error(ytest, y_pred_rfr)
rfr_mse = metrics.mean_squared_error(ytest, y_pred_rfr)

print('\033[1mMean Absolute Error:\033[0m', rfr_mae)
print('\033[1mMean Squared Error:\033[0m', rfr_mse)
print('\033[1mRoot Mean Square Error:\033[0m', np.sqrt(rfr_mse))

## 8.5 Ridge Regression

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

### 8.5.1 Applying Ridge Regression

In [None]:
# 5-fold cross-validated search over the Ridge penalty strength, scored by
# negative mean squared error.
parameters = {
    'alpha': [1e-15, 1e-10, 1e-8, 1e-5, 1e-4, 1e-3, 1e-2,
              1, 5, 10, 20, 30, 40, 45, 50, 55, 60, 100]
}
ridge = Ridge()
ridge_regressor = GridSearchCV(
    estimator=ridge,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)
ridge_regressor.fit(xtrain, ytrain);

In [None]:
print("The best fit alpha value is found out to be :" ,ridge_regressor.best_params_)
print("\nUsing ",ridge_regressor.best_params_, " the negative mean squared error is: ", ridge_regressor.best_score_)

In [None]:
#Model Prediction
y_pred_ridge = ridge_regressor.predict(xtest)

### 8.5.2 Ridge Regressor Evaluation

In [None]:
# Report the alpha actually selected by the grid search rather than a
# hard-coded value, which goes stale whenever the grid, data or folds change.
ridge_best_alpha = ridge_regressor.best_params_['alpha']
print(f"\033[1mUSING ALPHA ={ridge_best_alpha}")
print("\nMean Sqaured Error for Ridge Regression is : \033[0m", metrics.mean_squared_error(ytest, y_pred_ridge))

#### Actual vs Predicted

In [None]:
# Ridge predictions and actual test values, drawn against test-row position.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(y_pred_ridge)
ax.plot(ytest)
ax.legend(["Predicted", "Actual"])
plt.show()

## 8.6 Lasso Regression

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

### 8.6.1 Applying Lasso Regression

In [None]:
# 3-fold cross-validated search over the Lasso penalty strength, scored by
# negative mean squared error.
# The grid lives under its own name and that same name is handed to
# GridSearchCV, so the search cannot silently pick up the Ridge grid
# (`parameters`) left over from the Ridge section.
lasso = Lasso()
lasso_parameters = {
    'alpha': [1e-15, 1e-13, 1e-10, 1e-8, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1,
              1, 5, 10, 20, 30, 40, 45, 50, 55, 60, 100]
}
lasso_regressor = GridSearchCV(lasso, lasso_parameters, scoring='neg_mean_squared_error', cv=3)
lasso_regressor.fit(xtrain, ytrain);

In [None]:
print("The best fit alpha value is found out to be :" ,lasso_regressor.best_params_)
print("\nUsing ",lasso_regressor.best_params_, " the negative mean squared error is: ", lasso_regressor.best_score_)

In [None]:
#Prediction using Lasso
y_pred_lasso = lasso_regressor.predict(xtest)

### 8.6.2 Lasso Regressor Evaluation

In [None]:
# Report the alpha actually selected by the grid search rather than a
# hard-coded value, which goes stale whenever the grid, data or folds change.
lasso_best_alpha = lasso_regressor.best_params_['alpha']
print(f"\033[1mUSING ALPHA ={lasso_best_alpha}")
print("\nMean Sqaured Error for Lasso is : \033[0m", metrics.mean_squared_error(ytest, y_pred_lasso))

#### Actual vs Predicted Graph

In [None]:
# Lasso predictions and actual test values, drawn against test-row position.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(y_pred_lasso)
ax.plot(ytest)
ax.legend(["Predicted", "Actual"])
plt.show()

## 8.7. Elastic Net Regression 

In [None]:
from sklearn.linear_model import ElasticNet

### 8.7.1. Applying Elastic Net Regression

In [None]:
# 5-fold cross-validated search over the Elastic Net penalty strength, scored
# by negative mean squared error.
# The grid lives under its own name and that same name is handed to
# GridSearchCV, so the search cannot silently pick up the Ridge grid
# (`parameters`) left over from the Ridge section.
elastic = ElasticNet()
elastic_parameters = {
    'alpha': [1e-15, 1e-13, 1e-10, 1e-8, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1,
              1, 5, 10, 20, 30, 40, 45, 50, 55, 60, 100]
}
elastic_regressor = GridSearchCV(elastic, elastic_parameters, scoring='neg_mean_squared_error', cv=5)
elastic_regressor.fit(xtrain, ytrain);

In [None]:
print("The best fit alpha value is found out to be :" ,elastic_regressor.best_params_)
print("\nUsing ",elastic_regressor.best_params_, " the negative mean squared error is: ", elastic_regressor.best_score_)

In [None]:
y_pred_elastic = elastic_regressor.predict(xtest)

### 8.7.2. ElasticNet Regressor Evaluation

In [None]:
# Report the alpha actually selected by the grid search rather than a
# hard-coded value, which goes stale whenever the grid, data or folds change.
elastic_best_alpha = elastic_regressor.best_params_['alpha']
print(f"\033[1mUSING ALPHA ={elastic_best_alpha}")
print("\nMean Sqaured Error for Elastic Net Regression is : \033[0m", metrics.mean_squared_error(ytest, y_pred_elastic))

#### Actual vs Prediction

In [None]:
# Elastic Net predictions and actual test values, drawn against test-row position.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(y_pred_elastic)
ax.plot(ytest)
ax.legend(["Predicted", "Actual"])
plt.show()

## 8.8 Random Forest Classifier

### 8.8.1 Building Classifier for Random Forest

In [None]:
# Binary target built from the saved price column:
# 1 ("High") when price > 11000, otherwise 0 ("Low").
price_is_high = labeel_for_DS > 11000
y2 = np.where(price_is_high, 1, 0)

###  8.8.2 Train Test Split

In [None]:
xtrain2,xtest2,ytrain2,ytest2 = train_test_split(x,y2,test_size=0.3, random_state=42 )

In [None]:
print(xtrain2.shape,xtest2.shape,ytrain2.shape,ytest2.shape)

### 8.8.3 Applying Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the classification report and confusion matrix are
# reproducible on re-run, in line with the seeded splits and the seeded
# RandomForestRegressor used earlier.
rfc = RandomForestClassifier(random_state=42)
model = rfc.fit(xtrain2, ytrain2)
y_pred_rfc = rfc.predict(xtest2)

### 8.8.4 Random Forest Classifier Evaluation:

#### Classification Report

In [None]:
print ("\n \033[1m Classification report : \033[0m\n",classification_report(ytest2, y_pred_rfc))
print ("\n \033[1m Accuracy : \033[0m\n",metrics.accuracy_score(ytest2, y_pred_rfc))

#### Confusion Matrix

In [None]:
plt.style.use('ggplot')

# confusion_matrix takes (y_true, y_pred): rows are actual classes, columns are
# predicted classes, so no transpose is needed before plotting.
# The target was encoded as 0 = Low, 1 = High; `labels` pins that order, and
# the tick labels follow it (Low first, High second).
class_ids = [0, 1]
x_y_labels = ['Low', 'High']
cf_matrix = confusion_matrix(ytest2, y_pred_rfc, labels=class_ids)

# fmt='d' prints the cell counts as plain integers.
sns.heatmap(cf_matrix, square=True, annot=True, fmt='d', xticklabels=x_y_labels, yticklabels=x_y_labels)
plt.xlabel('Predicted label')
plt.ylabel('Actual label');

## 8.9. Model Metrics Evaluation

In [None]:
def model_report(model, training_x, testing_x, training_y, testing_y, name):
    """Fit `model` on the training data and summarise its test-set errors.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is (re)fitted in place by this call.
    training_x, training_y : features and targets used for fitting.
    testing_x, testing_y : features and targets used for evaluation.
    name : label written to the "Model" column.

    Returns
    -------
    pandas.DataFrame
        One row with the columns "Model", "Mean Square Error",
        "Mean Absolute Error" and "Root Mean Square Error".
    """
    model.fit(training_x, training_y)
    predicted = model.predict(testing_x)

    # The squared error is computed once and reused for its root.
    mse = metrics.mean_squared_error(testing_y, predicted)
    mae = metrics.mean_absolute_error(testing_y, predicted)

    report = {
        "Model": [name],
        "Mean Square Error": [mse],
        "Mean Absolute Error": [mae],
        "Root Mean Square Error": [np.sqrt(mse)],
    }
    return pd.DataFrame(report)

# Evaluate every regressor on the same train/test split.
# Each call refits the estimator it is given (the grid searches included).
model1 = model_report(regressor, xtrain, xtest, ytrain, ytest, "Linear Regression")
model2 = model_report(ridge_regressor, xtrain, xtest, ytrain, ytest, "Ridge Regression")
model3 = model_report(lasso_regressor, xtrain, xtest, ytrain, ytest, "Lasso Regression")
model4 = model_report(elastic_regressor, xtrain, xtest, ytrain, ytest, "Elastic Net Regression")
model5 = model_report(rfr, xtrain, xtest, ytrain, ytest, "Random Forest Regressor")

# Stack the one-row reports into a single frame with a fresh 0..n-1 index.
model_performances = pd.concat(
    [model1, model2, model3, model4, model5],
    axis=0,
    ignore_index=True,
)

# Render the metrics, rounded to four decimals, as a plotly table.
table = ff.create_table(np.round(model_performances, 4))

py.iplot(table)

## <center>---**---End---**---</center>