In [None]:
# Import Required Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# import warnings
# warnings.filterwarnings('ignore')

In [None]:
# Load the raw car-price data. The path is relative, so the CSV must be in
# the notebook's working directory.
car_price_df = pd.read_csv('CarPrice_Assignment.csv')

In [None]:
# Preview the first rows to sanity-check the load.
car_price_df.head(5)

In [None]:
# (rows, columns) of the raw data.
car_price_df.shape

In [None]:
# Per-column dtype and non-null count; used to check for missing values.
car_price_df.info()

#### No column has missing values

In [None]:
# Summary statistics of the numeric columns.
car_price_df.describe()

In [None]:
# car_ID is dropped because it is not required for the analysis.
car_price_df = car_price_df.drop(columns='car_ID')

In [None]:
# Numerical feature columns: every int64/float64 column except 'symboling'.
num_columns = [
    name
    for name in car_price_df.columns
    if name != 'symboling' and car_price_df[name].dtype in ('float64', 'int64')
]

In [None]:
# Number of numerical columns selected above.
len(num_columns)

### Checking outliers

In [None]:
# One box plot per numerical column, laid out three per row.
# The row count is derived from the number of columns so the grid never
# overflows (a fixed 5x3 grid raises as soon as there are more than 15 columns).
plots_per_row = 3
n_rows = max(1, -(-len(num_columns) // plots_per_row))  # ceiling division

# 6 inches of height per row keeps the original 18x30 size for a 5-row grid.
fig = plt.figure(figsize=(18, 6 * n_rows))
for position, col in enumerate(num_columns, start=1):
    ax = fig.add_subplot(n_rows, plots_per_row, position)
    sns.boxplot(y=car_price_df[col], ax=ax)

plt.show()

#### Derive Car Company and Model from CarName

In [None]:
# Split CarName on the first space only: column 0 holds the company and
# column 1 (when present) the rest of the name.
carname = car_price_df['CarName'].str.split(' ', n=1, expand=True)
company = carname[0]

# CarName is no longer needed once the company has been extracted.
# Rebind instead of dropping in place, and preview only the first rows
# rather than rendering the whole DataFrame.
car_price_df = car_price_df.drop(columns='CarName')
car_price_df.head()

### Analyzing Car Company

In [None]:
# Count cars per company, sorted alphabetically so that near-duplicate
# spellings end up next to each other.
company.value_counts().sort_index()

Here you can see that there are some issues in the data: a few companies appear under several different spellings.
So let's fix those.

<table>
    <tr>
        <td>List of companies</td>
        <td>Final Name</td>
    </tr>
    <tr>
        <td>maxda, mazda</td>
        <td>mazda</td>
    </tr>
    <tr>
        <td>Nissan, nissan</td>
        <td>nissan</td>
    </tr>
    <tr>
        <td>porcshce, porsche</td>
        <td>porsche</td>
    </tr>
    <tr>
        <td>toyota, toyouta</td>
        <td>toyota</td>
    </tr>
    <tr>
        <td>vokswagen, volkswagen, vw</td>
        <td>volkswagen</td>
    </tr>
</table>  

In [None]:
# Map every misspelt or alternative company name onto its canonical spelling.
company_name_fixes = {
    'maxda': 'mazda',
    'Nissan': 'nissan',
    'porcshce': 'porsche',
    'toyouta': 'toyota',
    'vokswagen': 'volkswagen',
    'vw': 'volkswagen',
}
company = company.replace(to_replace=company_name_fixes)

# Re-check the counts to confirm the duplicates are gone.
company.value_counts().sort_index()

In [None]:
# Attach the cleaned company name as a new categorical column.
car_price_df['Company'] = company

# Preview only the first rows instead of rendering the whole DataFrame.
car_price_df.head()

### Dummy Variables


In [None]:
# Columns to one-hot encode. 'symboling' is numeric in the raw data but is
# treated as a categorical variable here.
cat_columns = ['symboling', 'fueltype', 'aspiration', 'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'enginetype',
              'cylindernumber', 'fuelsystem', 'Company']

In [None]:
# One-hot encode every categorical column in a single call instead of growing
# the frame with concat inside a loop.
#   * columns=cat_columns encodes exactly these columns (including the
#     integer-coded 'symboling') and removes the originals.
#   * The default prefix names each dummy '<column>_<level>', and the dummies
#     are appended after the remaining columns in cat_columns order.
#   * drop_first=True drops the first level of each variable.
#   * dtype=int keeps the dummies numeric. pandas >= 2.0 defaults to bool, and a
#     bool/float mix turns the design matrix into object dtype, which
#     statsmodels' OLS rejects.
car_price_df = pd.get_dummies(car_price_df, columns=cat_columns, drop_first=True, dtype=int)

car_price_df.head(5)

## Splitting the data into Training and Test data set

In [None]:
# import required library
from sklearn.model_selection import train_test_split

In [None]:
# 70/30 split; random_state is fixed so the split is reproducible.
df_train, df_test = train_test_split(car_price_df, train_size = 0.7, test_size = 0.3, random_state = 100)

## Rescaling the features

In [None]:
from sklearn.preprocessing import MinMaxScaler
# A single scaler instance: fitted on the training set, then reused
# (transform only) on the test set.
scaler = MinMaxScaler()

In [None]:
# Work on an explicit copy: df_train is a row subset produced by
# train_test_split, and assigning into it may emit SettingWithCopyWarning.
df_train = df_train.copy()

# Fit the scaler on the training rows only, then replace the numerical columns.
# Plain column assignment swaps the columns for the scaled floats, whereas
# `.loc[:, cols] = ...` writes into the existing columns, and writing floats
# into int64 columns is a deprecated upcast in recent pandas.
# Note that num_columns includes 'price', so the target is scaled as well.
# This cell is not idempotent: re-running it rescales already-scaled data.
df_train[num_columns] = scaler.fit_transform(df_train[num_columns])
df_train.head()

In [None]:
# Pairwise correlations between all columns of the training set.
corr = df_train.corr()
# NOTE(review): the heatmap below was left disabled, presumably because the
# matrix is too large to read as a plot - confirm before deleting it.
#plt.figure(figsize=(200,300))
#sns.heatmap(corr, annot = True)
#plt.show()
print(corr)

## Dividing into X and y set for model building

In [None]:
# pop() removes 'price' from df_train in place and returns it as the target.
y_train = df_train.pop('price')
# X_train is the same object as df_train (not a copy), now without 'price'.
# This cell is not idempotent: a second run raises KeyError on 'price'.
X_train = df_train

## Building Model

### RFE

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

In [None]:
# RFE fits its own clone of the estimator, so the estimator is passed in
# unfitted; fitting it beforehand was redundant work.
lm = LinearRegression()

# Let's start with 12 features.
# n_features_to_select must be passed by keyword: the positional form was
# deprecated in scikit-learn 0.24 and raises TypeError from 1.0 onwards.
rfe = RFE(lm, n_features_to_select=12)
rfe = rfe.fit(X_train, y_train)

# (column, selected?, ranking) - selected features have ranking 1.
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
# Columns selected through RFE (boolean mask rfe.support_ applied to the
# column index). `col` is narrowed step by step in the model cells below.

col = X_train.columns[rfe.support_]
col

## Building model using statsmodels

In [None]:
import statsmodels.api as sm

In [None]:
def build_model(columns):
    """Fit an OLS regression of ``y_train`` on the given ``X_train`` columns.

    Reads the notebook-level ``X_train`` and ``y_train`` and prints the
    statsmodels summary of the fitted model.

    Parameters
    ----------
    columns : list-like of column labels
        Features to select from ``X_train``.

    Returns
    -------
    tuple
        ``(design_matrix, results)``: the selected features with the constant
        column added by ``sm.add_constant``, and the fitted OLS results.
    """
    design_matrix = sm.add_constant(X_train[columns])
    results = sm.OLS(y_train, design_matrix).fit()
    print(results.summary())
    return design_matrix, results

In [None]:
# Model 1: all RFE-selected columns. X_train_new (design matrix incl. constant)
# and lm (fitted results) are overwritten by each later model cell.
X_train_new, lm = build_model(col)

Here R-squared is very high, which may be a case of **overfitting**.

### Checking VIF

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
def calc_vif(features=None):
    """Print the variance inflation factor (VIF) of each column, highest first.

    Parameters
    ----------
    features : pandas.DataFrame, optional
        Design matrix to analyse. When omitted, the notebook-level
        ``X_train_new`` (the design matrix of the most recently built model)
        is used, so existing ``calc_vif()`` calls behave as before.

    Notes
    -----
    Every column of the matrix is reported, including a constant column if
    the matrix contains one. VIF values are rounded to two decimals.
    """
    if features is None:
        features = X_train_new

    # Materialise the array once instead of on every loop iteration.
    design = features.values
    vif = pd.DataFrame({
        'Features': features.columns,
        'VIF': [variance_inflation_factor(design, i) for i in range(design.shape[1])],
    })
    vif['VIF'] = vif['VIF'].round(2)
    vif = vif.sort_values(by='VIF', ascending=False)
    print(vif)

In [None]:
# VIF for the current X_train_new (design matrix of the model built above).
calc_vif()

Here you can see that enginetype_rotor has a low p-value but an infinite VIF, so let's remove it from the features.

### Model - 2

In [None]:
# Remove the feature from the selected columns and refit.
# Not idempotent: re-running raises KeyError because the label is already gone.
col = col.drop('enginetype_rotor')
X_train_new, lm = build_model(col)

Even though we dropped enginetype_rotor, R-squared is almost the same.
Let's check the VIF again.

In [None]:
# VIF for the current X_train_new (design matrix of the model built above).
calc_vif()

curbweight also has a high VIF. Its p-value is low, but the other features have even lower p-values, so let's drop curbweight.

### Model-3 

In [None]:
# Remove the feature from the selected columns and refit.
# Not idempotent: re-running raises KeyError because the label is already gone.
col = col.drop('curbweight')
X_train_new, lm = build_model(col)

Again, there is not much effect on the R-squared value, which is good.

In [None]:
# VIF for the current X_train_new (design matrix of the model built above).
calc_vif()

'Company_porsche' has a low VIF, but its p-value is high compared to the others, so let's drop it and check the results.

### Model-4

In [None]:
# Remove the feature from the selected columns and refit.
# Not idempotent: re-running raises KeyError because the label is already gone.
col = col.drop('Company_porsche')
X_train_new, lm = build_model(col)

**Here, too, there is not much effect on R-squared, which is good**

In [None]:
# VIF for the current X_train_new (design matrix of the model built above).
calc_vif()

## Residual Analysis

In [None]:
# In-sample predictions of the final model, on the same scaled units as y_train.
y_train_price = lm.predict(X_train_new)

In [None]:
# Distribution of the training residuals (actual - predicted).
# sns.distplot is deprecated (since seaborn 0.11) and slated for removal.
# histplot with stat='density' and kde=True draws the equivalent
# density-normalised histogram with a KDE overlay.
fig, ax = plt.subplots()
sns.histplot(y_train - y_train_price, bins=20, kde=True, stat='density', ax=ax)
fig.suptitle('Error Terms', fontsize=20)
ax.set_xlabel('Errors', fontsize=18)
plt.show()

**Errors are normally distributed**

### Making Predictions

In [None]:
# Work on an explicit copy: df_test is a row subset produced by
# train_test_split, and assigning into it may emit SettingWithCopyWarning.
df_test = df_test.copy()

# transform only (no fit): the test set is scaled with the min/max learned from
# the training set, so no test information leaks into the scaler.
# This cell is not idempotent: re-running it rescales already-scaled data.
df_test[num_columns] = scaler.transform(df_test[num_columns])

In [None]:
# Summary statistics of the scaled test set. The scaler was fitted on the
# training rows, so test values are not guaranteed to stay within [0, 1].
df_test.describe()

In [None]:
# Dividing into X_test and y_test.
# pop() removes 'price' from df_test in place; X_test is the same object as
# df_test (not a copy). Not idempotent: a second run raises KeyError.
y_test = df_test.pop('price')
X_test = df_test

In [None]:
# Keep only the features of the final model, in the same order as in training.
X_test = X_test[col]

# Always add the intercept column. With the default has_constant='skip',
# add_constant silently omits it when some selected feature happens to be
# constant and non-zero across the test rows, and the column count would then
# no longer match the fitted model.
X_test = sm.add_constant(X_test, has_constant='add')

In [None]:
# Predictions of the final model on the test set (same scaled units as y_test).
y_pred = lm.predict(X_test)

## Model Prediction


Let's plot a graph of the actual versus the predicted values.