In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, accuracy_score
from sklearn.feature_selection import RFE
import statsmodels.api as sm 
from statsmodels.stats.outliers_influence import variance_inflation_factor
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
data = pd.read_csv("../input/cars-price-prediction/CarPrice_Assignment.csv")

In [None]:
data.head()

In [None]:
data.info()

In [None]:
data.isnull().sum()

In [None]:
data.columns

In [None]:
# The CarName column contains the car name along with the company name.
# Keep only the leading (space-separated) token as the company, store it in a
# new "Company_Name" column at position 3, and discard the original column.
company_name = data["CarName"].map(lambda full_name: full_name.split(" ")[0])

data.insert(loc=3, column="Company_Name", value=company_name)
data.drop(columns=["CarName"], inplace=True)

In [None]:
# Lets try to find out how many unique companies we are dealing with
print(data.Company_Name.unique())

In [None]:
# wait a min

# As we can see, there are some misspelled company names

# (misspelled -> correct)
# maxda     -> mazda
# Nissan    -> nissan
# porcshce  -> porsche
# toyouta   -> toyota
# vokswagen -> volkswagen
# vw        -> volkswagen

# lets fix the data

# as we can see, for nissan we have letter case difference, lets remove it
data.Company_Name = data.Company_Name.str.lower()

# now let's clear this misspelled names

def fix_car_names(wrong_name, correct_name, frame=None):
    """Replace a misspelled company name in the ``Company_Name`` column.

    Parameters
    ----------
    wrong_name : str
        Value to look for (exact match on the whole cell).
    correct_name : str
        Value to write instead.
    frame : pandas.DataFrame, optional
        Frame to fix. Defaults to the notebook-level ``data`` frame, which is
        what the original two-argument calls operate on.

    The column is re-assigned instead of calling ``replace(..., inplace=True)``
    on ``frame.Company_Name``: an in-place call on a column accessor is a
    chained assignment, which pandas warns about and which does not modify
    the frame when Copy-on-Write is enabled.
    """
    target = data if frame is None else frame
    target["Company_Name"] = target["Company_Name"].replace(wrong_name, correct_name)
    
fix_car_names("maxda","mazda")    
fix_car_names("porcshce","porsche")    
fix_car_names("toyouta","toyota")       
fix_car_names("vokswagen","volkswagen")  
fix_car_names("vw","volkswagen") 

In [None]:
# Now lets check our changes

data.Company_Name.unique()


In [None]:
# Lets check if out dataset contains any Duplicate

data.duplicated().sum()

In [None]:
# Now let's analyze the car price distribution

# One wide figure with two panels, both built from data.price.
plt.figure(figsize=(20,10))

# Left panel: distribution of the price column.
plt.subplot(1,2,1)
plt.title("Distribution of Car Price")
# NOTE(review): sns.distplot is deprecated since seaborn 0.11 and emits a
# warning; sns.histplot(..., kde=True) is the suggested replacement once the
# environment's seaborn version is confirmed to support it.
sns.distplot(data.price)

# Right panel: box plot of the same column to show spread and outliers.
plt.subplot(1,2,2)
plt.title("Car Price Spreading")
sns.boxplot(x=data.price)

In [None]:
data.price.describe(percentiles=[0,0.15,0.25, 0.50,0.60,0.6303,0.70,0.75,0.80,0.85,0.90,0.95,1])

In [None]:
print("Mean of Car Price : {} ".format(np.mean(data.price)))
print("Median of Car Price : {} ".format(np.median(data.price)))

In [None]:
# Observations :

# 1 --> there is a significant difference between mean and median of car price Distribution.

# 2 --> Car prices range between min price = 5118 and max price = 45400.

# 3 --> The plot seems to be right-skewed, meaning that most prices are low (63.03% of car prices are below the mean price).

# 4 --> Prices are spread far from the mean, which indicates high variance in the dataset.

In [None]:
# 2x2 grid of bar charts: number of cars per value of four categorical columns.
plt.figure(figsize=(30,40))

plt.subplot(2,2,1)
company_name = data.Company_Name.value_counts().plot(kind="bar")
plt.title("Company Histogram")
company_name.set(xlabel="Car Company",ylabel = "Cars Sold")

plt.subplot(2,2,2)
fuel_type = data.fueltype.value_counts().plot(kind="bar")
plt.title("Fuel Type Histogram")
# Label the fuel-type axes themselves; the labels used to be applied to the
# company axes (`company_name`), overwriting that panel's x-label and leaving
# this panel unlabeled.
fuel_type.set(xlabel="Fuel type",ylabel = "Cars Sold")


plt.subplot(2,2,3)
body_type = data.carbody.value_counts().plot(kind="bar")
plt.title("Car Body Type Histogram")
body_type.set(xlabel="Body Type",ylabel="Cars Sold")

plt.subplot(2,2,4)
fuel_system = data.fuelsystem.value_counts().plot(kind="bar")
plt.title("Fuel System Histogram")
fuel_system.set(xlabel="Fuel System",ylabel="Cars Sold")

plt.show()

In [None]:
# Observation : 

# 1 --> Toyota sold most cars
# 2 --> Cars with Fuel type as gas is customers favourite choice
# 3 --> Customers favourite car body type is Sedan
# 4 --> MPFI is the most sold Fuel System

In [None]:
    data.head()

In [None]:
plt.figure(figsize=(25,30))
plt.title("Company name vs Price")
sns.boxplot(x = data["Company_Name"], y=data["price"])

In [None]:
# Observations

# 1 --> BMW has a highest price range
# 2 --> Chevrolet is cheap compared to the others
# 3 --> Buick, Jaguar and porsche have high price cars

In [None]:
plt.figure(figsize=(15,8))

plt.subplot(1,2,1)
symbol = data.symboling.value_counts().plot(kind="bar")
plt.title("Symboling Histogram")
symbol.set(xlabel="Symbol", ylabel="Cars sold")

plt.subplot(1,2,2)
plt.title("Symbol vs Price")
sns.boxplot(x = data.symboling, y= data.price)


In [None]:
# Observation :

# 1 --> cars with symbol 0 and 1 are most sold
# 2 --> cars with symbol -1 are available in a large price range than others also this group has highest car price
# 3 --> cars with symbol 1 are cheaper than the others

In [None]:
# Average price per category value, sorted from most to least expensive.
company = pd.DataFrame(data.groupby("Company_Name")["price"].mean().sort_values(ascending=False))

fuel_type = pd.DataFrame(data.groupby("fueltype")["price"].mean().sort_values(ascending=False))

Body_type = pd.DataFrame(data.groupby("carbody")["price"].mean().sort_values(ascending=False))


# DataFrame.plot.bar() opens its own figure, so the size has to be passed to
# each plot; a preceding plt.figure(figsize=...) is not picked up and only
# leaves an empty figure in the cell output.
for avg_price, chart_title in (
    (company, "Company Name VS Avg Price"),
    (fuel_type, "Fuel Type VS Avg Price"),
    (Body_type, "Car Body VS Avg Price"),
):
    avg_price.plot.bar(figsize=(20, 8))
    plt.title(chart_title)
    plt.show()

In [None]:
# Observations

# 1 --> Jaguar, Porsche and Buick have the highest avg price
# 2 --> Diesel cars have high avg prices
# 3 --> Hardtop and convertible body types have the highest avg price

In [None]:
def plot_count(x,fig):
    """Draw a count plot and a price box plot for one categorical column.

    Parameters
    ----------
    x : str
        Name of a column of the notebook-level ``data`` frame.
    fig : int
        1-based slot in a 4x2 subplot grid for the count plot; the box plot
        goes into slot ``fig + 1``. The caller creates the figure.
    """
    plt.subplot(4,2,fig)
    plt.title(x+' Histogram')
    # Pass the column as `x=`: in newer seaborn releases the first positional
    # parameter of countplot is `data`, so a positional Series is no longer
    # interpreted as the x variable. This also matches the boxplot call below.
    sns.countplot(x=data[x],palette=("magma"))
    plt.subplot(4,2,(fig+1))
    plt.title(x+' vs Price')
    sns.boxplot(x=data[x], y=data.price, palette=("magma"))

    
plt.figure(figsize=(15,20))

plot_count('enginelocation', 1)
plot_count('cylindernumber', 3)
plot_count('fuelsystem', 5)
plot_count('drivewheel', 7)

In [None]:
# Observations
#  1--> Cars with rear engine location have high price.
# 2 --> Most common number of cylinders are four, six and five. Though eight cylinders have the highest price range.
# 3 --> mpfi and 2bbl are most common type of fuel systems. mpfi and idi having the highest price range. But there are few data for other categories to derive any meaningful inference
# 4 --> A very significant difference in drivewheel category. Most high ranged cars seeme to prefer rwd drivewheel.


In [None]:
print(data.columns)
data.head()

In [None]:
# Now let's analyze the numerical data: one scatter plot of price against
# each size/power attribute, laid out on a 3x2 grid.

plt.figure(figsize=(12,20))

# (column in `data`, x-axis label, panel title), in subplot order
size_and_power_panels = [
    ("carlength", "Car Length", "Carlength vs Price"),
    ("carheight", "Car Height", "Car Height vs Price"),
    ("carwidth", "Car Width", "Car width vs Price"),
    ("curbweight", "Curb Weight", "Curb weight vs Price"),
    ("enginesize", "Engine Size", "Engine Size vs Price"),
    ("horsepower", "Horse Power", "Power vs Price"),
]

for slot, (column, x_label, panel_title) in enumerate(size_and_power_panels, start=1):
    plt.subplot(3, 2, slot)
    plt.scatter(x=data[column], y=data.price)
    plt.xlabel(x_label)
    plt.ylabel("Price")
    plt.title(panel_title)

In [None]:
# Observations :

# 1 --> Car Height does not show any significance with price
# 2 --> carwidth, carlength, curbweight, engine size and horse power seem to have a positive correlation with price.

In [None]:
# Fuel economy vs price: city mpg on the left, highway mpg on the right.
plt.figure(figsize=(15,4))

mpg_panels = (
    ("citympg", "City mpg", "City mpg vs Price"),
    ("highwaympg", "Highway mpg", "Highway mpg vs Price"),
)

for slot, (column, x_label, panel_title) in enumerate(mpg_panels, start=1):
    plt.subplot(1, 2, slot)
    plt.scatter(x=data[column], y=data.price)
    plt.xlabel(x_label)
    plt.ylabel("Price")
    plt.title(panel_title)

In [None]:
# Observations :
# 1--> City mpg and Highway mpg seems to have negative significance with price

In [None]:
# Mean price for every (fuel system, drive wheel) combination, highest first.
mean_price_by_combo = data.groupby(["fuelsystem", "drivewheel"])["price"].mean()
car_range = mean_price_by_combo.sort_values(ascending=False)

car_range.plot(kind="bar")
plt.title("Car range vs price")

In [None]:
# Observation :

# 1 --> customers mostly prefer car with idi,fwd or mpfi,rwd

In [None]:
# Bore ratio (left) and wheel base (right) plotted against price.
plt.figure(figsize=(15,4))

columns = ("boreratio", "wheelbase")
x_labels = ("Bore ratio", "Wheel base")
panel_titles = ("Bore ratio vs Price", "Wheel Base vs Price")

for slot, (column, x_label, panel_title) in enumerate(zip(columns, x_labels, panel_titles), start=1):
    plt.subplot(1, 2, slot)
    plt.scatter(x=data[column], y=data.price)
    plt.xlabel(x_label)
    plt.ylabel("Price")
    plt.title(panel_title)

In [None]:
# Observations

# Bore ratio and wheel base seem to have a positive correlation with price

In [None]:
# Stroke (left) and compression ratio (right) plotted against price.
plt.figure(figsize=(15,4))

engine_panels = {
    "stroke": ("Stroke", "Stroke vs Price"),
    "compressionratio": ("Compression Ratio", "Compression Ratio vs Price"),
}

for slot, (column, (x_label, panel_title)) in enumerate(engine_panels.items(), start=1):
    plt.subplot(1, 2, slot)
    plt.scatter(x=data[column], y=data.price)
    plt.xlabel(x_label)
    plt.ylabel("Price")
    plt.title(panel_title)

In [None]:
# Observations

# Stroke and compression ratio don't seem to have a correlation with price

In [None]:
# After Examination of all the features, we observed that following are the features that seems to have significance with price

In [None]:
# -Engine Type 
# - Fuel Type 
# - Car Body 
# - Aspiration 
# - Cylinder Number 
# - Drivewheel 
# - Curbweight 
# - Car Length
# - Car width
# - Engine Size 
# - Boreratio 
# - Horse Power 
# - Wheel base 
# - highwaympg
# - peakrpm

In [None]:
cars_data = data[['price', 'fueltype', 'aspiration','carbody', 'drivewheel','wheelbase',
                  'curbweight', 'enginetype', 'cylindernumber', 'enginesize', 'boreratio','horsepower', 
                    'peakrpm','highwaympg', 'carlength','carwidth']]

In [None]:
cars_data.head()

In [None]:
def get_dummies(col,data):
    """Replace one categorical column with its dummy (one-hot) columns.

    Parameters
    ----------
    col : str
        Name of the categorical column in ``data``.
    data : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``col`` removed and one 0/1 column per category
        appended at the end, named after the category value. The first
        category level is dropped (``drop_first=True``).

    The dummy dtype is pinned to ``uint8``. Newer pandas versions default to
    ``bool``, and a frame mixing bool and float columns converts to an
    object-dtype array, which the downstream OLS fit cannot consume.
    """
    dummies = pd.get_dummies(data[col], drop_first=True, dtype="uint8")
    return pd.concat([data.drop(columns=[col]), dummies], axis=1)

In [None]:
# One-hot encode the categorical predictors one after another; each pass
# replaces the named column with its dummy columns.
for categorical_col in ('fueltype', 'aspiration', 'carbody',
                        'drivewheel', 'enginetype', 'cylindernumber'):
    cars_data = get_dummies(categorical_col, cars_data)


In [None]:
cars_data.head()

In [None]:
# So we have created dummy variables for fueltype, aspiration,carbody,drivewheel,enginetype,cylindernumbers
# Now we will normalize the remaining numeric predictors ['wheelbase','curbweight', 'enginesize', 'boreratio','horsepower', 'peakrpm','highwaympg', 'carlength','carwidth']; the target 'price' is left unscaled

In [None]:
# NOTE(review): this import sits mid-notebook; the other imports are in the first cell.
from sklearn.preprocessing import MinMaxScaler

# Min-max scaler applied to the continuous predictors listed in rem_cols.
scale = MinMaxScaler()

# Continuous predictor columns to rescale; 'price' (the target) is not in the list.
rem_cols = ['wheelbase', 'curbweight', 'enginesize', 'boreratio', 'horsepower','peakrpm','highwaympg','carlength','carwidth']

# NOTE(review): the scaler is fitted on the full dataset here, before the
# train/test split performed further down, so the test rows influence the
# min/max used for scaling (data leakage). Consider fitting on the training
# rows only and applying transform() to the test rows.
cars_data[rem_cols] = scale.fit_transform(cars_data[rem_cols])

In [None]:
cars_data.head()

In [None]:
# Now we will split you train and test set

X = cars_data.drop("price",axis=1)
Y = cars_data["price"]


x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.2,random_state=100)

In [None]:
# RFE [Recursive Feature Elimination] and Linear Regression
# Recursive feature elimination (RFE) is a feature selection method that fits a model and removes
# the weakest feature (or features) until the specified number of features is reached.
LinReg = LinearRegression()
LinReg.fit(x_train,y_train)

# Keep the 10 strongest features. n_features_to_select is passed by keyword
# because it is a keyword-only parameter in current scikit-learn releases;
# the positional form RFE(LinReg, 10) raises a TypeError there.
rfe = RFE(LinReg, n_features_to_select=10)
rfe.fit(x_train,y_train)

In [None]:
list(zip(x_train.columns, rfe.support_, rfe.ranking_))

In [None]:
# RFE is having support for the below columns
x_train.columns[rfe.support_]

In [None]:
x_train_rfe = x_train[x_train.columns[rfe.support_]]
x_train_rfe.head()

In [None]:
def build_model(X,Y):
    """Fit an OLS regression of ``Y`` on ``X`` and print the fit summary.

    An intercept column is added to ``X`` via ``sm.add_constant`` before
    fitting. Note that the return value is that augmented design matrix,
    not the fitted model.
    """
    design_matrix = sm.add_constant(X)
    print(sm.OLS(Y, design_matrix).fit().summary())
    return design_matrix

In [None]:
# # Variance Inflation Factor

# # This helps us to detect multicollinearity in regression.
# # Multicollinearity is when there’s correlation between predictors (i.e. independent variables) in a model;
# # its presence can adversely affect your regression results.
# The VIF estimates how much the variance of a regression coefficient is inflated due to multicollinearity in the model.
# Mathematically, the VIF for a regression model variable is equal to the ratio of the overall model variance to the variance of a model that includes only that single independent variable. Equivalently, VIF_i = 1 / (1 - R_i^2), where R_i^2 is the R-squared obtained by regressing predictor i on all the other predictors.

def detect_VIF(X):
    """Return the variance inflation factor of every column of ``X``.

    The result is a DataFrame with the columns ``Features`` (column names of
    ``X``) and ``VIF`` (rounded to two decimals), sorted by VIF from highest
    to lowest.
    """
    scores = [variance_inflation_factor(X.values, position)
              for position in range(X.shape[1])]
    table = pd.DataFrame({"Features": X.columns, "VIF": scores})
    table["VIF"] = round(table["VIF"], 2)
    return table.sort_values(by="VIF", ascending=False)

In [None]:
X_train_new = build_model(x_train_rfe,y_train)

In [None]:
# ohcf seems to have a higher p-value than the others, i.e. it is insignificant in the presence of the other features, hence dropping it

In [None]:
x_train_new = x_train_rfe.drop(["ohcf"],axis=1)

In [None]:
x_train_new = build_model(x_train_new,y_train)

In [None]:
x_train_new = x_train_new.drop(["wheelbase"],axis=1)

In [None]:
x_train_new = build_model(x_train_new,y_train)

In [None]:
x_train_new =x_train_new.drop(["highwaympg"],axis=1)

In [None]:
x_train_new = build_model(x_train_new,y_train)

In [None]:
# calculate VIF

detect_VIF(x_train_new)

In [None]:
# Sedan seems to have high multicollinearity. Hence dropping it

x_train_new = x_train_new.drop("sedan",axis=1)

In [None]:
x_train_new = build_model(x_train_new, y_train)

In [None]:
x_train_new = x_train_new.drop("hardtop",axis=1)

In [None]:
x_train_new = build_model(x_train_new,y_train)

In [None]:
detect_VIF(x_train_new)

In [None]:
# Dropping curbweight to check the changes

x_train_new = x_train_new.drop("curbweight",axis=1)

In [None]:
x_train_new = build_model(x_train_new,y_train)

In [None]:
# "highwaympg" has already been removed from x_train_new in an earlier cell,
# so a plain drop raises KeyError on a top-to-bottom run. errors="ignore"
# makes this step a no-op when the column is already gone.
x_train_new = x_train_new.drop("highwaympg",axis=1,errors="ignore")
detect_VIF(x_train_new)

In [None]:
x_train_new = build_model(x_train_new,y_train)

In [None]:
x_train_new = x_train_new.drop("wagon",axis=1)
detect_VIF(x_train_new)

In [None]:
x_train_new = build_model(x_train_new,y_train)

In [None]:
# So our model with these features looks good,
# so we will build our final model with the above features

LinReg = sm.OLS(y_train,x_train_new).fit()

price_predictions = LinReg.predict(x_train_new)

In [None]:
sns.distplot((y_train-price_predictions),bins=20)

In [None]:
# Error terms seem to be approximately normally distributed, so our selected features looks good for model

In [None]:
# Now lets make predictions
x_train_new.head()

In [None]:
x_train_new = x_train_new.drop("const",axis=1)

In [None]:
x_test_new = x_test[x_train_new.columns]

In [None]:
# Always add the intercept column. With the default has_constant="skip",
# add_constant leaves the frame unchanged if any existing column holds a
# single repeated nonzero value (possible for a dummy column in a small test
# split); the test matrix would then have one column fewer than the fitted
# model expects.
x_test_new = sm.add_constant(x_test_new, has_constant="add")
x_test_new.head()

In [None]:
predicted_car_price = LinReg.predict(x_test_new)

In [None]:
# Checking the fitness of the curve

r2_score(y_test,predicted_car_price)

In [None]:
plt.scatter(y_test,predicted_car_price)
plt.xlabel("y_test")
plt.ylabel("predicted car price")
plt.title("y_test vs predicted car price")

In [None]:
print(LinReg.summary())