**Car Price Prediction**

Problem Statement: A Chinese automobile company aspires to enter the US market by setting up their manufacturing unit there and producing cars locally to give competition to their US and European counterparts.

They have contracted an automobile consulting company to understand the factors on which the pricing of cars depends. Specifically, they want to understand the factors affecting the pricing of cars in the American market, since those may be very different from the Chinese market. The company wants to know:

    Which variables are significant in predicting the price of a car
    How well those variables describe the price of a car

Based on various market surveys, the consulting firm has gathered a large dataset of different types of cars across the American market.

Business Goal: You are required to model the price of cars with the available independent variables. It will be used by the management to understand how exactly the prices vary with the independent variables. They can accordingly manipulate the design of the cars, the business strategy etc. to meet certain price levels. Further, the model will be a good way for management to understand the pricing dynamics of a new market.


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Read the car price CSV into a DataFrame and preview the first rows.
data = pd.read_csv('/kaggle/input/car-data/CarPrice_Assignment.csv')
data.head()

In [None]:
#Checking dataset's info
print(data.shape);
data.info()

In [None]:
data['CarName'].unique()

In [None]:
# CarName has too many distinct values: keep only the text before the first
# space so that each entry is reduced to a single name.
data['CarName'] = data['CarName'].apply(lambda full_name: full_name.split(' ')[0])
print(data['CarName'].head())
data['CarName'].unique()

In [None]:
# Similar car names have different spellings;

def Name_replace(name1,name2):
    """Replace every ``name1`` entry in ``data['CarName']`` with ``name2``.

    The result is assigned back to the column rather than calling
    ``replace(..., inplace=True)`` on the selected column: in-place
    modification through a column selection is deprecated in pandas and
    leaves the DataFrame unchanged when copy-on-write is enabled.

    Parameters
    ----------
    name1 : value to look for in the CarName column.
    name2 : value written in its place.
    """
    data['CarName'] = data['CarName'].replace(name1, name2)

# (misspelled or differently-cased name, spelling to use instead)
for wrong_name, right_name in (
    ('maxda', 'mazda'),
    ('Nissan', 'nissan'),
    ('porcshce', 'porsche'),
    ('toyouta', 'toyota'),
    ('vokswagen', 'volkswagen'),
    ('vw', 'volkswagen'),
):
    Name_replace(wrong_name, right_name)

data['CarName'].unique()

In [None]:
data.columns

In [None]:
# Skewness is only defined for numeric columns. data still contains text
# columns here, and pandas >= 2.0 raises a TypeError instead of silently
# skipping them, so restrict the computation explicitly.
data.skew(numeric_only=True)

# Creating Functions to help in drawing different types of plots

In [None]:
def scatter_plot(x,figure):
    """Scatter column ``x`` of ``data`` against ``price``.

    The plot is drawn in position ``figure`` of a 5-row by 2-column subplot
    grid on the current matplotlib figure.
    """
    plt.subplot(5, 2, figure)
    plt.scatter(data[x], data['price'])
    plt.xlabel(x)
    plt.ylabel('price')
    plt.title(x + ' vs Price')
    plt.xticks(rotation=90)

def count_plot(y,palette):
    """Draw a count plot of column ``y`` of ``data`` on a new 20x6 figure.

    Parameters
    ----------
    y : name of the column whose values are counted (shown on the x axis).
    palette : seaborn palette name used to colour the bars.
    """
    plt.subplots(figsize=(20, 6))
    # Pass the column by keyword: recent seaborn releases accept only `data`
    # positionally, so a positional column name collides with data=data.
    sns.countplot(x=y, data=data, palette=palette)
    plt.xticks(rotation=90)
    
def avg_var_plot(z):
    """Bar-plot the mean ``price`` for each value of column ``z``.

    Groups are ordered from the highest mean price to the lowest.
    """
    mean_price = data.groupby([z])['price'].mean().sort_values(ascending=False)
    pd.DataFrame(mean_price).plot.bar(figsize=(20, 8))
    plt.xticks(rotation=90)
    plt.ylabel('price')
    plt.title(z + ' vs AvgPrice')

# Visualizing Categorical Data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

# Bar chart of price against car name on a wide 20x6 figure.
fig = plt.subplots(figsize=(20, 6))
plt.bar(data['CarName'], data['price'])
plt.title('Price vs CarName')
plt.xlabel('Car Name')
plt.ylabel('Price')
plt.xticks(rotation=90);

In [None]:
# Count plot of CarName, then bar chart of mean price per CarName (highest first).
count_plot('CarName','hls');
avg_var_plot('CarName');
#Toyota is the most preferred choice of car and is less expensive.

In [None]:
# Count plot of symboling, then bar chart of mean price per symboling value (highest first).
count_plot('symboling','viridis');
avg_var_plot('symboling');

In [None]:
# Count plot of fueltype, then bar chart of mean price per fueltype (highest first).
count_plot('fueltype','gnuplot');
avg_var_plot('fueltype');

#Gas is the most preferred fuel type and is cheaper than diesel.

In [None]:
# Count plot of aspiration, then bar chart of mean price per aspiration value (highest first).
count_plot('aspiration','tab10');
avg_var_plot('aspiration');

#Std is most preferred and is cheaper.

In [None]:
# Count plot of doornumber, then bar chart of mean price per doornumber value (highest first).
count_plot('doornumber','husl');
avg_var_plot('doornumber');

#there is not much difference between the door numbers in terms of price and count

In [None]:
# Count plot of carbody, then bar chart of mean price per carbody value (highest first).
count_plot('carbody','husl');
avg_var_plot('carbody');

#Sedan is the most preferred choice and is cheaper

In [None]:
# Count plot of drivewheel, then bar chart of mean price per drivewheel value (highest first).
count_plot('drivewheel','Set2');
avg_var_plot('drivewheel');

#fwd is preferred and is cheap

In [None]:
# Count plot of enginelocation, then bar chart of mean price per enginelocation value (highest first).
count_plot('enginelocation','Paired');
avg_var_plot('enginelocation');

#Cars with engine at the front are more in number and relatively cheap

In [None]:
# Count plot of enginetype, then bar chart of mean price per enginetype value (highest first).
count_plot('enginetype','rocket');
avg_var_plot('enginetype');

#Ohc is the preferred engine type and is the cheapest

In [None]:
# Count plot of cylindernumber, then bar chart of mean price per cylindernumber value (highest first).
count_plot('cylindernumber','mako');
avg_var_plot('cylindernumber');

#cars with 4 cylinder number are in abundance and are relatively cheaper

In [None]:
# Count plot of fuelsystem, then bar chart of mean price per fuelsystem value (highest first).
count_plot('fuelsystem','magma');
avg_var_plot('fuelsystem');

In [None]:
# Distribution of the target. sns.distplot is deprecated, so use histplot with
# a KDE overlay and density scaling, which gives the same
# histogram-plus-density view of price.
fig = plt.subplots(figsize=(20, 6))
sns.histplot(data.price, kde=True, stat='density')

In [None]:
# Bar plot of price per CarName, with one bar per doornumber value.
fig = plt.subplots(figsize=(20, 6))
sns.barplot(data=data, x='CarName', y='price', hue='doornumber')
plt.title('Cars with number of doors each');

In [None]:
data['fueltype'].unique()

In [None]:
# Bar plot of price per CarName, with one bar per fueltype value.
fig = plt.subplots(figsize=(20, 6))
sns.barplot(data=data, x='CarName', y='price', hue='fueltype');

In [None]:
# Box plot of the price distribution for each symboling value.
plt.figure(figsize=(20, 8))
sns.boxplot(y=data['price'], x=data['symboling'], palette='gnuplot')
plt.title('Price vs Symboling');

# Visualizing Numeric Data

In [None]:
# Scatter each size/weight column against price in consecutive subplot slots.
plt.figure(figsize=(30, 15))
for slot, column in enumerate(('carlength', 'carwidth', 'carheight', 'curbweight'), start=1):
    scatter_plot(column, slot)
plt.tight_layout();

#carlength, carwidth and curbweight show a positive linear trend with price, whereas carheight shows no trend with price

In [None]:
# Scatter each wheelbase/engine-geometry column against price in consecutive subplot slots.
plt.figure(figsize=(30, 15))
for slot, column in enumerate(('wheelbase', 'enginesize', 'boreratio', 'stroke'), start=1):
    scatter_plot(column, slot)
plt.tight_layout()
#wheelbase, enginesize, boreratio show positive trend with price;

In [None]:
# Scatter each performance/efficiency column against price in consecutive subplot slots.
plt.figure(figsize=(30, 15))
performance_columns = ('compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg')
for slot, column in enumerate(performance_columns, start=1):
    scatter_plot(column, slot)
plt.tight_layout();
#highwaympg, citympg, peakrpm, compression ratio show negative or no trend with price

# After visualizing both categorical and numerical columns it is clear that some columns show negative or no trend in relation to price and have to be dropped

In [None]:
# Columns removed from data (in place) before modelling.
cols_to_drop = [
    'highwaympg',
    'citympg',
    'peakrpm',
    'compressionratio',
    'stroke',
    'doornumber',
    'car_ID',
    'symboling',
    'enginelocation',
    'CarName',
]
data.drop(columns=cols_to_drop, inplace=True)
data.head()

In [None]:
print(data.shape);
data.info()

In [None]:
sns.pairplot(data)

# Separating Target and Features

In [None]:
# Target column.
y = data['price']
# drop(..., inplace=True) returns None, so its result must not be bound to X.
# The drop stays in place because the following cells use `data` itself as the
# feature matrix; X is then bound to that same price-free frame.
data.drop('price', axis=1, inplace=True)
X = data
data.head()

In [None]:
data.describe()

In [None]:
#Splitting the data
from sklearn.model_selection import train_test_split


# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data, y, train_size=0.7, test_size=0.3, random_state=0
)

# Categorical columns: object dtype with fewer than 10 distinct training values.
cat_cols = [
    col for col in X_train.columns
    if X_train[col].dtype == 'object' and X_train[col].nunique() < 10
]
# Numerical columns: int64 or float64 dtype.
num_cols = [
    col for col in X_train.columns
    if X_train[col].dtype in ['int64', 'float64']
]

# Keep only the selected columns, categorical first, in both splits.
my_cols = cat_cols + num_cols
X_train = X_train[my_cols].copy()
X_test = X_test[my_cols].copy()

In [None]:
X_train['enginetype'].unique()

In [None]:
X_test['enginetype'].unique()

In [None]:
# Print the distinct cylindernumber and fuelsystem values in the train and test
# splits, train first, so the two can be compared.
print(X_train['cylindernumber'].unique());
print(X_test['cylindernumber'].unique());
print(X_train['fuelsystem'].unique());
print(X_test['fuelsystem'].unique());

In [None]:
# Print the shapes of the train/test feature frames and target series.
print(X_train.shape);
print(X_test.shape);
print(y_train.shape);
print(y_test.shape);

# Separating categorical columns that are mismatched in train and test datasets

In [None]:
# Categorical columns whose set of values is identical in the train and test splits.
good_label_cols = []
for col in cat_cols:
    if set(X_train[col]) == set(X_test[col]):
        good_label_cols.append(col)

# Remaining categorical columns: these are dropped from the dataset.
bad_label_cols = list(set(cat_cols) - set(good_label_cols))

In [None]:
good_label_cols

In [None]:
bad_label_cols

# Label encoding only those columns that match in both train and test dataset

In [None]:
#Label Encoding Categorical Data;
from sklearn.preprocessing import LabelEncoder

# Work on copies of the splits without the mismatched categorical columns.
label_X_train = X_train.drop(columns=bad_label_cols)
label_X_test = X_test.drop(columns=bad_label_cols)

# One encoder object is reused: it is re-fitted on each training column and the
# fitted mapping is then applied to the matching test column.
encoded = LabelEncoder()
for col in good_label_cols:
    train_codes = encoded.fit_transform(X_train[col])
    label_X_train[col] = train_codes
    label_X_test[col] = encoded.transform(X_test[col])


In [None]:
label_X_train.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Fit a 100-tree random forest with a fixed seed on the encoded training data.
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(label_X_train, y_train)
prediction = model.predict(label_X_test)
# r2_score expects (y_true, y_pred). R^2 is not symmetric, so passing the
# predictions first reports a different, incorrect score.
print(r2_score(y_test, prediction))