In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.model_selection import train_test_split,KFold,cross_val_score,GridSearchCV,RandomizedSearchCV
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
sns.set(rc={'figure.figsize':(8,8)})
import scipy.stats as stats

In [None]:
# Load the car-price dataset into the notebook-wide `data` frame.
data = pd.read_csv(
    '../input/car-price-prediction/CarPrice_Assignment.csv'
)

In [None]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

In [None]:
# Normalise misspelled / inconsistently-cased brand names in CarName.
# Series.replace with a plain dict only rewrites cells that are *exactly* equal
# to a key, so a value such as 'maxda rx3' would be left untouched. Anchoring
# each key at the start of the string (with a trailing word boundary) fixes the
# leading brand token whether or not more text follows it; cells that are
# exactly equal to a key are still rewritten as before.
# NOTE(review): assumes CarName holds '<brand> <model>' strings — confirm against the CSV.
brand_fixes = {'maxda': 'mazda', 'nissan': 'Nissan', 'porcshce': 'porsche', 'toyouta': 'toyota',
               'vokswagen': 'volkswagen', 'vw': 'volkswagen'}
brand_patterns = {r'^' + typo + r'\b': fixed for typo, fixed in brand_fixes.items()}
data['CarName'] = data['CarName'].replace(brand_patterns, regex=True)

In [None]:
# Split the columns by dtype: every non-object column is treated as
# quantitative, every object column as qualitative (categorical).
quantitative = [col for col, dtype in data.dtypes.items() if dtype != 'object']
# 'price' and 'car_ID' are excluded from the quantitative feature list.
quantitative.remove('price')
quantitative.remove('car_ID')
qualitative = [col for col, dtype in data.dtypes.items() if dtype == 'object']

In [None]:
# Per-column null counts, keeping only columns that have at least one null,
# sorted in ascending order of count.
missing = (
    data.isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values()
)
# missing.plot.bar()  # uncomment when there are missing values; none are missing here
print(missing)

In [None]:
import scipy.stats as st

# Plot the price histogram three times, each overlaid with a different
# fitted distribution, one figure per candidate.
y = data['price']
candidate_fits = [
    ('Johnson SU', st.johnsonsu),
    ('Normal', st.norm),
    ('Log Normal', st.lognorm),
]
for fig_num, (fit_title, fit_dist) in enumerate(candidate_fits, start=1):
    plt.figure(fig_num)
    plt.title(fit_title)
    sns.distplot(y, kde=False, fit=fit_dist)


Johnson SU is the best fit for the price distribution.

Now, let's check the distributions of all the other numerical features.

In [None]:
# Long-format frame (variable, value) of the quantitative features, then one
# distribution panel per feature, two panels per row, with independent axes.
f = data.melt(value_vars=quantitative)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False).map(sns.distplot, "value")

Similarly, a **boxplot** is another good way to represent each categorical variable against the prediction target, which is price in our case; see below.

In [None]:
def boxplot(x, y, **kwargs):
    """Draw a seaborn boxplot of ``y`` grouped by ``x`` with vertical tick labels.

    Any extra keyword arguments are accepted but ignored.
    """
    sns.boxplot(x=x, y=y)
    # Rotate the category labels so long names do not overlap.
    plt.xticks(rotation=90)

In [None]:
# Long-format frame: one row per (price, categorical variable, value) triple.
f = data.melt(id_vars=['price'], value_vars=qualitative)


In [None]:
# One boxplot panel (price by category level) per categorical variable.
# `height` is the current name of the FacetGrid argument formerly called
# `size`; the old keyword is rejected by recent seaborn releases.
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(boxplot, "value", "price")

In [None]:
def anova(frame, features=None):
    """Run a one-way ANOVA of ``price`` across the levels of each feature.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``price`` column and every column named in ``features``.
    features : sequence of str, optional
        Columns whose levels define the groups. When omitted, the
        notebook-level ``qualitative`` list is used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``pval`` (the ``f_oneway`` p-value), sorted by
        ascending ``pval``.
    """
    if features is None:
        features = qualitative
    features = list(features)

    pvals = []
    for col in features:
        # groupby drops rows whose level is NaN, so a missing category does
        # not contribute an empty sample to the test.
        grouped_prices = frame.groupby(col, sort=False, observed=True)['price']
        samples = [prices.values for _, prices in grouped_prices]
        pvals.append(stats.f_oneway(*samples)[1])

    anv = pd.DataFrame({'feature': features, 'pval': pvals})
    return anv.sort_values('pval')

In [None]:
# Turn each p-value into a "disparity" score, log(1 / p): the smaller the
# p-value, the taller the bar.
a = anova(data)
a = a.assign(disparity=np.log(1. / a['pval'].values))
sns.barplot(data=a, x='feature', y='disparity')
x = plt.xticks(rotation=90)

In [None]:
def encode(frame, feature):
    """Add an ordinal encoding of ``feature`` to ``frame`` in place.

    Each distinct value of ``frame[feature]`` is ranked 1..k by its mean
    ``price`` (lowest mean gets 1), and the rank is written to a new column
    named ``feature + '_E'``. Nothing is returned.
    """
    levels = frame[feature].unique()  # all distinct values of the categorical column
    mean_price = frame.groupby(feature)['price'].mean()  # mean price per level

    # Table indexed by level, with the mean aligned on that index.
    table = pd.DataFrame({'val': levels}, index=pd.Index(levels, name='val'))
    table['spmean'] = mean_price
    table = table.sort_values('spmean')  # cheapest level first

    encoded_col = feature + '_E'
    for rank, level in enumerate(table.index, start=1):
        frame.loc[frame[feature] == level, encoded_col] = rank


In [None]:
# Add an ordinal '<name>_E' column to `data` for every categorical feature,
# then collect the names of the new columns.
for q in qualitative:
    encode(data, q)
qual_encoded = [name + '_E' for name in qualitative]
print(qual_encoded)

In [None]:
def spearman(frame, features):
    """Plot the Spearman correlation of each feature with ``price``.

    Computes ``frame[f].corr(frame['price'], 'spearman')`` for every ``f`` in
    ``features`` and draws the values as a horizontal bar chart sorted in
    ascending order. The figure height scales with the number of features.
    """
    correlations = [frame[name].corr(frame['price'], 'spearman') for name in features]
    spr = pd.DataFrame({'feature': features, 'spearman': correlations}).sort_values('spearman')
    plt.figure(figsize=(6, 0.25*len(features)))
    # Horizontal bars: one per feature, most negative correlation at the top.
    sns.barplot(data=spr, y='feature', x='spearman', orient='h')
    

The **Spearman plot** shows feature importance by means of rank correlation with price.
An important point to note here is that *a strongly negative correlation is also a good correlation*.

In [None]:
# Correlate every numerical and every encoded categorical feature with price.
features = [*quantitative, *qual_encoded]
spearman(data, features)

This shows that cylinder number, car name, width, length, wheelbase, fuel system, etc. are good parameters to look at for car price prediction, which also makes intuitive sense.

Whereas number of doors, stroke, and engine location would not be that important.

A **heatmap** is another good way to represent the correlations.

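Below we show three different plots:
1. Correlations among the numerical features and price
2. Correlations among the encoded categorical features and price
3. Cross-correlations between the numerical features and the encoded categorical features (each together with price)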

In [None]:
# Figure 1: correlation matrix of the numerical features plus price.
plt.figure(1)
corr = data[quantitative+['price']].corr()
sns.heatmap(corr)

# Figure 2: correlation matrix of the encoded categorical features plus price.
plt.figure(2)
corr = data[qual_encoded+['price']].corr()
sns.heatmap(corr)

# Figure 3: cross-correlation between the two groups (rows: numerical + price,
# columns: encoded categorical + price).
# The column labels must use 'price', the same name the loop writes to;
# labelling that column 'SalePrice' left it filled with zeros and made the loop
# append an extra 'price' column.
plt.figure(3)
row_labels = quantitative + ['price']
col_labels = qual_encoded + ['price']
corr = pd.DataFrame(np.zeros([len(row_labels), len(col_labels)]), index=row_labels, columns=col_labels)
for q1 in row_labels:
    for q2 in col_labels:
        corr.loc[q1, q2] = data[q1].corr(data[q2])
sns.heatmap(corr)

In [None]:
# Pairwise scatter/distribution grid over the quantitative features.
pair = sns.pairplot(data.loc[:, quantitative])