In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt
%matplotlib inline

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
# Load the car price CSV into the working DataFrame used by the rest of the notebook.
df = pd.read_csv("../input/car-price-prediction/CarPrice_Assignment.csv")

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Drop the car_ID and CarName columns.
# Reassigning (rather than inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second execution no longer raises KeyError.
df = df.drop(columns=['car_ID', 'CarName'], errors='ignore')

In [None]:
# (rows, columns) of the frame at this point.
df.shape

In [None]:
# Summary statistics of the frame.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of rows that duplicate an earlier row.
df.duplicated().sum()

In [None]:
# Quantitative columns (the target, price, is included here for the EDA plots).
df_quant_cols = df.loc[:, [
    'symboling', 'wheelbase', 'carlength', 'carwidth', 'carheight',
    'curbweight', 'enginesize', 'boreratio', 'stroke', 'compressionratio',
    'horsepower', 'peakrpm', 'citympg', 'highwaympg', 'price',
]]
# Everything that is not in the quantitative list is treated as categorical.
df_cat_cols = df.drop(columns=df_quant_cols.columns)

print(df_quant_cols.columns, df_cat_cols.columns, sep="\n")

# Uni-Variate Analysis

In [None]:
# Box plot of every quantitative column, laid out on a 5x3 grid.
plt.figure(figsize=(30, 30))
sns.set(font_scale=1.5)

for position, col in enumerate(df_quant_cols.columns, start=1):
    plt.subplot(5, 3, position)
    sns.boxplot(x=df_quant_cols[col])

In [None]:
# Count plot of every categorical column, laid out on a 3x3 grid.
plt.figure(figsize=(30, 30))
sns.set(font_scale=1.5)

for position, col in enumerate(df_cat_cols.columns, start=1):
    plt.subplot(3, 3, position)
    sns.countplot(x=df_cat_cols[col])

# Restore the default font scale for later plots.
sns.set(font_scale=1)


# Insights from categorical features visualization
1. Car names are widely distributed.
2. Fuel type - gas cars make up a much higher proportion than diesel cars (~12%).
3. Car body - sedans are the most common body type.
4. Drive wheel - 4-wheel drive is rarely used.
5. Engine location - almost every car has a front engine.
6. Engine type - ohc is far more common than any other type.
7. Cylinder number - 4-cylinder cars are by far the most common.
8. Fuel system - mpfi and 2bbl are the most widely used.

In [None]:
# Checking target variable's distribution.
# histplot(kde=True) replaces the deprecated sns.distplot; stat="density"
# keeps the y-axis on the density scale that distplot used.
sns.histplot(df.price, kde=True, stat="density")
print("Skew of target variable:",df.price.skew())

The target variable (price) is right-skewed.

# Bi-Variate analysis

In [None]:
# Bar plot of price against each categorical column, laid out on a 3x3 grid.
plt.figure(figsize=(30, 30))
sns.set(font_scale=1.5)

for position, col in enumerate(df_cat_cols.columns, start=1):
    plt.subplot(3, 3, position)
    sns.barplot(x=df[col], y=df.price)

# Restore the default font scale for later plots.
sns.set(font_scale=1)


# Insights from categorical variables which are influencing target variable
1. Fuel type - diesel cars cost slightly more than gas cars.
2. Aspiration - turbo cars cost more than standard-aspiration cars.
3. Car body - convertibles and hardtops are costlier than the other body types.
4. Engine location - rear-engine cars are far more expensive.
5. Cylinder number - 3-cylinder cars are much cheaper than the rest.

In [None]:
# Annotated correlation heatmap of the quantitative columns.
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(df_quant_cols.corr(), annot=True, ax=ax)


# Insights from Quantitative variables which are influencing target variable
1. Symboling, carheight, stroke, compressionratio and peakrpm show little to no correlation with the target variable (price).
2. Several predictors are strongly correlated with one another, i.e. we have multicollinearity.

In [None]:
# Using get_dummies to do one-hot encoding.
# dtype=int forces numeric 0/1 indicators. Newer pandas defaults to bool
# dummies, and a frame mixing bool and float columns is turned into an
# object array by statsmodels, which makes the sm.OLS cells below fail.
df_cat_cols = pd.get_dummies(data=df_cat_cols, drop_first=False, dtype=int)
df_cat_cols.columns

In [None]:
# Remove the target from the feature columns.
# errors='ignore' makes the cell idempotent: once 'price' is gone, running
# it again is a no-op instead of a KeyError.
df_quant_cols = df_quant_cols.drop(columns='price', errors='ignore')
df_quant_cols.columns

In [None]:
# Joining both Quantitative and Categorical variables.
# The join is on the row index; how='right' keeps the rows of df_cat_cols.
df_new = df_quant_cols.join(df_cat_cols,how='right')

In [None]:
def print_regressor_scores(regr):
    """Print train/test R^2 and RMSE plus a 10-fold cross-validated R^2.

    The data is not passed in: the function reads the notebook-level
    variables ``X``, ``y``, ``X_train``, ``X_test``, ``y_train`` and
    ``y_test``, so they must be set for the model being scored.

    Parameters
    ----------
    regr : estimator
        Already-fitted regressor. ``regr.predict`` is called on both splits
        and ``regr`` is also handed to ``cross_val_predict`` with ``X, y``.

    Returns
    -------
    None
        Results are printed. The "Accuracy" figures are ``r2_score`` values.
    """
    # Label the report with the estimator's own class name so the output is
    # correct for any regressor (decision tree, random forest, ...).
    model_name = type(regr).__name__

    # Predictions on the training and testing splits.
    y_pred_train = regr.predict(X_train)
    y_pred_test = regr.predict(X_test)

    # R^2 on each split.
    print(f"Training Accuracy for {model_name}: ", r2_score(y_train, y_pred_train))
    print(f"Testing Accuracy for {model_name}: ", r2_score(y_test, y_pred_test))

    # RMSE on each split.
    print("RMSE for Training Data: ", sqrt(mean_squared_error(y_train, y_pred_train)))
    print("RMSE for Testing Data: ", sqrt(mean_squared_error(y_test, y_pred_test)))

    # R^2 of the 10-fold cross-validated predictions over the full X, y.
    y_pred_cv = cross_val_predict(regr, X, y, cv=10)
    print(f"Accuracy for 10-Fold Cross Predicted {model_name}: ", r2_score(y, y_pred_cv))

In [None]:
def data_preprocess(x,y,std_scale=False,min_max_scale=False):
    """Split ``x``/``y`` 75/25 (random_state=0) and optionally scale the features.

    If ``std_scale`` is true a StandardScaler is used; otherwise, if
    ``min_max_scale`` is true, a MinMaxScaler with range (0, 1) is used.
    The scaler is fitted on the training split only and then applied to
    both splits. With neither flag set the splits are returned unscaled.

    Returns the tuple ``(x_train, x_test, y_train, y_test)``; the two feature
    entries are the scaler's outputs when scaling was requested.
    """
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

    # Nothing to scale: hand back the raw splits.
    if not (std_scale or min_max_scale):
        return (x_train, x_test, y_train, y_test)

    # std_scale takes precedence when both flags are set.
    scaler = (
        StandardScaler(copy=True, with_mean=True, with_std=True)
        if std_scale
        else MinMaxScaler(copy=True, feature_range=(0, 1))
    )
    train_scaled = scaler.fit(x_train).transform(x_train)
    test_scaled = scaler.transform(x_test)
    return (train_scaled, test_scaled, y_train, y_test)

In [None]:
def calc_vif(X):
    """Return a two-column frame ("variables", "VIF") with one row per column of ``X``.

    Each VIF value is ``variance_inflation_factor(X.values, i)`` for the
    column at position ``i``.
    """
    values = X.values
    scores = [variance_inflation_factor(values, idx) for idx in range(X.shape[1])]
    return pd.DataFrame({"variables": X.columns, "VIF": scores})

# Model - 1

Linear Regression - All features and outliers

In [None]:
# Features: every column of df_new; target: price.
X = df_new
y = df.price

In [None]:
# Train/test split without scaling (both scaling flags keep their False default).
X_train,X_test,y_train,y_test = data_preprocess(X,y)

In [None]:
# Fit the linear model on the training split and report its score on both splits.
reg = LinearRegression().fit(X_train, y_train)
train_score = reg.score(X_train, y_train)
test_score = reg.score(X_test, y_test)
print("Train Score :", train_score)
print("Test Score :", test_score)

In [None]:
# Training-set predictions alongside the actual prices and their difference.
y_pred = reg.predict(X_train)
df_residuals = pd.DataFrame({'Actual': y_train, 'Predicted': y_pred}).assign(
    ErrorTerm=lambda frame: frame.Actual - frame.Predicted
)

In [None]:
# Distribution of the training residuals.
# histplot(kde=True) replaces the deprecated sns.distplot; stat="density"
# keeps the y-axis on the density scale that distplot used.
sns.histplot(df_residuals.ErrorTerm, kde=True, stat="density")
print("Residual mean :", df_residuals.ErrorTerm.mean())

In [None]:
# statsmodels OLS summary on the training split, with a constant column added.
# Despite its name, X_endog holds the regressors and is passed as OLS's exog.
X_endog = sm.add_constant(X_train)
res = sm.OLS(endog=y_train, exog=X_endog)
res.fit().summary()

In [None]:
# Error metrics of the training-set predictions (y_pred was made from X_train).
mae = metrics.mean_absolute_error(y_train, y_pred)
mse = metrics.mean_squared_error(y_train, y_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

# Model - 2

Linear Regression after filtering features from previous model.

In [None]:
# Features: only curbweight and enginesize; target: price.
X = df_new[['curbweight','enginesize']]
y = df.price

In [None]:
# Train/test split without scaling (both scaling flags keep their False default).
X_train,X_test,y_train,y_test = data_preprocess(X,y)

In [None]:
# Fit the linear model on the training split and report its score on both splits.
reg = LinearRegression().fit(X_train, y_train)
train_score = reg.score(X_train, y_train)
test_score = reg.score(X_test, y_test)
print("Train Score :", train_score)
print("Test Score :", test_score)

In [None]:
# Training-set predictions alongside the actual prices and their difference.
y_pred = reg.predict(X_train)
df_residuals = pd.DataFrame({'Actual': y_train, 'Predicted': y_pred}).assign(
    ErrorTerm=lambda frame: frame.Actual - frame.Predicted
)

In [None]:
# Distribution of the training residuals.
# histplot(kde=True) replaces the deprecated sns.distplot; stat="density"
# keeps the y-axis on the density scale that distplot used.
sns.histplot(df_residuals.ErrorTerm, kde=True, stat="density")
print("Residual mean :", df_residuals.ErrorTerm.mean())

In [None]:
# statsmodels OLS summary on the training split, with a constant column added.
# Despite its name, X_endog holds the regressors and is passed as OLS's exog.
X_endog = sm.add_constant(X_train)
res = sm.OLS(endog=y_train, exog=X_endog)
res.fit().summary()

In [None]:
# Error metrics of the training-set predictions (y_pred was made from X_train).
mae = metrics.mean_absolute_error(y_train, y_pred)
mse = metrics.mean_squared_error(y_train, y_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Variance inflation factor of each feature currently in X.
calc_vif(X)

In [None]:
# Pairwise correlations between the features in X and the target y.
X.join(y).corr()

After several iterations of filtering features by the significance of their impact on the target, we conclude that linear regression does not score well on this data.


# Model - 3

# Decision Tree Regressor

In [None]:
# Features: every column of df_new; target: price.
X = df_new
y = df.price

In [None]:
# Train/test split without scaling (both scaling flags keep their False default).
X_train,X_test,y_train,y_test = data_preprocess(X,y)

In [None]:
# Unpruned decision tree with a fixed seed, fitted on the training split.
dtree_reg = DecisionTreeRegressor(random_state=0)
dtree_reg.fit(X_train, y_train)

In [None]:
# Report train/test/cross-validated scores for the unpruned tree.
print_regressor_scores(dtree_reg)

# Post Pruning

In [None]:
# Cost-complexity pruning path of the fitted tree on the training split.
path = dtree_reg.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities
print(ccp_alphas)

In [None]:
# Total leaf impurity against effective alpha; the last alpha is left out of the plot.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set(xlabel="effective alpha", ylabel="total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")

In [None]:
# One tree per candidate alpha, each fitted on the training split.
clfs = [
    DecisionTreeRegressor(random_state=0, ccp_alpha=alpha).fit(X_train, y_train)
    for alpha in ccp_alphas
]
print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
      clfs[-1].tree_.node_count, ccp_alphas[-1]))

In [None]:
# Discard the final tree/alpha pair before plotting.
# NOTE: clfs and ccp_alphas are overwritten here, so re-running this cell
# removes one more pair each time.
clfs, ccp_alphas = clfs[:-1], ccp_alphas[:-1]

node_counts = [tree.tree_.node_count for tree in clfs]
depth = [tree.tree_.max_depth for tree in clfs]

fig, (ax_nodes, ax_depth) = plt.subplots(1, 2, figsize=(20, 8))

# Left panel: tree size against alpha.
ax_nodes.plot(ccp_alphas, node_counts, marker='o', drawstyle="steps-post")
ax_nodes.set(xlabel="alpha", ylabel="number of nodes")

# Right panel: tree depth against alpha.
ax_depth.plot(ccp_alphas, depth, marker='o', drawstyle="steps-post")
ax_depth.set_xlabel("alpha")
ax_depth.set_ylabel("depth of tree")

In [None]:
# Score of every pruned tree on the training and testing splits.
train_scores = [tree.score(X_train, y_train) for tree in clfs]
test_scores = [tree.score(X_test, y_test) for tree in clfs]

fig, ax = plt.subplots(figsize=(15, 5))
ax.set(xlabel="alpha", ylabel="accuracy",
       title="Accuracy vs alpha for training and testing sets")
for label, scores in (("train", train_scores), ("test", test_scores)):
    ax.plot(ccp_alphas, scores, marker='o', label=label, drawstyle="steps-post")
ax.legend()
plt.show()

In [None]:
# Refit the tree with a fixed cost-complexity pruning strength.
# NOTE(review): 0.55e+06 was presumably read off the accuracy-vs-alpha plot — confirm.
dtree_reg = DecisionTreeRegressor(random_state=0,ccp_alpha=0.55e+06)
dtree_reg.fit(X_train, y_train)

In [None]:
# Report train/test/cross-validated scores for the pruned tree.
print_regressor_scores(dtree_reg)


# Model - 4

Random forest regressor

In [None]:
# Features: every column of df_new; target: price.
X = df_new
y = df.price

In [None]:
# Train/test split without scaling (both scaling flags keep their False default).
X_train,X_test,y_train,y_test = data_preprocess(X,y)

In [None]:
# Hyper-parameter grid for the random forest search.
n_estimators = [100, 500, 1000, 1500]
# 1.0 means "consider every feature", which is what the 'auto' alias meant
# for regressors; 'auto' itself is rejected by scikit-learn >= 1.3.
max_features = [1.0, 'sqrt']
max_depth = [2, 3, 5, 6, 8, None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4, 10]
oob_score = [True, False]

params_grid = {'n_estimators': n_estimators, 'max_features': max_features,
               'max_depth': max_depth, 'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf, 'oob_score': oob_score}

# Exhaustive 3-fold search over the grid, using all available cores.
rf_search = GridSearchCV(RandomForestRegressor(random_state=0), params_grid,
                         cv=3, verbose=2, n_jobs=-1)
rf_search.fit(X_train, y_train)

best_params = rf_search.best_params_
print(f"Best parameters: {best_params}")

# Refit with the winning parameters, keeping the same seed as the search so
# the final model (and the scores printed for it) are reproducible.
rf_reg = RandomForestRegressor(random_state=0, **best_params)
rf_reg.fit(X_train, y_train)

In [None]:
# Report train/test/cross-validated scores for the tuned random forest.
print_regressor_scores(rf_reg)