In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
import statsmodels.api as sm
from sklearn.feature_selection import RFE
from sklearn.metrics import r2_score,accuracy_score
from sklearn.metrics import mean_squared_error 
from math import sqrt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor 
from sklearn.ensemble import VotingRegressor


In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

In [None]:
# Read the car-price CSV from the Kaggle input folder and preview it.
csv_path = '../input/car-price-prediction/CarPrice_Assignment.csv'
data = pd.read_csv(csv_path)
data.head()  # head() shows five rows by default

In [None]:
# (number of rows, number of columns) of the raw frame.
data.shape

In [None]:
# Per-column dtype and non-null count summary.
data.info()

In [None]:
# Percentage of missing values per column, rounded to two decimals.
# The previous form `<expr>,2` did not round anything: the trailing `,2`
# turned the cell value into a (Series, 2) tuple.
(data.isna().mean() * 100).round(2)

In [None]:
# Summary statistics (pandas' default `describe` output) for the raw frame.
data.describe()

In [None]:
# One histogram per column that pandas can plot; the trailing semicolon
# suppresses the array-of-axes repr in the cell output.
data.hist(
    figsize=(20, 20),
    edgecolor='black',
    linewidth=1.2,
);

In [None]:
# Categorical feature columns, selected into their own frame.
categorical_features = [
    'fueltype',
    'aspiration',
    'doornumber',
    'carbody',
    'drivewheel',
    'enginelocation',
    'enginetype',
    'cylindernumber',
    'fuelsystem',
]
cat_col = data[categorical_features]

In [None]:
sns.set(font_scale=1.2)
plt.figure(figsize=(30, 30))

# One bar plot of price per categorical feature, laid out on a 3x3 grid.
for position, feature in enumerate(cat_col.columns, start=1):
    ax = plt.subplot(3, 3, position)
    sns.barplot(x=f"{feature}", y='price', data=data, ax=ax)
    # Rotate the category labels so long names do not overlap.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    ax.set_ylabel('price', fontsize=30)
    ax.set_xlabel(f'{feature}', fontsize=30)

## Insights from categorical features visualization
- 1) Gas cars are priced higher than diesel cars.
- 2) Cars with turbo aspiration are priced higher.
- 3) doornumber makes no significant difference to the price.
- 4) Hardtop and convertible body types are more expensive than the others.
- 5) Cars with rwd drivewheel cost almost twice as much as cars with fwd and 4wd.
- 6) Front-engine cars are much more expensive than rear-engine ones.
- 7) Cars with 3 cylinders are much cheaper than the other types.
- 8) Cars with 1bbl and 2bbl fuel systems cost almost the same.
- 9) 'ohc', 'l', 'rotor' and 'ohcf' engine types have nearly the same price, roughly half the price of the 'dohcv' engine type.

In [None]:
# Fix misspelled / inconsistently cased car company names.
#
# A plain dict passed to Series.replace only matches WHOLE cell values, so
# it silently does nothing if a cell holds more than the bare company name
# (presumably CarName looks like "<company> <model>" -- verify against the
# CSV).  Anchored regexes fix the leading company token in both cases:
# a cell that is exactly the misspelled name is still replaced as before.
company_fixes = {
    r'^maxda\b': 'mazda',
    r'^nissan\b': 'Nissan',
    r'^porcshce\b': 'porsche',
    r'^toyouta\b': 'toyota',
    r'^vokswagen\b': 'volkswagen',
    r'^vw\b': 'volkswagen',
}
# NOTE(review): the 'nissan' -> 'Nissan' direction is kept from the original
# mapping; confirm which casing the rest of the data uses.
data['CarName'] = data['CarName'].replace(company_fixes, regex=True)

In [None]:
# Frequency of each distinct CarName value.
data['CarName'].value_counts()

In [None]:
# Pairwise correlation matrix of the numeric columns only.
# `data` still contains string columns (CarName, fueltype, ...); calling
# DataFrame.corr() on them raises on pandas >= 2.0, so select the numeric
# columns explicitly (works on every pandas version).
corrmat = data.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlation matrix on a wide figure.
fig, ax = plt.subplots(figsize=(20, 8))
sns.heatmap(corrmat, annot=True, ax=ax)

#### It is evident that our independent features are multicollinear.

In [None]:
# One-hot encode the categorical columns selected in `cat_col`
# (pd.get_dummies creates one indicator column per category level).
one_hot_encoded_df = pd.get_dummies(cat_col)
one_hot_encoded_df

In [None]:
# Drop the identifier, the car name and the categorical features; the
# categorical ones are handled separately through one-hot encoding.
dropped_columns = [
    'car_ID',
    'CarName',
    'fueltype',
    'aspiration',
    'doornumber',
    'carbody',
    'drivewheel',
    'enginelocation',
    'cylindernumber',
    'fuelsystem',
    'enginetype',
]
num_cat = data.drop(columns=dropped_columns)

In [None]:
# Give both frames a fresh 0..n-1 index so they line up row by row when
# they are concatenated column-wise.
num_cat = num_cat.reset_index(drop=True)
one_hot_encoded_df = one_hot_encoded_df.reset_index(drop=True)

In [None]:
# Side-by-side (column-wise) combination of the remaining original columns
# and the one-hot encoded categorical columns.
final_df = pd.concat(
    (num_cat, one_hot_encoded_df),
    axis='columns',
)

In [None]:
# Preview the combined modelling frame.
final_df.head()

In [None]:
# Target is the price column; every other column is a feature.
y = final_df['price']
X = final_df.drop(columns='price')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

In [None]:
# Default-configured instances of the three regressor families.
bag_reg, dt_reg, rf_reg = (
    BaggingRegressor(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
)

## Model:1 (Ensemble.Bagging_regressor)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor

# Bagging ensemble of 500 decision trees.
#   bootstrap=True  -> each tree's training sample is drawn WITH replacement
#                      (bagging); bootstrap=False would be "pasting".
#   max_samples=1.0 -> each sample is as large as the full training set.
#   n_jobs=-1       -> train the trees on all available cores.
#   random_state    -> fixed seed so the scores are reproducible, like the
#                      seeded decision tree / random forest in this notebook.
bag_reg = BaggingRegressor(DecisionTreeRegressor(),
                           n_estimators=500,
                           bootstrap=True,
                           max_samples=1.0,
                           n_jobs=-1,
                           random_state=21)

In [None]:
# Train the bagging ensemble on the training split.
bag_reg.fit(X_train,y_train)

In [None]:
# Test-set predictions of the bagging ensemble.
y_pred = bag_reg.predict(X_test)

# Actual vs. predicted prices side by side (row labels come from y_test).
df = pd.DataFrame({'Actual': y_test}).assign(Predicted=y_pred)
df.head()

In [None]:
from sklearn.metrics import r2_score

# R^2 of the bagging ensemble on the held-out test split.
score = r2_score(y_true=y_test, y_pred=y_pred)
print("predict score", score)

In [None]:
# R^2 of the bagging ensemble on the training split.
# r2_score expects (y_true, y_pred); the arguments were previously swapped,
# and because R^2 is not symmetric that reported a different number.
y_pred_train = bag_reg.predict(X_train)
score = r2_score(y_train, y_pred_train)
print('train prediction score',score)

In [None]:
# from sklearn import metrics
# print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
# print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
# print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

In [None]:
# Overlay the predicted and the actual test-set prices, in test-row order.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(y_pred, label='Predicted')
ax.plot(y_test.values, label='Actual')

ax.set_ylabel('price', fontsize=10)
ax.legend()
plt.show()

## Model:2 (DT_regressor)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Depth-limited decision tree with a fixed seed, trained on the training split.
tree_params = {'max_depth': 5, 'random_state': 21}
decision_tree_reg = DecisionTreeRegressor(**tree_params)
decision_tree_reg.fit(X_train, y_train)

In [None]:
# Predictions of the decision tree on both splits.
y_pred_DTR_train = decision_tree_reg.predict(X_train)
y_pred_DTR_test = decision_tree_reg.predict(X_test)

# R^2 on each split (reported as "accuracy" in the printed labels).
accuracy_DTR_train = r2_score(y_train, y_pred_DTR_train)
accuracy_DTR_test = r2_score(y_test, y_pred_DTR_test)

# Root-mean-squared error on each split.
RMSE_DTR_train = sqrt(mean_squared_error(y_train, y_pred_DTR_train))
RMSE_DTR_test = sqrt(mean_squared_error(y_test, y_pred_DTR_test))

# Report the four metrics in a fixed order.
dtr_report = (
    ("Training Accuracy for Decision Tree Regression Model: ", accuracy_DTR_train),
    ("Testing Accuracy for Decision Tree Regression Model: ", accuracy_DTR_test),
    ("RMSE for Training Data: ", RMSE_DTR_train),
    ("RMSE for Testing Data: ", RMSE_DTR_test),
)
for label, value in dtr_report:
    print(label, value)

## Model:3 (RF_regressor)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 1500 depth-limited trees with a fixed seed, trained on
# the training split.
forest_params = {'n_estimators': 1500, 'max_depth': 5, 'random_state': 21}
random_forest_reg = RandomForestRegressor(**forest_params)
random_forest_reg.fit(X_train, y_train)

In [None]:
# Evaluate the random forest fitted in the previous cell.
# The forest is NOT re-created here: the previous cell already trained an
# identical model (same hyper-parameters, same random_state), so fitting
# 1500 trees a second time only cost time.

# Prediction with training dataset:
y_pred_RFR_train = random_forest_reg.predict(X_train)

# Prediction with testing dataset:
y_pred_RFR_test = random_forest_reg.predict(X_test)

# R^2 on the training split (labelled "accuracy" in the output):
accuracy_RFR_train = r2_score(y_train, y_pred_RFR_train)
print("Training Accuracy for Random Forest Regression Model: ", accuracy_RFR_train)

# R^2 on the testing split:
accuracy_RFR_test = r2_score(y_test, y_pred_RFR_test)
print("Testing Accuracy for Random Forest Regression Model: ", accuracy_RFR_test)

# RMSE for training data:
RMSE_RFR_train = sqrt(mean_squared_error(y_train, y_pred_RFR_train))
print("RMSE for Training Data: ", RMSE_RFR_train)

# RMSE for testing data:
RMSE_RFR_test = sqrt(mean_squared_error(y_test, y_pred_RFR_test))
print("RMSE for Testing Data: ", RMSE_RFR_test)

## Tuning

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest.
n_estimators = [100, 500, 1000, 1500]
# 1.0 (use all features) replaces 'auto': for regressors 'auto' meant
# "all features", and the string was removed in scikit-learn 1.3, where it
# now raises an invalid-parameter error.
max_features = [1.0, 'sqrt']
max_depth = [2, 3, 5, None]  # None = grow trees until the leaves are pure
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4, 10]

params_grid = {'n_estimators': n_estimators, 'max_features': max_features,
               'max_depth': max_depth, 'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

# Exhaustive 3-fold cross-validated search, scored by (negated) MSE.
rf_cv = GridSearchCV(RandomForestRegressor(random_state=42), params_grid,
                     scoring="neg_mean_squared_error", cv=3, verbose=2, n_jobs=-1)
rf_cv.fit(X_train, y_train)

best_params = rf_cv.best_params_
print(f"Best parameters: {best_params}")

# GridSearchCV (refit=True by default) has already re-trained the best
# configuration on the whole training split.  Reuse that model instead of
# building RandomForestRegressor(**best_params), which dropped
# random_state=42 and therefore was neither reproducible nor the model the
# search actually selected.
rf_clf = rf_cv.best_estimator_
rf_clf

In [None]:
# Predictions of the tuned random forest on both splits.
y_pred_RFR_train = rf_clf.predict(X_train)
y_pred_RFR_test = rf_clf.predict(X_test)

# R^2 on each split (reported as "accuracy" in the printed labels).
accuracy_RFR_train = r2_score(y_train, y_pred_RFR_train)
accuracy_RFR_test = r2_score(y_test, y_pred_RFR_test)

# Root-mean-squared error on each split.
RMSE_RFR_train = sqrt(mean_squared_error(y_train, y_pred_RFR_train))
RMSE_RFR_test = sqrt(mean_squared_error(y_test, y_pred_RFR_test))

# Report the four metrics in a fixed order.
rfr_report = (
    ("Training Accuracy for Random Forest Regression Model: ", accuracy_RFR_train),
    ("Testing Accuracy for Random Forest Regression Model: ", accuracy_RFR_test),
    ("RMSE for Training Data: ", RMSE_RFR_train),
    ("RMSE for Testing Data: ", RMSE_RFR_test),
)
for label, value in rfr_report:
    print(label, value)

## Voting Regressor

In [None]:
from sklearn.ensemble import VotingRegressor

In [None]:
# Base models that the voting ensemble averages.
regressor_1, regressor_2, regressor_3 = bag_reg, decision_tree_reg, rf_clf

# (name, estimator) pairs in the format VotingRegressor expects.
vt_reg = list(zip(
    ('Bagging_regressor', 'DT_regressor', 'RF_regressor'),
    (regressor_1, regressor_2, regressor_3),
))

vr = VotingRegressor(estimators=vt_reg)
vr.fit(X_train, y_train)

In [None]:
# Train and test R^2 (the regressors' default `score`) for every model,
# base models first and the voting ensemble last.
scored_models = (
    ("Bagging Regressor", regressor_1),
    ("Decision Tree", regressor_2),
    ("Random Forest", regressor_3),
    ("Voting Regressor", vr),
)
for model_name, model in scored_models:
    print(f"{model_name} Train score:", model.score(X_train, y_train))
    print(f"{model_name} Test score:", model.score(X_test, y_test))

In [None]:
# Test-set predictions of the three base models and of the voting ensemble.
pred_1, pred_2, pred_3, pred_4 = (
    model.predict(X_test)
    for model in (regressor_1, regressor_2, regressor_3, vr)
)

### Finally, we will visualize the predictions. The red stars show the average prediction made by VotingRegressor.

In [None]:
# Compare the test-set predictions of the three base models with the
# VotingRegressor prediction (red stars).
plt.figure()
plt.plot(pred_1, 'gd', label='BaggingRegressor')
plt.plot(pred_2, 'b^', label='DecisionTreeRegressor')
plt.plot(pred_3, 'ys', label='RandomForestRegressor')
plt.plot(pred_4, 'r*', ms=10, label='VotingRegressor')

# The x position is only the row order within X_test, so hide the ticks.
plt.tick_params(axis='x', which='both', bottom=False, top=False,
                labelbottom=False)
plt.ylabel('predicted')
# pred_1..pred_4 are computed on X_test, so these are test samples, not
# training samples as the label previously claimed.
plt.xlabel('test samples')
plt.legend(loc="best")
plt.title('Regressor predictions and their average')

plt.show()