In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the CarDekho CSV inside the Kaggle input directory
path = '/kaggle/input/vehicle-dataset-from-cardekho/car data.csv'

# Read the CSV into a DataFrame and preview the first rows
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

In [None]:
# Drop the car model name; it is not among the feature columns selected later.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op, not a KeyError.
df = df.drop(columns='Car_Name', errors='ignore')
df.head()

In [None]:
# Calendar year at execution time
from datetime import date

today = date.today()
curr_year = today.year
curr_year

In [None]:
# Convert the model year into vehicle age in years. The column keeps the name
# 'Year' because later cells select it under that name.
# The overwrite is guarded so that re-running this cell does not subtract a
# second time: calendar years are four-digit values, ages are small numbers.
if df['Year'].max() > 1900:
    df['Year'] = curr_year - df['Year']
df.head()

In [None]:
# Summary statistics of the frame after the transformations above
df.describe()

In [None]:
# Print the frame's column overview (dtypes and non-null counts)
df.info()

In [None]:
# One-hot encode the categorical columns.
# drop_first=True omits the first level of every encoded column, so each
# category set is represented by one fewer indicator column.
encoding_col = ['Transmission', 'Seller_Type', 'Fuel_Type']

dum_df = pd.get_dummies(data=df, columns=encoding_col, drop_first=True)
dum_df.head()

In [None]:
# Pairwise Pearson correlation between all columns of the encoded frame
df_corr = dum_df.corr(method='pearson')
df_corr.head()

In [None]:
#Correlation matrix plot
%matplotlib inline

fig, ax = plt.subplots(figsize=(10, 5))
# mask
mask = np.triu(np.ones_like(df_corr, dtype=bool))
# adjust mask and df
mask = mask[1:, :-1]
corr = df_corr.iloc[1:,:-1].copy()
# color map
cmap = sns.diverging_palette(0, 230, 90, 60, as_cmap=True)
# plot heatmap
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", 
           linewidths=5, cmap=cmap, vmin=-1, vmax=1, 
           cbar_kws={"shrink": .8}, square=True)
# ticks
yticks = [i.upper() for i in corr.index]
xticks = [i.upper() for i in corr.columns]
plt.yticks(plt.yticks()[0], labels=yticks, rotation=0)
plt.xticks(plt.xticks()[0], labels=xticks)
# title
title = 'Car Price Prediction'
plt.title(title, loc='left', fontsize=18)
plt.show()

In [None]:
# Column names of the one-hot encoded frame
dum_df.columns

In [None]:
# Separate the predictor columns from the target for feature selection
X = dum_df[['Year', 'Present_Price', 'Kms_Driven', 'Owner',
       'Transmission_Manual', 'Seller_Type_Individual', 'Fuel_Type_Diesel',
       'Fuel_Type_Petrol']]
y = dum_df[['Selling_Price']]

# Fit an ExtraTreesRegressor to see which variables affect the price most.
# The seed is fixed (same value the train/test split and random forest use
# later in this notebook) so the importances are reproducible across runs.
from sklearn.ensemble import ExtraTreesRegressor
model = ExtraTreesRegressor(random_state=42)
model.fit(X, y.values.ravel())
print(model.feature_importances_)

# Spread (standard deviation) of each feature's importance across the
# individual trees. Each tree's own importances must be read inside the loop;
# reading the ensemble-level attribute there repeats one vector per tree and
# yields a spread of zero for every feature.
feature_importance_normalized = np.std(
    [tree.feature_importances_ for tree in model.estimators_],
    axis=0,
)
feature_importance_normalized

In [None]:
# Bar chart of the per-feature values held in feature_importance_normalized
fig, ax = plt.subplots(figsize=(20, 5))
ax.bar(X.columns, feature_importance_normalized)
ax.set_xlabel('Feature Labels')
ax.set_ylabel('Feature Importances')
ax.set_title('Comparison of different Feature Importances')
plt.show()

In [None]:
# scikit-learn helper for splitting data into training and testing sets
from sklearn.model_selection import train_test_split

features, labels = X, y

# Hold out 25% of the rows for testing; the seed fixes the shuffle
(train_features,
 test_features,
 train_labels,
 test_labels) = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Report the dimensions of each partition produced by the split
for caption, subset in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(caption, subset.shape)

In [None]:
# Random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# 1000 trees, seeded so the fit is repeatable
rf = RandomForestRegressor(random_state=42, n_estimators=1000)

# Fit on the training partition; the labels are flattened to a 1-D array
rf.fit(train_features, train_labels.to_numpy().ravel())

In [None]:
# Predict on the held-out features with the fitted random forest
predictions = rf.predict(test_features)
# Reshape to a column vector so it lines up with the single-column labels
n_row = predictions.shape[0]
predictions = predictions.reshape(n_row, 1)
# Absolute error per test row
errors = abs(predictions - test_labels)
# Mean absolute error as a plain float. Averaging the underlying array avoids
# np.mean(DataFrame), which returns a per-column Series on some pandas
# versions and makes the line below print a Series instead of a number.
mae = round(float(np.asarray(errors).mean()), 2)
print('Mean Absolute Error:', mae)

In [None]:
# Percentage error per test row.
# NOTE: a label equal to 0 would make its percentage error infinite; no guard
# is applied here.
mape = 100 * (errors / test_labels)
# "Accuracy" is defined here as 100 minus the mean percentage error. It is
# reduced to a single float so the printed value is a number regardless of
# how the installed pandas version handles np.mean on a DataFrame.
accuracy = 100 - float(np.asarray(mape).mean())
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
# Optional: uncomment the lines below to save the trained model to disk for deployment
# import pickle
# filename = 'fin_random_forest_model.pkl'
# pickle.dump(rf, open(filename, 'wb'))

In [None]:
##comment