The aim of this notebook is to predict car prices.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Load the raw car listing data into a DataFrame
csv_path = '/kaggle/input/vehicle-dataset-from-cardekho/car data.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the raw data
df.head()

In [None]:
# (rows, columns) of the raw data
df.shape

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Columns treated as categorical; show the distinct values each one takes
cat_features = [
    'Car_Name',
    'Fuel_Type',
    'Seller_Type',
    'Transmission',
    'Owner',
]
for col in cat_features:
    distinct_values = df[col].unique()
    print(f'{col} : {distinct_values}')

In [None]:
# Count missing values per column (isna is the canonical name for isnull)
df.isna().sum()

In [None]:
# Summary statistics of the numeric columns
df.describe()

In [None]:
# Build the modelling frame from the raw data without the Car_Name column
final_data = df.drop(columns=['Car_Name'])

In [None]:
# Preview the frame now that Car_Name has been removed
final_data.head()

In [None]:
# Derive the car's age relative to a fixed reference year.
# NOTE(review): presumably `Year` is the year of manufacture/purchase - confirm against the dataset.
REFERENCE_YEAR = 2020
final_data['Car_Age'] = REFERENCE_YEAR - final_data['Year']

In [None]:
# Preview the frame with the new Car_Age column
final_data.head()

In [None]:
# `Year` is now represented by `Car_Age`, so remove it from the dataset
final_data = final_data.drop(columns=['Year'])

In [None]:
# One-hot encode the categorical columns; drop_first removes one level per
# column so the remaining indicators are not perfectly collinear
final_data = pd.get_dummies(data=final_data, drop_first=True)

In [None]:
# Preview the frame after dummy encoding
final_data.head()

In [None]:
# Pairwise correlation between the columns of the encoded frame
final_data.corr()

In [None]:
# Grid of pairwise relationship plots across the columns of final_data
sns.pairplot(final_data)

In [None]:
# Annotated heatmap of the correlation matrix on a 10x10 inch figure
corr_matrix = final_data.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn')

In [None]:
# Split by column position: the first column is the target, the rest are features.
# NOTE(review): this relies on the target being column 0 of final_data - confirm the column order.
y = final_data.iloc[:, 0]
X = final_data.iloc[:, 1:]

In [None]:
# Preview the feature matrix
X.head()

In [None]:
# Preview the target values
y.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor

In [None]:
# Fit an extra-trees ensemble on the full data, used only to rank features.
# The ensemble is randomized, so a fixed seed keeps the reported importances
# identical across Restart & Run All.
model = ExtraTreesRegressor(random_state=43)
model.fit(X, y)

In [None]:
# Raw importance scores, in the same order as X's columns
print(model.feature_importances_)

In [None]:
# Horizontal bar chart of the five most important features
feat_importance = pd.Series(data=model.feature_importances_, index=X.columns)
top_five = feat_importance.nlargest(5)
top_five.plot.barh()
plt.title('Feature Importance')
plt.show()

In [None]:
# Hold out 20% of the rows for testing. The seed makes the split (and
# therefore every downstream metric and plot) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=43
)

In [None]:
# (rows, columns) of the training features
X_train.shape

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Base estimator for the hyperparameter search. Seeded so that the forests
# themselves are reproducible; the search's own random_state only controls
# which parameter combinations are sampled.
rf = RandomForestRegressor(random_state=43)

In [None]:
import numpy as np

# Candidate values for each random-forest hyperparameter.

# Number of random forest estimators: 100, 200, ..., 1200
n_estimators = list(range(100, 1201, 100))

# Number of features to consider at every split.
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for a
# regressor it meant "use all features", which 1.0 expresses in every version.
max_features = [1.0, 'sqrt']

# Maximum number of levels in trees: 5, 10, ..., 30
max_depth = list(range(5, 31, 5))

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Assemble the search space, keyed by the estimator parameter each list tunes
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)
print(random_grid)

In [None]:
# Randomized hyperparameter search over random_grid:
# 10 sampled candidates x 5 CV folds, scored by negative MSE, run on one core.
rf_random = RandomizedSearchCV(
    rf,
    random_grid,
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=43,
    n_jobs=1,
    verbose=2,
)

In [None]:
# Run the search on the training split only
rf_random.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test features using the fitted search object
predictions = rf_random.predict(X_test)

In [None]:
# Distribution of the residuals (actual - predicted) on the test split.
# sns.distplot is deprecated (seaborn >= 0.11) and slated for removal;
# histplot with a KDE overlay on a density scale is its replacement.
sns.histplot(y_test - predictions, kde=True, stat='density')

In [None]:
# Actual vs predicted target on the test split; labelled so the axes are
# unambiguous when the figure is read on its own.
plt.scatter(y_test, predictions)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs predicted (test split)')
plt.show()

In [None]:
import pickle

# Serialize the fitted search object to disk. The context manager guarantees
# the file is flushed and closed even if pickling raises, so the .pkl is never
# left truncated behind an open handle.
with open('random_forest_regression_model.pkl', 'wb') as file:
    pickle.dump(rf_random, file)