In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively visit the Kaggle input directory and echo the full path of
# every file it contains, one path per line.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

LOADING THE DATASET

In [None]:
# Read the CSV into a DataFrame and preview its first rows.
dataset_path = '/kaggle/input/vehicle-dataset-from-cardekho/car data.csv'
data = pd.read_csv(dataset_path)
data.head()

SHAPE OF THE DATASET

In [None]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
data.shape

CHECKING FOR NULL VALUES

In [None]:
# Count missing entries per column (isna is the canonical spelling of isnull).
data.isna().sum()

CONVERTING THE YEAR COLUMN TO NO. OF YEARS USED

In [None]:
# Derive the vehicle age from a fixed reference year.
# NOTE(review): 2021 is hard-coded; presumably the year the analysis was
# written -- confirm before reusing with newer data.
reference_year = 2021
data['current year'] = reference_year
data['no.of years used'] = data['current year'].sub(data['Year'])
data.head()

REMOVING THE COLUMNS WHICH ARE NOT GOING TO BE USED

In [None]:
# Remove the columns that are not used downstream: the car name, the raw
# 'Year', and the helper 'current year' column (both already folded into
# 'no.of years used').
# The columns are named via the `columns=` keyword: passing the axis
# positionally (`drop(labels, 1)`) was deprecated in pandas 1.3 and raises a
# TypeError from pandas 2.0 onward.
data = data.drop(columns=['Car_Name', 'Year', 'current year'])


In [None]:
# Preview the first rows of the frame to inspect its current columns.
data.head()

CHECKING FOR CATEGORICAL COLUMNS

In [None]:
# Print the distinct values of each candidate categorical column, in order.
for column in ('Fuel_Type', 'Seller_Type', 'Transmission', 'Owner'):
    print(data[column].unique())


USING "ONE HOT ENCODING" 

In [None]:
# One-hot encode the frame; drop_first=True omits the first level of each
# encoded column so the dummy columns are not perfectly collinear.
data = pd.get_dummies(data=data, drop_first=True)
data.head(n=5)

SEPARATING THE FEATURES AND TARGET LABEL

In [None]:
# Positional split: the first column is taken as the target, every remaining
# column as a feature.
y = data.iloc[:, 0]
X = data.iloc[:, 1:]
print(X)
print(y)

CHECKING THE CORRELATION BETWEEN THE INDEPENDENT FEATURES

In [None]:
# Annotated heatmap of the pairwise feature correlations on a 20x20-inch canvas.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(X.corr(), annot=True, ax=ax)

CHECKING WHICH FEATURES ARE MOST IMPORTANT FOR PREDICTING THE OUTPUT

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Fit an extra-trees ensemble on all features to rank them by importance.
# The estimator is seeded so the ranking is identical on every run
# (an unseeded ExtraTreesRegressor gives different importances each time).
model = ExtraTreesRegressor(random_state=100)
model.fit(X, y)

# Label each importance score with its feature name.
important_features = pd.Series(model.feature_importances_, index=X.columns)
print(important_features.nlargest(5))

# Bar chart of the (up to) nine highest-scoring features.
important_features.nlargest(9).plot(kind='bar')
# plt.show must be *called*; the bare name only echoes the function object.
plt.show()

SPLITTING THE DATA FOR TRAINING AND TESTING

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=50,
)
(X_train.shape, X_test.shape)

MODEL BUILDING : RANDOM FOREST

In [None]:
# Model creation
# Manually chosen hyperparameters
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Train a seeded random forest on the training split.
model = RandomForestRegressor(
    n_estimators=300,
    max_features='sqrt',
    min_samples_leaf=10,
    random_state=100,
).fit(X_train, y_train)
predictions = model.predict(X_test)

# RMSE is computed as sqrt(MSE). The `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the square root explicitly works on every version.
rmse_value = np.sqrt(mean_squared_error(y_test, predictions))
rmse_value


USING "RandomizedSearchCV" TO FIND THE BEST PARAMETERS 

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest: 5 evenly spaced values from 100 to 200
n_estimators = [int(x) for x in np.linspace(start=100, stop=200, num=5)]
# Number of features to consider at every split.
# 1.0 (i.e. all features) replaces the former 'auto' option, which meant the
# same thing for regressors but was deprecated in scikit-learn 1.1 and
# removed in 1.3, where it makes every fit using it fail.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree: 5 values from 10 to 100 (truncated to int)
max_depth = [int(x) for x in np.linspace(10, 100, 5)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 6, 8]
# Create the random grid: hyperparameter name -> candidate values to sample from
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}

In [None]:
# Seed the base estimator as well as the search: RandomizedSearchCV's
# random_state only fixes which parameter combinations are sampled, so an
# unseeded forest would still give different scores (and possibly a different
# winner) on every run.
rf = RandomForestRegressor(random_state=100)
rf_randomcv = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=10,
    n_jobs=-1,
)
# Fit the randomized search on the training split
rf_randomcv.fit(X_train, y_train)


FINDING THE BEST PARAMETERS

In [None]:
# Show the hyperparameter combination selected by the randomized search.
rf_randomcv.best_params_


USING THE BEST ESTIMATOR TO PREDICT THE RESULTS

In [None]:
# Take the best estimator found by the search and score it on the test split.
best_random_grid = rf_randomcv.best_estimator_
y_pred = best_random_grid.predict(X_test)

# RMSE is computed as sqrt(MSE). The `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the square root explicitly works on every version.
rmse_value = np.sqrt(mean_squared_error(y_test, y_pred))
rmse_value


In [None]:
# Distribution of the residuals (actual minus predicted) on the test split.
# sns.distplot has been deprecated since seaborn 0.11 and is slated for
# removal; histplot with a density-scaled histogram and a KDE overlay
# (cut=3, as distplot used) is the documented replacement.
residuals = y_test - y_pred
sns.histplot(residuals, kde=True, stat="density", kde_kws={"cut": 3});


In [None]:
# Scatter of actual test targets (x) against the model's predictions (y).
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)