In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting
import seaborn as sns # plotting
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file in it
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the vehicle dataset CSV into a DataFrame
DATA_PATH = "/kaggle/input/vehicle-dataset-from-cardekho/car data.csv"
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the raw data
data.head(n=5)

In [None]:
# Flag, per column, whether it contains at least one missing value
data.isna().any()

In [None]:
# Inspect the distinct values of each categorical column.
# Every column is listed once: the original printed "Seller_Type" twice,
# which left one categorical column uninspected.
# NOTE(review): assumes the CSV has a "Transmission" column — confirm against data.head()
for column in ["Seller_Type", "Fuel_Type", "Transmission", "Owner"]:
    print(data[column].unique())

In [None]:
# Summary statistics of the dataset
summary_stats = data.describe()
summary_stats

In [None]:
# Pairwise correlation between the numeric columns.
# The frame still contains text columns at this point; pandas >= 2.0 raises
# on them instead of silently skipping them, so select numeric columns first.
data.select_dtypes(include="number").corr()

In [None]:
# Selling price plotted against fuel type.
# Uses an explicit figure/axes pair and labels both axes so the figure can
# be read on its own; the trailing ';' hides the repr of the last call.
fig, ax = plt.subplots()
ax.scatter(data["Fuel_Type"], data["Selling_Price"])
ax.set(xlabel="Fuel_Type", ylabel="Selling_Price", title="Selling price by fuel type");

In [None]:
# Count how many distinct car names appear in the dataset
car_names = data["Car_Name"]
car_names.nunique()

In [None]:
# Age of each car in years: reference year minus the value in "Year"
REFERENCE_YEAR = 2020
data["No_Year"] = REFERENCE_YEAR - data["Year"]

In [None]:
# Drop the columns that are not used from here on.
# A new frame is bound to `data` rather than mutating the existing object
# in place (inplace=True brings no benefit and hides state changes).
data = data.drop(columns=['Car_Name', 'Year'])

In [None]:
# Grid of pairwise plots across the columns of the frame
sns.pairplot(data=data)

In [None]:
# Correlation heatmap of the numeric columns

plt.figure(figsize=(20,20))

# Text columns are still present in `data`; pandas >= 2.0 raises on them
# inside corr(), so restrict the frame to numeric columns first.
sns.heatmap(data.select_dtypes(include="number").corr(), annot=True)

In [None]:
# Target (y) and features (x)
y = data["Selling_Price"]
# Build the feature frame as a new object without the target column.
# `data` is left untouched, so x is not an alias of a mutated frame and
# this cell can be re-run without a KeyError.
x = data.drop(columns=["Selling_Price"])

In [None]:
# One-hot encode the categorical columns, dropping the first level of each
x = pd.get_dummies(data=x, drop_first=True)

In [None]:
# Preview the first five rows of the encoded feature frame
x.head(n=5)

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
from sklearn.model_selection import train_test_split

split = train_test_split(x, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split

In [None]:
# importing essential algorithms
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate models and the hyper-parameter grid to search for each.
# Each entry maps a readable name to:
#   'model'  - the estimator instance handed to GridSearchCV
#   'params' - the parameter grid to try for that estimator
# Both estimators are seeded (same seed as the train/test split) so the
# cross-validation scores are reproducible across runs.
model_param = {

    'random_forest':{
        'model': RandomForestRegressor(random_state = 0),
        'params':{
            'n_estimators':[100,300,500,700]
        }
    },
    'decision_Tree':{
        'model':DecisionTreeRegressor(random_state = 0),
        'params':{
            'max_depth':[5,10,20]
        }
    }

}

In [None]:
# Display the search configuration as the cell output
model_param

In [None]:
# Grid search with 5-fold cross-validation for every candidate model

# One record per candidate: its name, best CV score and best parameters
scores = []

for model_name, spec in model_param.items():
    search = GridSearchCV(spec['model'], spec['params'], cv = 5, return_train_score = False, verbose = 0)
    search.fit(X_train, y_train)
    scores.append({
        # record the readable dictionary key; the original stored the
        # unfitted estimator object and left the key unused
        'model': model_name,
        'best_score': search.best_score_,
        'best_params': search.best_params_
    })

In [None]:
# Tabulate the best score and parameters recorded for each model
best_param = pd.DataFrame(data=scores)
best_param

In [None]:
# Final model: a random forest with 300 trees.
# random_state is fixed so repeated runs build the same forest and the
# reported error does not change from run to run.
model = RandomForestRegressor(n_estimators = 300, random_state = 0)

In [None]:
# Fit the model on the training split
model.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out test split
y_pred = model.predict(X=X_test)

In [None]:
# Distribution of the residuals (actual minus predicted).
# sns.distplot is deprecated; histplot with a KDE overlay and a density
# scale is its replacement for this kind of plot.
sns.histplot(y_test - y_pred, kde=True, stat="density")

In [None]:
# Predicted versus actual target values on the test split.
# Both axes are labeled so the figure can be read on its own; the
# trailing ';' hides the repr of the last call.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set(xlabel="Actual Selling_Price", ylabel="Predicted Selling_Price");

In [None]:
# Root-mean-squared error of the predictions on the test split
from sklearn import metrics

mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print('RMSE:', rmse)