In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the vehicle dataset from the Kaggle input folder, then display
# its (rows, columns) shape together with the column names.
csv_path = '../input/vehicle-dataset-from-cardekho/car data.csv'
df = pd.read_csv(csv_path)
df.shape, df.columns

In [None]:
# Keep only the columns used for modelling and add a constant reference-year column.
# The explicit .copy() makes final_dataset an independent frame; without it the
# column assignment below writes to a slice of df and pandas emits
# SettingWithCopyWarning.
final_dataset = df[['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
                    'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']].copy()
final_dataset['Current Year'] = 2021  # reference year the vehicle age is measured from
final_dataset.head()

In [None]:
# Derive the vehicle age as (reference year - year recorded for the vehicle)
vehicle_year = final_dataset['Year']
final_dataset['Age'] = final_dataset['Current Year'] - vehicle_year

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each category so the dummy columns are not perfectly collinear.
final_dataset = pd.get_dummies(data=final_dataset, drop_first=True)

In [None]:
# Correlation heat map of the numeric columns of the raw dataset
import matplotlib.pyplot as plt
import seaborn as sns

# df still contains the string-valued categorical columns. pandas >= 2.0 no longer
# skips them silently in corr() and raises instead, so select numeric columns first.
corrmat = df.select_dtypes(include='number').corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
# Plot the matrix computed above; re-running corr() on the same columns gave the
# identical result and was redundant work.
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")

In [None]:
# Separate the target (selling price) from the predictor columns
feature_cols = [
    'Present_Price',
    'Kms_Driven',
    'Owner',
    'Age',
    'Fuel_Type_Diesel',
    'Fuel_Type_Petrol',
    'Seller_Type_Individual',
    'Transmission_Manual',
]
y = final_dataset['Selling_Price']
x = final_dataset[feature_cols]

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Define the base estimator that is handed to the grid search.
# random_state is fixed because the searched settings use subsample < 1 and a
# max_features limit, both of which draw random samples; without a seed the
# cross-validation scores change from run to run.
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor(random_state=0)

In [None]:
# Candidate hyper-parameter values to tune.
# This is a bare string literal used as a note: it is not passed to the grid
# search, it is only echoed as the cell's output.
'''
'learning_rate':[0.15,0.1,0.05,0.01,0.005,0.001], 
'n_estimators':[100,250,500,750,1000,1250,1500,1750]
'max_depth':[2,3,4,5,6,7] 
'min_samples_split':[2,4,6,8,10,20,40,60,100], 
'min_samples_leaf':[1,3,5,7,9]
'max_features':[2,3,4,5,6,7]
'subsample':[0.7,0.75,0.8,0.85,0.9,0.95,1]
'''

In [None]:
# Display the names of the predictor columns held in x
x.columns

In [None]:
# Hyper-parameter search with 10-fold cross-validation, scored by negative MSE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Each entry lists the values tried for that hyper-parameter
params = dict(
    learning_rate=[0.1],
    n_estimators=[100],
    max_depth=[6],
    min_samples_split=[20],
    min_samples_leaf=[1],
    max_features=[7],
    subsample=[0.9],
)

grid_srch = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,  # use all available cores
    verbose=2,
)
grid_srch.fit(X_train, y_train)

In [None]:
# Show the best parameter combination found by the search and its mean
# cross-validated score (negative MSE, so values closer to 0 are better)
grid_srch.best_params_, grid_srch.best_score_

In [None]:
# Fit the final model with the hyper-parameters selected by the grid search.
# The settings are read from grid_srch.best_params_ rather than retyped by hand:
# the previously hard-coded learning_rate=0.01 / n_estimators=1000 were not part
# of the searched grid (0.1 / 100), so the fitted model did not match the search.
# random_state pins the random row/feature sampling so the fit is repeatable.
gb = GradientBoostingRegressor(random_state=0, **grid_srch.best_params_)
gb.fit(X_train,y_train)

In [None]:
# Predict selling prices for the held-out test rows with the fitted model
predictions = gb.predict(X_test)

In [None]:
# Evaluate the hold-out predictions: mean absolute error and a MAPE-based accuracy
errors = np.abs(predictions - y_test)
# Mean absolute percentage error.
# NOTE(review): assumes y_test contains no zeros (division below) — confirm.
mape = 100 * np.mean(errors / y_test)
accuracy = 100 - mape
print('Model Performance')
# The target is a selling price, not a temperature, so the stray "degrees" unit
# was removed from the message.
print('Average Error: {:0.4f}.'.format(np.mean(errors)))
print('Accuracy = {:0.2f}%.'.format(accuracy))