In [49]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [50]:
# Read the car price CSV from the relative input directory and preview its first rows.
csv_path = '../input/car-pdediction-data/car_prediction_dataCode (2)/car_prediction_data.csv'
data = pd.read_csv(csv_path)
data.head()

In [51]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

In [52]:
# Summary statistics of the loaded frame.
data.describe()

In [53]:
# Count the missing (null) values in each column.
data.isnull().sum()

In [54]:
# Frequency of each fuel type: printed as a table, then drawn as a bar chart.
print(data['Fuel_Type'].value_counts())
# Pass the column through the `x` keyword: since seaborn 0.12 the only
# positional argument is `data`, so a bare Series is no longer read as x.
sns.countplot(x=data['Fuel_Type'])

In [55]:
# Frequency of each transmission type: printed as a table, then drawn as a bar chart.
print(data['Transmission'].value_counts())
# Pass the column through the `x` keyword: since seaborn 0.12 the only
# positional argument is `data`, so a bare Series is no longer read as x.
sns.countplot(x=data['Transmission'])
plt.title("types of transmission")

In [56]:
# Frequency of each car model name: printed as a table, then drawn as a bar chart.
print(data['Car_Name'].value_counts())
# Pass the column through the `x` keyword: since seaborn 0.12 the only
# positional argument is `data`, so a bare Series is no longer read as x.
sns.countplot(x=data['Car_Name'])
plt.title("car model names")

In [57]:
# Car_Name has a large number of distinct values and is not expected to have
# much impact on the prediction, so the column is dropped (this is optional).
# Reassigning (instead of inplace=True) and ignoring an already-missing column
# keeps this cell safe to re-run without raising a KeyError.
data = data.drop(columns=['Car_Name'], errors='ignore')

In [58]:
# Reference year used to compute the age of each car; stored as a constant column.
reference_year = 2022
data["current_year"] = reference_year
#data.head()

In [59]:
# Age of each car: the reference year minus the car's model year.
car_age = data["current_year"].sub(data["Year"])
data['year_difference'] = car_age
data.head()

In [60]:
# Difference between the present price and the selling price of each car.
# NOTE(review): this column is derived from Selling_Price, so it must not be
# used as a feature when Selling_Price is the prediction target.
price_gap = data['Present_Price'].sub(data['Selling_Price'])
data['price_difference'] = price_gap
data.head()

In [61]:
# One-hot encode the categorical columns; drop_first removes the first level of
# each category so the remaining indicator columns are not redundant.
data = pd.get_dummies(data=data, drop_first=True)
data.head()

In [62]:
# Pairwise correlation between the columns of the encoded frame.
data.corr()

In [63]:
#sns.pairplot(data)

In [64]:
#sns.heatmap(data)

In [65]:
# Divide the frame into the feature matrix x and the target vector y.

# Selecting the target by name makes the split independent of column order.
target_col = 'Selling_Price'
# price_difference is Present_Price - Selling_Price, so together with
# Present_Price it reveals the target exactly (target leakage). It is removed
# from the features, along with the target itself should it fall in the slice.
leaky_cols = [target_col, 'price_difference']

# As before, the first two columns are skipped positionally (presumably Year
# and Selling_Price -- TODO confirm against the CSV layout).
x = data.iloc[:, 2:].drop(columns=leaky_cols, errors='ignore')
y = data[target_col]
print(x.shape)
y.head()

In [66]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. A fixed random_state keeps the split,
# and therefore every score computed from it, identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
y_test.head()

**Now applying regressor models**

In [67]:
from sklearn.ensemble import ExtraTreesRegressor
# Baseline model: extra-trees regressor with default hyper-parameters.
# A fixed random_state makes the fitted trees, the score and the feature
# importances reproducible from run to run.
model1 = ExtraTreesRegressor(random_state=42)
model1.fit(x_train, y_train)
# score() of a scikit-learn regressor is the R^2 on the given data.
score = model1.score(x_test, y_test)
print('score', score)
predict = model1.predict(x_test)

predict

In [68]:
# Feature importances learned by the fitted extra-trees model.
feature_importance = model1.feature_importances_
print(feature_importance)

In [69]:
# Label each importance with its training column, then chart the five largest.
feat_importance = pd.Series(data=feature_importance, index=x_train.columns)
top_five = feat_importance.nlargest(5)
top_five.plot(kind='bar')

In [70]:
###### random forest regressor

In [71]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with default hyper-parameters. A fixed random_state makes the
# fitted forest and its score reproducible from run to run.
model2 = RandomForestRegressor(random_state=42)
model2.fit(x_train, y_train)
# score() of a scikit-learn regressor is the R^2 on the given data.
score = model2.score(x_test, y_test)
print(score)
predict = model2.predict(x_test)
predict

In [72]:
#validation
from sklearn.model_selection import RandomizedSearchCV
# Candidate hyper-parameter values for the randomized search over the random forest.
# Number of trees: 100, 200, ..., 1200.
n_estimators = [int(n) for n in np.linspace(100, 1200, 12)]
# 'auto' was removed for RandomForestRegressor in scikit-learn 1.3; for
# regressors it meant "use all features", which is spelled 1.0.
max_features = [1.0, 'sqrt']
# Maximum tree depth: 5, 10, ..., 30.
max_depth = [int(d) for d in np.linspace(5, 30, 6)]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

In [73]:
# Parameter grid sampled by the randomized search.
random_grid = {'n_estimators' : n_estimators ,
              'max_features': max_features,
              'max_depth' : max_depth,
              'min_samples_split' : min_samples_split,
              'min_samples_leaf' : min_samples_leaf }
print(random_grid)

# 10 sampled parameter combinations, each evaluated with 5-fold cross-validation
# on negative mean squared error (higher, i.e. closer to zero, is better).
rscv = RandomizedSearchCV(estimator = model2, param_distributions = random_grid, scoring = 'neg_mean_squared_error' , n_iter =10,cv=5, verbose =2, random_state =42,n_jobs =1)
# Fit exactly once: a second fit would repeat the whole search without any benefit.
rscv.fit(x_train,y_train)
rscv.best_params_

In [74]:
# Best cross-validated score of the random-forest search (negative MSE, as set
# by its scoring argument) and the parameters that achieved it.
print(rscv.best_score_)
print(rscv.best_params_)

In [75]:
# Predict on the held-out set with the best random forest found by the search.
predict = rscv.predict(x_test)
# Residual distribution. histplot(kde=True, stat='density') is the replacement
# for the deprecated sns.distplot (histogram on a density scale plus a KDE curve).
sns.histplot(y_test - predict, kde=True, stat='density')
plt.show()
# Actual versus predicted values.
plt.scatter(y_test, predict)
plt.show()

In [76]:
from sklearn import metrics
# Error metrics of the current predictions on the held-out set.
mae = metrics.mean_absolute_error(y_test, predict)
mse = metrics.mean_squared_error(y_test, predict)
print('MAE', mae)
print('MSE', mse)
# RMSE is the square root of the MSE computed above.
print('RMSE', np.sqrt(mse))

**XGBoost regressor**

In [77]:
# XGBoost regressor and the distributions sampled by its randomized search.
import xgboost as xgb
from scipy.stats import uniform, randint
# 'reg:squarederror' is the current name of the squared-error objective;
# the old alias 'reg:linear' is deprecated.
model3 = xgb.XGBRegressor(objective = 'reg:squarederror',random_state =42)

# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from low up to high - 1.
params = {"gamma": uniform(0,0.5),
         'learning_rate': uniform(0.03,0.3),
         'max_depth': randint(2,6),
         'n_estimators' : randint(100,150),
         "subsample": uniform(0.6,0.4)
         }

In [78]:
# Randomized search for the XGBoost model: 10 sampled parameter combinations,
# 5-fold cross-validation, scored on negative mean squared error.
rscv2 = RandomizedSearchCV(
    estimator=model3,
    param_distributions=params,
    scoring='neg_mean_squared_error',
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

In [79]:
# Run the XGBoost randomized search on the training split.
rscv2.fit(x_train,y_train)

In [80]:
# Best cross-validated score of the XGBoost search (negative MSE, as set by its scoring argument).
print(rscv2.best_score_)

In [81]:
# Parameter combination that achieved the best cross-validated score.
rscv2.best_params_

In [82]:
# Predict on the held-out set with the best XGBoost model found by the search.
predict = rscv2.predict(x_test)
# rscv2.score() would return negative MSE because the search was built with
# scoring='neg_mean_squared_error'. Scoring the refitted best estimator directly
# gives R^2, matching the `score` reported for the earlier models.
score = rscv2.best_estimator_.score(x_test, y_test)
score

In [83]:
# Residual distribution. histplot(kde=True, stat='density') is the replacement
# for the deprecated sns.distplot (histogram on a density scale plus a KDE curve).
sns.histplot(y_test - predict, kde=True, stat='density')
plt.show()
# Actual versus predicted values.
plt.scatter(y_test, predict)
plt.show()

In [84]:
# Error metrics of the current predictions on the held-out set.
mae = metrics.mean_absolute_error(y_test, predict)
mse = metrics.mean_squared_error(y_test, predict)
print('MAE', mae)
print('MSE', mse)
# RMSE is the square root of the MSE computed above.
print('RMSE', np.sqrt(mse))

**catboost regressor**

In [85]:
# CatBoost regressor and the parameter grid sampled by its randomized search.
# This cell runs before the notebook's `pip install catboost` cell, so on a
# fresh kernel a plain import would fail; install on demand instead.
try:
    import catboost as cat
except ImportError:
    import subprocess
    import sys
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'catboost'])
    import catboost as cat

# A fixed seed keeps the fitted model reproducible; verbose=0 suppresses the
# per-iteration training log that would otherwise flood the output.
model4 = cat.CatBoostRegressor(random_state=42, verbose=0)

params = {
        'learning_rate': [0.03,0.1],
         'depth': [4,6,10],
         'l2_leaf_reg': [1,3,5,7]
       }

In [86]:
# Randomized search for the CatBoost model: 4 sampled parameter combinations,
# 4-fold cross-validation, scored on negative mean squared error.
rscv3 = RandomizedSearchCV(
    estimator=model4,
    param_distributions=params,
    scoring='neg_mean_squared_error',
    n_iter=4,
    cv=4,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

In [87]:
!pip install catboost

In [88]:
# Run the CatBoost randomized search on the training split.
rscv3.fit(x_train,y_train)

In [89]:
# Parameter combination that achieved the best cross-validated score.
rscv3.best_params_

In [90]:
# Best cross-validated score (negative MSE) of the CatBoost search. This reads
# rscv3, the CatBoost search, not rscv, which is the random-forest search.
rscv3.best_score_

In [91]:
# Predict on the held-out set with the best CatBoost model found by the search.
predict = rscv3.predict(x_test)
# Residual distribution. histplot(kde=True, stat='density') is the replacement
# for the deprecated sns.distplot (histogram on a density scale plus a KDE curve).
sns.histplot(y_test - predict, kde=True, stat='density')
plt.show()
# Actual versus predicted values.
plt.scatter(y_test, predict)
plt.show()

In [92]:
# Error metrics of the current predictions on the held-out set.
mae = metrics.mean_absolute_error(y_test, predict)
mse = metrics.mean_squared_error(y_test, predict)
print('MAE', mae)
print('MSE', mse)
# RMSE is the square root of the MSE computed above.
print('RMSE', np.sqrt(mse))