In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the CarDekho car-details CSV into a DataFrame.
# NOTE(review): the relative '../input/...' path looks like the Kaggle dataset
# layout -- confirm/adjust when running the notebook elsewhere.
df = pd.read_csv('../input/vehicle-dataset-from-cardekho/CAR DETAILS FROM CAR DEKHO.csv')

# Some exploratory analysis

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Bar chart of how many rows fall into each `fuel` category.
fuel_ax = sns.countplot(x='fuel', data=df)
fuel_ax.set_title('Number of cars based on type of Fuel');

In [None]:
# Bar chart of how many rows fall into each `seller_type` category.
seller_ax = sns.countplot(x='seller_type', data=df)
seller_ax.set_title('Number of cars based on type of Seller');

In [None]:
# Bar chart of how many rows fall into each `transmission` category.
transmission_ax = sns.countplot(x='transmission', data=df)
transmission_ax.set_title('Number of cars based on type of Transmission');

In [None]:
# Bar chart of how many rows fall into each `owner` category.
owner_ax = sns.countplot(x='owner', data=df)
owner_ax.set_title('Number of cars based on type of Owners')
# Rotate the x tick labels by 90 degrees.
plt.xticks(rotation=90);

In [None]:
# Removing `name` from the dataset: it is treated as a per-row identifier, not a feature.
# `errors='ignore'` keeps this cell idempotent -- because it reassigns `df`, re-running
# it after `name` is already gone would otherwise raise a KeyError.
df = df.drop(columns='name', errors='ignore')

In [None]:
# Changing categorical columns to dummy (indicator) variables.
# drop_first=True drops the first level of every encoded column, so the remaining
# dummies of one variable are not perfectly collinear with each other.
df1 = pd.get_dummies(data=df, drop_first=True)

In [None]:
df1

In [None]:
#checking the correlation with the label
df1.corr()['selling_price']

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of `df1`.
corr_fig, corr_ax = plt.subplots(figsize=(12, 8), dpi=150)
sns.heatmap(df1.corr(), annot=True, ax=corr_ax)
corr_ax.set_title('Correlation between different variables');

In [None]:
sns.scatterplot(data=df1, x='year',y='selling_price');

In [None]:
# Turning year into age (years elapsed as of 2021).
# The age is derived from the untouched `df['year']` instead of from `df1['year']`
# itself, so re-running this cell always produces the same ages; the previous
# self-referential form flipped the column back to calendar years on a second run.
# The vectorised subtraction also replaces the element-wise `apply`.
# The column keeps the name 'year' because later cells refer to it by that name.
df1['year'] = 2021 - df['year']

In [None]:
sns.scatterplot(data=df1, x='year',y='selling_price');

In [None]:
# Distribution of the target `selling_price`.
sns.displot(data=df1, x='selling_price');
# Restrict the visible x range to 0-500000; prices above that are cut off the plot.
plt.xlim(0,500000)

In [None]:
sns.scatterplot(data=df, x='km_driven', y='selling_price',alpha=0.5)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: every column of `df1` except the target.
# The `columns=` keyword replaces the positional axis argument (`drop(..., 1)`),
# which was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 on.
X = df1.drop(columns='selling_price')

In [None]:
y = df1['selling_price']

In [None]:
# Hold out 15% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from the training features only, then scale them.
scaled_X_train = scaler.fit_transform(X_train)

In [None]:
# Scale the test features with the statistics fitted on the training set
# (transform only -- the scaler is not re-fitted on test data).
scaled_X_test = scaler.transform(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# compare different models

In [None]:
def report_model(model,X_train,y_train,X_test,y_test):
    """Fit `model` on the training data and print its test-set MAE and RMSE.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is fitted in place by this call.
    X_train, y_train : training features and target passed to ``fit``.
    X_test, y_test : held-out features and target used for scoring.

    Returns
    -------
    None. The two metrics are printed, each prefixed with the estimator's class
    name so that outputs of successive calls can be told apart.
    """
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test,y_pred)
    # RMSE is the square root of the mean squared error, in the target's units.
    rmse = np.sqrt(mean_squared_error(y_test,y_pred))
    label = type(model).__name__
    print(f"{label} MAE:  {mae}")
    print(f"{label} RMSE: {rmse}")

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lr = LinearRegression()

In [None]:
report_model(lr,scaled_X_train,y_train,scaled_X_test,y_test)

In [None]:
from sklearn.linear_model import Ridge

In [None]:
ridge = Ridge()

In [None]:
report_model(ridge,scaled_X_train,y_train,scaled_X_test,y_test)

In [None]:
from sklearn.linear_model import Lasso

In [None]:
lasso = Lasso()

In [None]:
report_model(lasso,scaled_X_train,y_train,scaled_X_test,y_test)

In [None]:
from sklearn.linear_model import RidgeCV

In [None]:
ridgecv = RidgeCV()

In [None]:
report_model(ridgecv,scaled_X_train,y_train,scaled_X_test,y_test)

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Fixed seed so the fitted tree -- and the metrics reported for it -- are the same on
# every run (same seed value as the train/test split).
dtr = DecisionTreeRegressor(random_state=42)

In [None]:
report_model(dtr,scaled_X_train,y_train,scaled_X_test,y_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

In [None]:
# Fixed seed so the bootstrap sampling / feature selection of the forest -- and the
# metrics reported for it -- are the same on every run.
rfr = RandomForestRegressor(random_state=42)

In [None]:
report_model(rfr,scaled_X_train,y_train,scaled_X_test,y_test)

In [None]:
# Fixed seed so the boosted ensemble -- and the metrics reported for it -- are the
# same on every run.
abc = AdaBoostRegressor(random_state=42)

In [None]:
report_model(abc,scaled_X_train,y_train,scaled_X_test,y_test)

In [None]:
# Choosing RandomForest as the model
# Using RandomizedSearchCV to find the best hyperparameters

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate values for each random-forest hyperparameter explored by the search.
tree_counts = np.linspace(100, 1200, 12).astype(int)  # 100, 200, ..., 1200
depth_options = [5, 7, 9, 10, 15, 20, 25, 30, 35, 40]
split_options = [2, 4, 6, 8, 10, 15, 20]
leaf_options = list(range(1, 11))

params = {
    "n_estimators": tree_counts,
    "max_depth": depth_options,
    "min_samples_split": split_options,
    "min_samples_leaf": leaf_options,
}

In [None]:
# Randomised hyperparameter search over `params`, using `rfr` as the base estimator.
# Seeding the search makes the sampled parameter combinations identical on every run;
# without it each run explores a different random subset of the grid.
model = RandomizedSearchCV(rfr, params, random_state=42)

In [None]:
model.fit(scaled_X_train,y_train)

In [None]:
model.best_params_

In [None]:
y_pred =model.predict(scaled_X_test)

In [None]:
mean_absolute_error(y_test,y_pred)

In [None]:
np.sqrt(mean_squared_error(y_test,y_pred))

In [None]:
# Residuals of the tuned model on the test set (actual - predicted), plotted against
# the predictions. `sns.residplot(x=y_pred, y=y_test)` is not used here because it
# first fits its own linear regression of y on x and plots the residuals of *that*
# fit, not the residuals of the model being evaluated.
plt.figure('Residuals plot')
residuals = np.asarray(y_test) - y_pred
sns.scatterplot(x=y_pred, y=residuals, alpha=0.5)
plt.axhline(0, color='grey', linestyle='--')  # zero-error reference line
# The string passed to plt.figure() is only the figure label; set a visible title too.
plt.title('Residuals plot')
plt.xlabel('Predicted selling price')
plt.ylabel('Residual (actual - predicted)');

# Finalizing the model

In [None]:
# Build the final forest from the hyperparameters the search actually selected,
# instead of hard-coding values copied from an earlier run: the search is randomised,
# so a pasted copy can silently disagree with `model.best_params_`.
# The fixed seed makes the saved model reproducible.
final_model = RandomForestRegressor(**model.best_params_, random_state=42)

In [None]:
# Train the final model on the full dataset (train and test rows together).
# NOTE(review): this fits on the unscaled `X`, whereas the models compared above were
# fitted on StandardScaler output -- whoever loads the saved model must pass
# unscaled features in the same column order as `X`.
final_model.fit(X,y)

In [None]:
importance = pd.Series(final_model.feature_importances_,index=X.columns).sort_values(ascending=False)

In [None]:
# Bar chart of the forest's feature importances, largest first.
# The y-axis is labelled 'Importance' because a random forest exposes importances,
# not coefficients. The lone `;` line that used to follow this call has been removed:
# a bare `;` is a SyntaxError and stopped the cell from running.
importance.plot(kind='bar',xlabel='Features',ylabel='Importance',title='Importance of each feature',colormap='Accent');

In [None]:
# Saving the model
import pickle

# The context manager closes (and flushes) the file even if pickling raises; the
# handle was previously opened and never closed, which can leave the pickle
# incomplete on disk.
with open('car_pred.pkl', 'wb') as model_file:
    pickle.dump(final_model, model_file)