In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

import os
# Print the full path of every file under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

## 1. Reading Dataset

In [None]:
# Load the Audi listings CSV, print its (rows, columns) shape and preview the first rows.
data = pd.read_csv('/kaggle/input/used-car-dataset-ford-and-mercedes/audi.csv')
print(data.shape)
data.head()

In [None]:
data.info()

In [None]:
# Columns stored with object dtype are treated as the categorical variables.
categorical = data.dtypes[data.dtypes == 'O'].index.tolist()
print('There are {} categorical variables'.format(len(categorical)))
print('The categorical variables are :', categorical)

In [None]:
# Frequency table for each categorical column, with a blank line between tables.
for column in categorical:
    counts = data[column].value_counts()
    print(counts)
    print()

In [None]:
data.describe()

**Build a new column**

In [None]:
# `car_year` holds the number of years between the registration year and the
# fixed reference year, i.e. the car's age rather than a calendar year.
REFERENCE_YEAR = 2021
data['car_year'] = REFERENCE_YEAR - data['year']
data.head()

## 2. Data Visualization

#### Correlation Between Numerical Features and target price

In [None]:
# Independent copy used by the plots in this section; later preprocessing cells
# reassign `data` (row drops, dummy encoding) without affecting `df`.
df = data.copy()

In [None]:
plt.figure(figsize=(8,8))
# Correlate numeric columns only: `df` still contains string columns, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0 instead of
# silently skipping it. select_dtypes works the same on every pandas version.
sns.heatmap(df.select_dtypes(include='number').corr(),square=True,annot=True)

In [None]:
# Restrict to numeric columns before correlating; DataFrame.corr() raises on the
# string columns in pandas >= 2.0.
corr = data.select_dtypes(include='number').corr()
# Correlation of every numeric column with `price`, strongest positive first.
Num = corr['price'].sort_values(ascending=False).to_frame()
s = Num.style.background_gradient(cmap='BuGn')
s

In [None]:
# Scatter of price against engine size with a fitted regression line.
sns.lmplot(x='engineSize',y='price',data=df)
# The pyplot calls below act on the axes that lmplot has just created.
plt.title('Engine-size vs Price')
plt.xlabel('Engine-size')
plt.ylabel('Price')

In [None]:
# Scatter of price against fuel economy (mpg) with a fitted regression line.
sns.lmplot(x='mpg',y='price',data=df)
# The pyplot calls below act on the axes that lmplot has just created.
plt.title('MPG vs Price')
plt.xlabel('MPG')
plt.ylabel('Price')

#### Univariate Distribution and Bivariate Distribution

In [None]:
sns.displot(df['price'])

In [None]:
sns.jointplot(x=df['engineSize'],y=df['mpg'],hue=df['transmission'])

In [None]:
plt.figure(figsize=(12,6))
# One bar per car_year value; seaborn aggregates `price` within each group
# (the mean, by default).
sns.barplot(x='car_year',y='price',data=df)

#### Visualization for Categorical Variables

In [None]:
# Number of listings per transmission type.
plt.title('transmission')
sns.countplot(x=df['transmission'],palette='Reds')

In [None]:
# Number of listings per fuel type.
plt.title('FuelType')
sns.countplot(x=df['fuelType'],palette='Greens')

In [None]:
# Wide figure so that every model label fits on the x axis.
fig = plt.figure(figsize=(25,6))
# One bar per model; seaborn aggregates `price` within each model (mean by default).
sns.barplot(x=df.model,y=df.price)
plt.title('Price Vs Model')
plt.ylabel('Prices of Cars',fontweight='bold')
plt.xlabel('Models',fontweight='bold')

## 3. Data Preprocessing

#### Handling Outliers

In [None]:
# Every column whose dtype is not object is treated as numerical.
# The variable name `numercial` (sic) is kept because the boxplot cell reads it.
numercial = [var for var in data.columns if data[var].dtype!='O']
# The messages previously said "categorical" although this list holds the
# numerical columns.
print('There are {} numerical variables'.format(len(numercial)))
print('The numerical variables are :', numercial)

In [None]:
# Box plots of all numerical columns on one shared axis, used to spot outliers.
plt.figure(figsize=(12,8))
plt.title("Numerical Variables in Audi Dataset")
data[numercial].boxplot(color='red')
plt.show()

In [None]:
data[data['mileage'] >= 150000]

In [None]:
# Remove the rows whose mileage is 150000 or more (treated as outliers).
# The remaining rows keep their original index labels.
high_mileage = data['mileage'] >= 150000
i = data.index[high_mileage]
data = data.drop(index=i)

#### Dummy Variable Encoding

In [None]:
# Show the distinct levels of the two columns that are dummy-encoded next.
for column in ('transmission', 'fuelType'):
    print(data[column].unique())

In [None]:
# Replace `transmission` and `fuelType` with one indicator column per level.
# This reassigns `data`, so re-running the cell fails: the two source columns
# no longer exist after the first run.
data = pd.get_dummies(data,columns=['transmission','fuelType'])
data.head()

#### Ordinal Encoding

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Map each distinct `model` string to a numeric code stored in `model_code`.
encoder = OrdinalEncoder()
model_codes = encoder.fit_transform(data[['model']])
data['model_code'] = model_codes
# Show the original labels next to their codes.
data[['model','model_code']].head(10)

#### Selecting Necessary Columns for Modeling

In [None]:
data.info()

In [None]:
# Keep every column from position 2 onward, i.e. drop the first two columns.
# NOTE(review): this is positional — presumably the two dropped columns are
# `model` and `year`; confirm against the data.info() output above, since any
# change in column order silently changes what is selected.
select = data.iloc[:,2:]
select.head()

## 4. Building Model

#### Splitting Training and Testing Datasets

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every selected column and rebuild a DataFrame with the same
# column names (the result gets a fresh default index).
# NOTE(review): the scaler is fitted on all rows, and on `price` as well, before
# the train/test split. Test-set statistics therefore leak into the features,
# and the target is no longer in its original units. Consider fitting on the
# training features only.
scaler = StandardScaler()
std_select = pd.DataFrame(scaler.fit_transform(select), columns=select.columns)
std_select.head()

In [None]:
# Predictors are every standardized column except `price`.
features = std_select.drop('price',axis=1)
# Double brackets keep the target as a one-column DataFrame rather than a Series.
target = std_select[['price']]

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state makes the split repeatable.
X_train,X_test,y_train,y_test = train_test_split(features,target,test_size=0.2,random_state=0)

In [None]:
# Report the shape of each split: train/test features, then train/test target.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

#### Baseline Models and Scores

In [None]:
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

In [None]:
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import cross_val_score

def fit_and_score(models, X_train, X_test, y_train, y_test):
    """Fit every model on the training split and score it on the test split.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted estimator exposing ``fit`` and
        ``predict``.
    X_train, X_test : array-like
        Feature matrices used for fitting and for evaluation.
    y_train, y_test : array-like
        Matching targets; a one-column frame or a 1-D vector both work.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(model_mse, model_mape, model_r2)``. Each frame is indexed by model
        name and has a single column (``'MSE'``, ``'MAPE'`` in percent,
        ``'R2'``). MSE and MAPE are sorted descending and R2 ascending, so the
        weakest model comes first in every table.
    """
    np.random.seed(0)

    model_mse = {}
    model_mape = {}
    model_r2 = {}

    # Flattened once for the MAPE computation below.
    y_true_flat = np.asarray(y_test).ravel()

    for name, model in models.items():
        model.fit(X_train,y_train)
        y_preds = model.predict(X_test)
        model_mse[name] = mean_squared_error(y_test,y_preds)
        # Some estimators return (n,) predictions while y_test is (n, 1).
        # Subtracting those shapes directly broadcasts to an (n, n) matrix and
        # averages every actual against every prediction. Flattening both sides
        # keeps the comparison element-wise.
        y_preds_flat = np.asarray(y_preds).ravel()
        model_mape[name] = np.mean(np.abs((y_true_flat - y_preds_flat) / y_true_flat)) *100
        model_r2[name] = r2_score(y_test,y_preds)

    model_mse = pd.DataFrame(model_mse, index=['MSE']).transpose()
    model_mse = model_mse.sort_values('MSE',ascending=False)

    model_mape = pd.DataFrame(model_mape, index=['MAPE']).transpose()
    model_mape = model_mape.sort_values('MAPE',ascending=False)

    model_r2 = pd.DataFrame(model_r2, index=['R2']).transpose()
    model_r2 = model_r2.sort_values('R2')

    return model_mse,model_mape,model_r2

In [None]:
# Baseline candidates, all left at default hyperparameters (only the XGBoost
# objective is set explicitly). The keys become the row labels of the score tables.
models = {'LinearRegression' : LinearRegression(),
          'KNeighborsRegressor': KNeighborsRegressor(),
          'DecisionTreeRegressor': DecisionTreeRegressor(),
          'RandomForestRegressor':RandomForestRegressor(),
          'GradientBoostingRegressor': GradientBoostingRegressor(),
          'XGBRegressor': XGBRegressor(objective='reg:squarederror')}

In [None]:
model_mse,model_mape,model_r2 = fit_and_score(models,X_train,X_test,y_train,y_test)

In [None]:
model_mse

In [None]:
model_mape

In [None]:
model_r2

#### Hyperparameter Tuning via Grid Search CV

In [None]:
from sklearn.model_selection import GridSearchCV
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
def gridsearch_cv_scores(models, params, X_train, X_test, y_train, y_test):
    """Tune each model with a 5-fold grid search and score it on the test split.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted estimator.
    params : dict
        Maps the same names to the parameter grid searched for that estimator.
    X_train, y_train : array-like
        Data the grid search is fitted on.
    X_test, y_test : array-like
        Held-out data passed to ``GridSearchCV.score``.

    Returns
    -------
    tuple
        ``(scores, best_params)``. ``scores`` is a DataFrame indexed by model
        name with one column labelled ``'Accuracy'``, sorted ascending;
        ``best_params`` maps each model name to its ``best_params_``.
    """
    np.random.seed(42)

    test_scores = {}
    best_params = {}

    for name in models:
        search = GridSearchCV(models[name],
                              param_grid=params[name],
                              cv=5,
                              verbose=0)
        search.fit(X_train, y_train)

        # NOTE(review): the column is labelled 'Accuracy', but for regressors
        # the default scorer is presumably R^2 — confirm before quoting the value.
        test_scores[name] = search.score(X_test, y_test)
        best_params[name] = search.best_params_

    score_table = pd.DataFrame(test_scores, index=['Accuracy']).transpose()
    return score_table.sort_values('Accuracy'), best_params

In [None]:
# Only the random forest is tuned; the keys of `params` must match the keys of
# `models` because the search looks the grid up by model name.
models = {'RandomForestRegressor':RandomForestRegressor()}

# 'squared_error' replaces the former 'mse' spelling, which scikit-learn
# deprecated in 1.0 and removed in 1.2 (passing 'mse' now fails at fit time).
params = {'RandomForestRegressor': {'n_estimators' : [150,200,300],
                    'criterion' : ['squared_error'],
                    'oob_score' : [False]}}

In [None]:
model_gs_scores_1, model_gs_best_param_1 = gridsearch_cv_scores(models,params,X_train,X_test,y_train,y_test)

In [None]:
model_gs_scores_1

In [None]:
model_gs_best_param_1

## 5. Model Evaluation

Now that the grid search CV is done, it's time to build the final model with the selected hyperparameters, fit it on the full training set, and evaluate it on the held-out test set.

#### RandomForest Regressor

In [None]:
# 'squared_error' is the current name of the criterion formerly spelled 'mse'
# (removed in scikit-learn 1.2). random_state pins the bootstrap and feature
# sampling so the reported scores are reproducible across runs.
model = RandomForestRegressor(criterion='squared_error',n_estimators=300,oob_score=False,random_state=0)
# y_train is a one-column frame; the forest expects a 1-D target, so flatten it.
model.fit(X_train,np.ravel(y_train))
y_pred = model.predict(X_test)

In [None]:
def rmse(actual, pred):
    """Return the root of the mean squared error between `actual` and `pred`."""
    mse = mean_squared_error(actual, pred)
    return np.sqrt(mse)

def mape(actual, pred):
    """Return the mean absolute percentage error of `pred` against `actual`, in percent.

    Both inputs are flattened first. Without that, an (n, 1) column of actuals
    minus an (n,) vector of predictions broadcasts to an (n, n) matrix and the
    mean is taken over every actual/prediction pairing instead of matching rows.
    A zero in `actual` produces a division by zero (inf/nan), as before.
    """
    actual = np.asarray(actual).ravel()
    pred = np.asarray(pred).ravel()
    return np.mean(np.abs((actual - pred) / actual)) * 100

In [None]:
# Flatten both sides: y_test is a one-column frame and y_pred a 1-D vector, and
# subtracting those shapes directly would broadcast to an (n, n) matrix.
actual = np.ravel(y_test)
predicted = np.ravel(y_pred)

r2 = r2_score(actual, predicted)
# Take the square root: the plain mean squared error was being reported as RMSE.
rmse = np.sqrt(mean_squared_error(actual, predicted))
# Computed inline rather than through the `mape` helper: this cell binds the
# result to the name `mape` (the print cell reads it), so calling the helper
# here would fail with "float is not callable" when the cell is run again.
# `price` was standardized upstream, so the percentages are relative to
# standardized values, not to prices in currency units.
mape = np.mean(np.abs((actual - predicted) / actual)) * 100

In [None]:
# Report the three metric values assigned in the previous cell.
print(f'R2 Score: {r2}')
print(f'Root Mean Square Error: {rmse}')
print(f'Mean Absolute Percentage Error: {mape}')

#### Visualization

In [None]:
# Predicted against actual test targets with a fitted regression line; points
# close to the diagonal indicate accurate predictions.
sns.regplot(x=y_test,y=y_pred)
plt.title('Predict vs Actual')
plt.xlabel('Actual')
plt.ylabel('Predict')

#### Feature Importance

In [None]:
# One importance score per predictor column, taken from the fitted forest.
feat_importance = pd.DataFrame(
    {'Score': model.feature_importances_},
    index=features.columns,
)

In [None]:
feat_importance.sort_values(by='Score',ascending=False).style.background_gradient(cmap='Reds')

In [None]:
# Horizontal bars: one per feature, with length equal to its importance score.
plt.figure(figsize=(10,6))
plt.title('Feature Importances')
sns.barplot(x=feat_importance.Score,y=feat_importance.index)

**Conclusion:**

The best R² score, 0.961, was obtained with the **RandomForest regressor model**.