In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt

# Loading Our Data

In [None]:
# Read the CarDekho vehicle CSV from the Kaggle input directory and preview the first rows.
csv_path = '/kaggle/input/vehicle-dataset-from-cardekho/car data.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
data.shape #Checking number of rows and cols

In [None]:
data.columns

In [None]:
data.describe().T # Gives you overview of the data

In [None]:
data.info() 

## Null checking

In [None]:
data.isna().sum()

No null values, so far so good.

# Separating numerical and categorical columns

In [None]:
# Split the column names by dtype. select_dtypes is the public pandas API for
# this; the previous code relied on the private DataFrame._get_numeric_data helper.
# 'bool' is included alongside 'number' to match what that helper selected.
all_cols = data.columns
numerical_cols = data.select_dtypes(include=['number', 'bool']).columns.to_list()
# Preserve the frame's column order. A set difference has arbitrary ordering,
# which makes any positional access into categorical_cols unreliable.
categorical_cols = [col for col in all_cols if col not in numerical_cols]

In [None]:
numerical_cols, categorical_cols

In [None]:
categorical_cols

In [None]:
# Unique values of every categorical column except the car name.
# Car_Name is excluded by name rather than by position, so the result does not
# depend on the order of categorical_cols.
[data[x].unique() for x in categorical_cols if x != 'Car_Name']

Fuel type has three unique values: Petrol / Diesel / CNG  
Seller type: Dealer / Individual  
Transmission: Manual / Automatic

In [None]:
# Car_Name is not used as a feature, so remove it from the frame.
data = data.drop(columns=['Car_Name'])

In [None]:
data.head()

Now, for the year column, what we need is the difference between the current year (2020) and the year the car was manufactured. This gives us the age of the car in years.

In [None]:
# Car's age in years, measured against the reference year 2020.
reference_year = 2020
data['Car_Age'] = reference_year - data['Year']
data.head()

In [None]:
# Year has been replaced by Car_Age, so remove it from the frame.
data = data.drop(columns=['Year'])

# One Hot Encoding categorical columns

In [None]:
# One-hot encode the remaining categorical columns. drop_first=True omits the
# first level of each category, so k categories become k-1 indicator columns.
data = pd.get_dummies(data, drop_first=True)  # returns the frame with the dummy columns in place of the originals
data.head()

In [None]:
data.shape

# Finding Correlation

In [None]:
data.corr()

In [None]:
import seaborn as sns
# Annotated heatmap of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(18, 8))
sns.heatmap(data.corr(), annot=True, ax=ax);

# Getting our data into X and y

In [None]:
# X is the same frame object as data. Popping Selling_Price removes that column
# from the frame and returns it as the target Series, leaving only features in X.
X = data
y = X.pop('Selling_Price')

In [None]:
X.head()

In [None]:
y.head()

# Checking Feature importance

Which features are important and which are not?

In [None]:
# Feature importance
from sklearn.ensemble import ExtraTreesRegressor
# Fix the seed: extra-trees draws its split thresholds at random, so without a
# random_state the importances (and possibly their ranking) change on every run.
model = ExtraTreesRegressor(random_state=42)
model.fit(X, y)

In [None]:
# Bar chart of the fitted model's importances, one bar per feature.
# Bars are labelled from X (the frame the model was fitted on) rather than from
# `data`, whose columns only line up because Selling_Price was popped out of it.
fig, ax = plt.subplots(figsize=(18, 8))
sns.barplot(x=X.columns, y=model.feature_importances_, ax=ax)
ax.set(xlabel='Feature', ylabel='Importance', title='ExtraTreesRegressor feature importances');

As you can see, present price is the most important feature, followed by diesel-based cars, and so on and so forth.

# Train-Test split

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train.shape, X_test.shape

# Model Building: Linear Regression

In [None]:
#Create the regressor: reg
from sklearn.linear_model import LinearRegression
reg = LinearRegression()

In [None]:
#Fit the regressor to the training data
reg.fit(X_train, y_train)

In [None]:
# Predict on the test data: y_pred
y_pred = reg.predict(X_test)

In [None]:
# Root mean squared error of the linear model on the held-out test set.
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error: {}".format(rmse))

# Cross validation with linear regression

In [None]:
from sklearn.model_selection import cross_val_score

# Define regression_model_cv, which cross-validates an (unfitted) estimator.
# The k = 5 hyperparameter gives the number of folds.
def regression_model_cv(model, k=5, features=None, target=None):
    """Print the per-fold and mean RMSE of `model` under k-fold cross-validation.

    Parameters
    ----------
    model : estimator
        scikit-learn regressor. It does not need to be fitted beforehand:
        cross_val_score fits a fresh clone of it on every fold.
    k : int, default 5
        Passed to cross_val_score as `cv` (the number of folds).
    features, target : optional
        Data to cross-validate on. When omitted, the notebook-level `X` and `y`
        are used, which is what the existing one- and two-argument calls rely on.

    Returns
    -------
    None
        The per-fold RMSE array and its mean are printed.
    """
    if features is None:
        features = X
    if target is None:
        target = y
    # scikit-learn scorers follow "higher is better", so MSE comes back negated;
    # flip the sign before taking the square root.
    scores = cross_val_score(model, features, target, scoring='neg_mean_squared_error', cv=k)
    rmse = np.sqrt(-scores)
    print('Reg rmse:', rmse)
    print('Reg mean:', rmse.mean())

In [None]:
regression_model_cv(LinearRegression())


In [None]:
# Repeat the cross-validation of LinearRegression() with 3 folds instead of the default 5.
regression_model_cv(LinearRegression(), k=3)

In [None]:
# Now, test the values for 6 folds
regression_model_cv(LinearRegression(), k=6)

So with k=5 we get the best results.

# Regularization: Ridge and Lasso

In [None]:
#We begin by setting Ridge() as a parameter for regression_model_cv
from sklearn.linear_model import Ridge
regression_model_cv(Ridge())

In [None]:
# Now, set Lasso() as the parameter for regression_model_cv:
from sklearn.linear_model import Lasso
regression_model_cv(Lasso())

With Ridge, we get a slightly better result, great!

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Seed the forest so the search scores and final predictions are reproducible.
# The random_state given to RandomizedSearchCV only fixes which parameter
# candidates are sampled, not the randomness inside each forest.
clf = RandomForestRegressor(random_state=42)

In [None]:
# Hyperparameters
n_estimators = [100,200,300,500]
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For a regressor
# it meant "consider all features", which is written as 1.0 in current versions.
max_features = [1.0, 'sqrt']
max_depth = [5, 10, 15, 20]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
grid = {'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth
}

In [None]:
# Randomized search over `grid`: 10 sampled candidates, 5-fold CV, scored by negative MSE.
clf_cv = RandomizedSearchCV(
    estimator=clf,
    param_distributions=grid,
    scoring='neg_mean_squared_error',
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
)

In [None]:
clf_cv.fit(X_train, y_train)

In [None]:
preds = clf_cv.predict(X_test)
preds

In [None]:
from sklearn.metrics import mean_squared_error
# Take the square root explicitly. The `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, where passing it raises a TypeError.
rmse_value = np.sqrt(mean_squared_error(y_test, preds))
rmse_value

In [None]:
# Distribution of the residuals (actual minus predicted).
# sns.distplot is deprecated in seaborn; histplot with a KDE overlay on a
# density scale is the documented replacement for this usage.
sns.histplot(y_test - preds, kde=True, stat='density');

In [None]:
# Predicted vs. actual values on the test set. Points on the dashed diagonal
# are perfect predictions; the axes are labelled so the figure stands alone.
fig, ax = plt.subplots()
ax.scatter(y_test, preds)
lims = [min(y_test.min(), preds.min()), max(y_test.max(), preds.max())]
ax.plot(lims, lims, linestyle='--', color='grey')
ax.set(xlabel='Actual Selling_Price', ylabel='Predicted Selling_Price');