In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file beneath the Kaggle input directory and echo its full path.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the CarDekho vehicle dataset CSV into a DataFrame and preview the first rows.
csv_path = "../input/vehicle-dataset-from-cardekho/car data.csv"
data = pd.read_csv(filepath_or_buffer=csv_path)
data.head()

In [None]:
# Report the frame's dimensions, one line per axis.
row_count, column_count = data.shape
print(f"This dataframe has {row_count} rows")
print(f"This dataframe has {column_count} columns")

In [None]:
# Print the distinct values of each candidate categorical column, in turn.
candidate_columns = ('Seller_Type', 'Transmission', 'Owner', 'Fuel_Type')
for column in candidate_columns:
    print(data[column].unique())

In [None]:
# Count missing entries per column (isnull is pandas' alias of isna).
data.isnull().sum()

In [None]:
# Summary statistics for the columns of `data`, displayed as the cell's output.
data.describe()

In [None]:
# Derive each vehicle's age in years, measured against a fixed reference year.
REFERENCE_YEAR = 2020
data['Age'] = data['Year'].rsub(REFERENCE_YEAR)  # i.e. REFERENCE_YEAR - Year
data.head()

In [None]:
# Show the column labels of `data` (now including the derived 'Age' column).
data.columns

In [None]:
# Keep only the columns used for modelling. The explicit .copy() makes
# final_df an independent frame, so later in-place edits (such as dropping
# 'Year') neither touch `data` nor trigger pandas' SettingWithCopyWarning.
model_columns = [
    'Year', 'Age', 'Selling_Price', 'Present_Price', 'Kms_Driven',
    'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner',
]
final_df = data[model_columns].copy()
final_df.sample(5)

In [None]:
# 'Year' is redundant once 'Age' has been derived from it. Rebind the name
# instead of dropping in place: final_df was selected out of `data`, and
# mutating such a selection in place can emit SettingWithCopyWarning.
# errors='ignore' keeps the cell safe to re-run after the column is gone.
final_df = final_df.drop(columns=['Year'], errors='ignore')
final_df.head()

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each encoded column. Then preview three random rows.
encoded_df = pd.get_dummies(data=final_df, drop_first=True)
final_df = encoded_df
final_df.sample(3)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Annotated heatmap of the pairwise correlations between all encoded columns,
# drawn on an explicitly created 12x12 figure.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 12))
cmap = sns.color_palette("crest", as_cmap=True)
corr_matrix = final_df.corr()
sns.heatmap(corr_matrix, cmap=cmap, annot=True, ax=heatmap_ax)

In [None]:
# Preview the first rows of the encoded modelling frame.
final_df.head()

In [None]:
# Split the encoded frame into the predictor matrix X and the target vector y.
feature_columns = [
    'Age', 'Present_Price', 'Kms_Driven', 'Owner',
    'Fuel_Type_Diesel', 'Fuel_Type_Petrol',
    'Seller_Type_Individual', 'Transmission_Manual',
]
target_column = 'Selling_Price'

X = final_df.loc[:, feature_columns]

y = final_df.loc[:, target_column]

In [None]:
# Preview the first values of the target (the 'Selling_Price' column).
y.head()

In [None]:
## Feature importance
from sklearn.ensemble import ExtraTreesRegressor
# Seed the ensemble so the feature importances are the same on every run;
# an unseeded ExtraTreesRegressor draws different random splits each time.
model = ExtraTreesRegressor(random_state=42)
model.fit(X, y)

In [None]:
# Collect the fitted model's per-feature importances alongside the matching
# predictor names, ready for plotting.
feat_imp, colnames = model.feature_importances_, X.columns

In [None]:
# Bar chart of the feature importances; tick labels are rotated to vertical
# so the column names stay legible.
importance_ax = plt.gca()
importance_ax.bar(colnames, feat_imp)
plt.setp(importance_ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. random_state pins the shuffle so
# the split, and every metric and plot computed from it, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Hyperparameters: candidate values for the random-forest randomized search.

# Number of trees in random forest: 100, 200, ..., 1200
n_estimators = list(range(100, 1201, 100))

# Number of features to consider at every split.
# 1.0 (use all features) replaces the legacy 'auto' alias: for regressors it
# selects the same number of features, but 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it is rejected as invalid.
max_features = [1.0, 'sqrt']

# Levels in tree: 5, 10, ..., 30
max_depth = list(range(5, 31, 5))

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [None]:
from sklearn.model_selection import RandomizedSearchCV


In [None]:
# Assemble the search space: each estimator parameter name maps to the list of
# candidate values defined above. Printed for a quick visual check.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

print(random_grid)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the base forest so each candidate's fit is repeatable; the search's own
# sampling of parameter combinations is seeded separately below.
rf = RandomForestRegressor(random_state=42)

# Random search of parameters, using 5-fold cross validation,
# sampling 10 different combinations from random_grid.
# Candidates are ranked by negative mean squared error (higher is better).
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               scoring='neg_mean_squared_error',
                               n_iter=10,
                               cv=5,
                               verbose=2,
                               random_state=42,
                               n_jobs=1)

rf_random.fit(X_train, y_train)

In [None]:
# Predict on the hold-out rows with the best estimator found by the search
# (the one the search refit on the full training split), then display them.
best_forest = rf_random.best_estimator_
predictions = best_forest.predict(X_test)
predictions

In [None]:
# Distribution of the hold-out residuals. The difference is renamed because it
# would otherwise inherit y_test's name and label the x-axis 'Selling_Price',
# even though the plotted values are prediction errors.
residuals = (y_test - predictions).rename('Residual (actual - predicted)')
sns.histplot(residuals, kde=True)

In [None]:
# Actual vs. predicted target values on the hold-out set, labelled so the
# figure can be read on its own.
plt.scatter(y_test, predictions)
plt.xlabel('Actual Selling_Price')
plt.ylabel('Predicted Selling_Price')
plt.title('Actual vs. predicted Selling_Price (test set)')
plt.show()