# Importing the libraries

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Importing the dataset and exploring it

In [None]:
# Read the car listings CSV (path is relative to the notebook's working directory).
csv_path = "../input/vehicle-dataset-from-cardekho/car data.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# (number of rows, number of columns) of the raw data.
df.shape

In [None]:
# Show the distinct values present in each of these columns.
for column in ('Seller_Type', 'Transmission', 'Owner'):
    print(df[column].unique())

In [None]:
# Count missing values per column (`isna` is the canonical name for `isnull`).
df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the raw data.
df.describe()

In [None]:
# Column names available in the raw data.
df.columns

In [None]:
# Select the columns used for modelling. `.copy()` makes `final` an independent
# frame, so later column assignments on `final` do not write into a slice of
# `df` (which triggers pandas' SettingWithCopyWarning).
feature_columns = ['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
                   'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']
final = df[feature_columns].copy()

In [None]:
# Preview the selected modelling columns.
final.head()

# Creating a new feature

In [None]:
# Constant reference year, stored as a column so it can be combined with `Year`.
reference_year = 2021
final['Current_Year'] = reference_year
final.head()

In [None]:
# Years elapsed between the `Current_Year` column and the `Year` column.
final['Years_old'] = final['Current_Year'].sub(final['Year'])
final.head()

In [None]:
# `Year` is superseded by `Years_old`. Rebind to the returned frame instead of
# using `inplace=True`, which mutates a frame derived from `df` and can raise
# SettingWithCopyWarning.
final = final.drop(columns=['Year'])

In [None]:
# `Current_Year` is a constant helper column with no predictive value. Rebind to
# the returned frame instead of mutating with `inplace=True`.
final = final.drop(columns=['Current_Year'])

In [None]:
# Check the frame after the `Year` / `Current_Year` columns were dropped.
final.head()

# One Hot encoding

In [None]:
# One-hot encode the string-valued columns; `drop_first=True` drops one level per
# column to avoid perfectly collinear dummies. `dtype=int` keeps the dummies as
# 0/1 integers on every pandas version (the default became bool in pandas 2.0),
# so the correlation, plotting and model cells all receive numeric columns.
final = pd.get_dummies(final, drop_first=True, dtype=int)

In [None]:
# Preview the encoded frame.
final.head()

In [None]:
# (rows, columns) after one-hot encoding.
final.shape

# Relationship among features

In [None]:
# Pairwise correlation matrix of all columns in `final`.
final.corr()

In [None]:
# Grid of pairwise plots for every pair of columns in `final`.
sns.pairplot(final)

In [None]:
# Annotated heatmap of the correlation matrix. The matrix is computed once and
# reused; re-selecting `final[corr.index]` and calling `.corr()` again yields the
# same matrix, so the second computation was redundant.
corr = final.corr()
corr_features = corr.index
plt.figure(figsize=(10, 10))
g = sns.heatmap(corr, annot=True)

In [None]:
# Rank every column by its correlation with Selling_Price, highest first.
corr_matrix = final.corr()
x = corr_matrix.sort_values(by="Selling_Price", ascending=False)
print(x['Selling_Price'])

In [None]:
# Re-check the column order before splitting into predictors and target.
final.head()

In [None]:
# Split into predictors and target by column name rather than by position, so
# the split stays correct even if the column order of `final` changes.
target_column = 'Selling_Price'
X = final.drop(columns=[target_column])
y = final[target_column]

# Important features

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
# Fit an extra-trees ensemble only to read its feature importances. The fixed
# seed makes the importances reproducible from run to run.
model = ExtraTreesRegressor(random_state=42)
model.fit(X, y)

In [None]:
# Raw importance scores, in the same order as the columns of X.
print(model.feature_importances_)

In [None]:
# Horizontal bar chart of the five most important features.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
top_five = feat_importances.nlargest(5)
top_five.plot(kind='barh')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. The fixed seed keeps the split (and every
# metric or plot derived from it) identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# (rows, columns) of the training predictors.
X_train.shape

# Using Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with default hyperparameters.
# NOTE(review): `reg` is not referenced by any later cell visible here; the
# randomized search is built on a separate instance named `rf`.
reg = RandomForestRegressor()

In [None]:
## Hyperparameters
import numpy as np
# Candidate forest sizes: 12 evenly spaced values from 100 to 1200.
n_estimators = list(map(int, np.linspace(100, 1200, 12)))
print(n_estimators)

# For hyperparameter tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for the randomized hyperparameter search.
# Number of trees in the forest: 100, 200, ..., 1200.
n_estimators = [int(x) for x in np.linspace(100, 1200, 12)]
# Number of features considered at each split. 1.0 means "all features", which is
# what 'auto' meant for RandomForestRegressor; 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it makes every fit fail.
max_features = [1.0, 'sqrt']
# Maximum tree depth: 5, 10, ..., 30.
max_depth = [int(x) for x in np.linspace(5, 30, 6)]
# Minimum number of samples required to split an internal node.
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at a leaf node.
min_samples_leaf = [1, 2, 5, 10]

In [None]:
# Search space handed to RandomizedSearchCV: parameter name -> candidate values.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [None]:
# Base model whose hyperparameters are tuned by the randomized search.
# The fixed seed makes the fitted forests (and therefore the selected
# hyperparameters and predictions) reproducible.
rf = RandomForestRegressor(random_state=42)

In [None]:
# Randomized search: 10 sampled parameter combinations, each scored with 5-fold
# cross-validation on negative mean squared error, run in a single process.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    scoring='neg_mean_squared_error',
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

In [None]:
# Run the search on the training split only; the test split stays unseen.
rf_random.fit(X_train,y_train)

# Making predictions

In [None]:
# Predict on the held-out test predictors with the fitted search object.
predictions = rf_random.predict(X_test)
predictions

In [None]:
# Distribution of the residuals (actual - predicted) on the test split.
# `distplot` is deprecated since seaborn 0.11; `histplot` with a KDE overlay and
# density scaling is its documented replacement.
residuals = y_test - predictions
sns.histplot(residuals, kde=True, stat="density")

#### The residuals look approximately normally distributed, which suggests the predictions are reasonable.

In [None]:
# Actual vs. predicted target values on the test split; points near the
# diagonal indicate accurate predictions. Axis labels make the figure readable
# on its own.
plt.scatter(y_test, predictions)
plt.xlabel('Actual Selling_Price')
plt.ylabel('Predicted Selling_Price')
plt.show()

In [None]:
import pickle
# Serialize the fitted search object. The `with` block guarantees the file is
# flushed and closed even if pickling fails; the original left the handle open.
with open('random_forest_regression_model.pkl', 'wb') as file:
    pickle.dump(rf_random, file)

#### Saved this model in a pickle file for future use

### Thank You!
### Do **upvote** if you find this useful