## DataSet
https://www.kaggle.com/nehalbirla/vehicle-dataset-from-cardekho

## Use Case: Predict the selling price of a car

In [None]:
import pandas as pd

In [None]:
df =pd.read_csv("../input/vehicle-dataset-from-cardekho/car data.csv")

In [None]:
df.head()

In [None]:
df.shape

In [None]:
print(df["Seller_Type"].unique())
print(df["Transmission"].unique())
print(df["Owner"].unique())
print(df["Fuel_Type"].unique())

In [None]:
##check missing or null values
df.isnull().sum()

In [None]:
df.describe()

In [None]:
df.columns

In [None]:
final_dataset = df[['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']]

In [None]:
final_dataset.head()

In [None]:
final_dataset["Current_Year"]=2021

In [None]:
final_dataset.head()

In [None]:
final_dataset["no_year"]=final_dataset["Current_Year"] - final_dataset["Year"]

In [None]:
final_dataset.head()

In [None]:
final_dataset.drop(['Year'],axis=1,inplace=True)

In [None]:
final_dataset.head()

In [None]:
final_dataset.drop(["Current_Year"],axis=1,inplace=True)

In [None]:
final_dataset.head()

In [None]:
#Now we will be converting categorical features

In [None]:
#.get_dummies is used to convert categorical to one hot encoding 
final_dataset=pd.get_dummies(final_dataset,drop_first=True)
#drop_first is used to avoid the first column to get into 
#one hot encoding trap


In [None]:
final_dataset.head()

In [None]:
final_dataset.corr()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
sns.pairplot(final_dataset)

In [None]:
#pair in form of heatmap

In [None]:
corrmat = final_dataset.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
#plot heat map
g = sns.heatmap(final_dataset[top_corr_features].corr()
,annot=True,cmap="RdYlGn")

In [None]:
X =final_dataset.iloc[:,1:] 
y = final_dataset.iloc[:,0]

In [None]:
X.head()

In [None]:
y.head()

In [None]:
## feature importance
from sklearn.ensemble import ExtraTreesRegressor
#to know ehich are important features in this

model = ExtraTreesRegressor()
model.fit(X,y)

In [None]:
print(model.feature_importances_)

In [None]:
#plot graph of feature importance for better visualisation
feat_importances = pd.Series(model.feature_importances_,index=X.columns)
feat_importances.nlargest(5).plot(kind="barh")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.2)

In [None]:
X_train.shape

In [None]:
X_train.head()

In [None]:
# Implementing a Random Forest regressor.
# No feature scaling is applied because a random forest is built from
# decision trees.


In [None]:
from sklearn.ensemble import RandomForestRegressor
# NOTE: rf_random is rebound to a RandomizedSearchCV instance in a later
# cell before anything is fitted with it.
rf_random = RandomForestRegressor()


In [None]:
# Hyperparameters: preview the candidate tree counts
# (12 evenly spaced values from 100 to 1200).
import numpy as np
n_estimators = np.linspace(start=100, stop=1200, num=12).astype(int).tolist()
print(n_estimators)

In [None]:
# Randomized search CV: candidate values for each hyperparameter.

# Number of trees in the random forest
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]

# Number of features to consider at every split. 1.0 means "all features",
# which is what the former 'auto' option meant for RandomForestRegressor;
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it is
# rejected as an invalid parameter value.
max_features = [1.0, 'sqrt']

# Maximum number of levels in a tree
max_depth = [int(x) for x in np.linspace(5, 30, num=6)]
# NOTE: None (unlimited depth) is not part of the search space.

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]


In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Assemble the search space that is passed to RandomizedSearchCV as
# param_distributions.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

print(random_grid)

In [None]:
#use the random grid to search the beest hyperparameters
#first create the base model to tune
rf =RandomForestRegressor()

In [None]:
rf_random = RandomizedSearchCV(estimator=rf,param_distributions=
                              random_grid,
                               scoring="neg_mean_squared_error",
                              n_iter =10,cv =5,verbose =2,
                              random_state=42, n_jobs = 1)

In [None]:
rf_random.fit(X_train,y_train)

In [None]:
predictions = rf_random.predict(X_test)

In [None]:
predictions

In [None]:
sns.distplot(y_test-predictions)

In [None]:
plt.scatter(y_test,predictions)

In [None]:
import pickle

# Serialise the fitted search object (rf_random) to disk. The context
# manager closes the file even if pickling raises, so the data is flushed
# and the handle is not leaked.
with open('random_forest_regression_model.pkl', 'wb') as file:
    pickle.dump(rf_random, file)