# Car Prediction

*This notebook analyzes the car dataset from cardekho and predicts the price.*

In [None]:
import pandas as pd
import numpy as np 

In [None]:
df=pd.read_csv('/kaggle/input/vehicle-dataset-from-cardekho/car data.csv')

In [None]:
df.shape

In [None]:
print(df['Seller_Type'].unique())
print(df['Fuel_Type'].unique())
print(df['Transmission'].unique())
print(df['Owner'].unique())

In [None]:
#check missing values
df.isnull().sum()

In [None]:
df.describe()

In [None]:
df.head()

In [None]:
df.drop(["Car_Name"],axis=1,inplace=True)

In [None]:
df['Current Year']=2021

In [None]:
df['no_year']=df['Current Year']- df['Year']

In [None]:
df.head()

In [None]:
df.drop(['Year'],axis=1,inplace=True)

In [None]:
df.head()

In [None]:
df=pd.get_dummies(df,drop_first=True)

In [None]:
df.head()

In [None]:
df=df.drop(['Current Year'],axis=1)

In [None]:
df.head()

In [None]:
# Pairwise correlation matrix of the encoded frame's columns.
df.corr()

In [None]:
import seaborn as sns
# Grid of pairwise plots across the frame's columns.
sns.pairplot(df)

In [None]:
#get correlations of each features in dataset
import matplotlib.pyplot as plt
# Compute the correlation matrix once; it is reused directly for the plot.
corrmat = df.corr()
# Labels of the correlation matrix (every column that took part in it).
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
#plot heat map
# Selecting corrmat.index from df and calling .corr() again would rebuild this
# exact matrix, so the precomputed corrmat is plotted instead.
g=sns.heatmap(corrmat,annot=True,cmap="RdYlGn")

In [None]:
# Positional split: every column except the first becomes a feature (X),
# and the first column becomes the target (y).
# NOTE(review): this assumes the price column sits at position 0 after the
# earlier drops and encoding — confirm against df.head().
X=df.iloc[:,1:]
y=df.iloc[:,0]

In [None]:
# Distinct values of the 'Owner' feature.
X['Owner'].unique()

In [None]:
# Preview of the feature matrix.
X.head()

In [None]:
# Display the target series.
y

In [None]:
#Feature Importance
# Fit a tree ensemble on the full feature matrix; the fitted model is only
# used afterwards to read its feature_importances_.

from sklearn.ensemble import ExtraTreesRegressor
# A fixed random_state makes the importances identical on every
# Restart & Run All; unseeded, they change from run to run.
model = ExtraTreesRegressor(random_state=0)
model.fit(X,y)

In [None]:
print(model.feature_importances_)

In [None]:
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(5).plot(kind='barh')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# NOTE(review): `regressor` is not referenced again in the visible cells;
# the randomized search below is built on a separate `rf` instance.
regressor=RandomForestRegressor()

In [None]:
# Candidate forest sizes: 12 evenly spaced integers from 100 to 1200.
n_estimators = np.linspace(100, 1200, 12).astype(int).tolist()
print(n_estimators)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

#Randomized Search CV
#Candidate values for each random-forest hyperparameter sampled by the search.
#Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
#Number of features to consider at every split.
#1.0 means "all features", which is what the 'auto' alias meant for regressors;
#'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where passing it
#makes every fit that samples it fail parameter validation.
max_features = [1.0, 'sqrt']
#Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
#Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
#Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [None]:
#Create the random grid
# Each key is the estimator parameter name; each value is the list of
# candidates the randomized search may draw from.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

print(random_grid)

In [None]:
# Base estimator for the randomized search. Seeding the forest itself (not just
# the search's sampling) keeps the cross-validation scores and the selected
# parameters reproducible across runs.
rf = RandomForestRegressor(random_state=42)

In [None]:
#Random search of parameters, using 5 fold cross validation (cv = 5)
#search across 10 sampled combinations (n_iter = 10), scored by negative mean squared error
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,scoring='neg_mean_squared_error', n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = 1)

In [None]:
# Run the randomized hyperparameter search on the training split.
rf_random.fit(X_train,y_train)

In [None]:
# Parameter combination that achieved the best cross-validated score.
rf_random.best_params_

In [None]:
# Best cross-validated score. The search was configured with
# scoring='neg_mean_squared_error', so this is a negated MSE (closer to 0 is better).
rf_random.best_score_

In [None]:
# Predict the target for the held-out test split.
predictions=rf_random.predict(X_test)

In [None]:
#scatter plot
# Actual vs. predicted target values on the test split; points close to the
# diagonal are good predictions. Labels are set so the figure stands alone.
plt.scatter(y_test,predictions)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs. predicted (test set)')
plt.show()

In [None]:
#displaying error measures
from sklearn import metrics

# Compute each measure once, then report; RMSE is the square root of the MSE.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)