 # **CAR PRICE PREDICTION**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib
%matplotlib inline

# Notebook-wide display defaults: show every column, cap printed rows at 150.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)

# Plot styling: seaborn grid first, then matplotlib overrides applied in one go.
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',  # 8-digit hex with alpha 00
})

In [None]:
# Folder holding the dataset files; change this single constant to run the
# notebook outside of Kaggle.
DATA_DIR = '/kaggle/input/used-cars-dataset-from-cardekhocom'

# Two CSV files from the same folder; later cells combine columns from both.
imputed_df = pd.read_csv(f'{DATA_DIR}/cardekho_imputated.csv')
updated_df = pd.read_csv(f'{DATA_DIR}/cardekho_updated.csv')

In [None]:
# Preview the first five rows of the imputed file.
imputed_df.head(n=5)

In [None]:
# Preview the first five rows of the updated file.
updated_df.head(n=5)

In [None]:
# Print column names, dtypes and non-null counts for the imputed file.
imputed_df.info()

In [None]:
# Print column names, dtypes and non-null counts for the updated file.
updated_df.info()

In [None]:
# Number of missing values in each column of the updated file.
updated_df.isna().sum()

In [None]:
# Scale factors and reference year used by the derived columns below.
PRICE_SCALE = 100000   # presumably rupees -> lakhs; confirm against the data source
KM_SCALE = 10000
REFERENCE_YEAR = 2021

# Midpoint of the min/max cost price from the imputed file, rounded to 2 decimals
# before scaling (same order of operations as before).
mid_cost_price = (imputed_df['min_cost_price'] + imputed_df['max_cost_price']) / 2

# Column assignments align on the row index of updated_df.
updated_df['km_driven'] = imputed_df['km_driven'] / KM_SCALE
updated_df['new_price'] = mid_cost_price.round(2) / PRICE_SCALE

# NOTE: this line overwrites its own input, so running the cell a second time
# turns `year` back into its previous values. Run it exactly once per load.
updated_df['year'] = REFERENCE_YEAR - updated_df['year']

In [None]:
# Keep an independent copy of the price column before it is dropped from the
# frame; `tar` and `New_price` are two names for the same Series.
New_price = tar = updated_df['new_price'].copy()

In [None]:
# Remove the target from the feature frame. Re-running this cell raises a
# KeyError because the column is already gone.
updated_df = updated_df.drop(columns=['new_price'])

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

**Identify input and target columns**

In [None]:
# Features are every column except the first and the last (positional slice).
input_cols = updated_df.columns[1:-1]
# Despite its name, target_col holds the saved price Series, not a column label.
target_col = New_price

# Work on copies so later preprocessing does not touch updated_df / New_price.
inputs_df = updated_df[input_cols].copy()
targets = target_col.copy()

In [None]:
# Rows without a known price cannot supervise the model. Filling the missing
# target with 0 would train on fabricated zero prices and distort the RMSE, so
# drop those rows from both the features and the target instead.
# inputs_df and targets share the same row index, so one mask filters both.
has_target = targets.notna()
inputs_df = inputs_df.loc[has_target].copy()
targets = targets.loc[has_target]

**Identify numeric and categorical columns**

In [None]:
# Split the feature columns by dtype: numeric vs. object (string-like).
feature_frame = updated_df[input_cols]
numeric_cols = list(feature_frame.select_dtypes(include=[np.number]).columns)
categorical_cols = list(feature_frame.select_dtypes(include=['object']).columns)

**Impute and scale numeric columns**

In [None]:
# NOTE(review): both transformers are fitted on the full dataset, i.e. before
# the train/validation split further down, so validation rows influence the
# imputation and scaling statistics.

# Fill missing numeric values using SimpleImputer's default strategy.
imputer = SimpleImputer()
inputs_df[numeric_cols] = imputer.fit_transform(inputs_df[numeric_cols])

# Rescale every numeric column with MinMaxScaler's default range.
scaler = MinMaxScaler()
inputs_df[numeric_cols] = scaler.fit_transform(inputs_df[numeric_cols])

In [None]:
# Preview the processed features; a few rows are enough to check the scaling.
inputs_df.head()

**One-hot encode categorical columns**

In [None]:
# Columns whose missing entries get the explicit 'Unknown' label before encoding.
sentinel_fill_cols = ['mileage', 'engine', 'max_power']
inputs_df[sentinel_fill_cols] = inputs_df[sentinel_fill_cols].replace(np.nan, 'Unknown')

In [None]:
# Cast every categorical column to str so the encoder sees one uniform type.
# Any remaining missing value becomes the literal text 'nan'.
inputs_df[categorical_cols] = inputs_df[categorical_cols].astype(str)

In [None]:
# One-hot encode the categorical columns into dense 0/1 indicator columns.
# Uses the current scikit-learn API (requires scikit-learn >= 1.2):
#   * `sparse_output=False` replaces the removed `sparse=False` argument;
#   * `get_feature_names_out` replaces the removed `get_feature_names` method.
# handle_unknown='ignore' encodes categories unseen at fit time as all zeros.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(inputs_df[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
inputs_df[encoded_cols] = encoder.transform(inputs_df[categorical_cols])

**Create training and validation sets**

In [None]:
# Model features are the scaled numeric columns plus the one-hot columns;
# 25% of the rows are held out for validation with a fixed seed.
model_feature_cols = numeric_cols + encoded_cols
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    inputs_df[model_feature_cols],
    targets,
    test_size=0.25,
    random_state=42,
)

## **Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Decision tree with default hyperparameters; the fixed seed makes the fit repeatable.
tree = DecisionTreeRegressor(random_state=42)

In [None]:
# Train the decision tree on the training split only.
tree.fit(train_inputs, train_targets)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Predictions on the rows the tree was trained on.
tree_train_preds = tree.predict(train_inputs)

In [None]:
# RMSE on the training split. The square root is taken explicitly because the
# `squared=False` flag is deprecated in recent scikit-learn releases and later
# removed; np.sqrt(MSE) gives the same value on every version.
tree_train_rmse = np.sqrt(mean_squared_error(train_targets, tree_train_preds))

In [None]:
# Predictions on the held-out validation rows.
tree_val_preds = tree.predict(val_inputs)

In [None]:
# RMSE on the validation split. The square root is taken explicitly because the
# `squared=False` flag is deprecated in recent scikit-learn releases and later
# removed; np.sqrt(MSE) gives the same value on every version.
tree_val_rmse = np.sqrt(mean_squared_error(val_targets, tree_val_preds))

In [None]:
# Side-by-side train vs. validation error for the decision tree.
rmse_report = 'Train RMSE: {}, Validation RMSE: {}'
print(rmse_report.format(tree_train_rmse, tree_val_rmse))

In [None]:
from sklearn.tree import plot_tree, export_text
# Re-applies the same 'darkgrid' style that the configuration cell at the top already set.
sns.set_style('darkgrid')

***Visualize the tree graphically using plot_tree***

In [None]:
# Draw only the top three levels of the fitted tree (max_depth=3) on a large canvas.
# feature_names is passed as a plain list, matching the export_text call in this
# notebook; some scikit-learn releases validate this argument and reject a
# pandas Index.
plt.figure(figsize=(30, 15))
plot_tree(tree, feature_names=list(train_inputs.columns), max_depth=3, filled=True);

***Visualize the tree textually using export_text***

In [None]:
# Text rendering of the tree's decision rules, limited to depth 10.
tree_feature_names = train_inputs.columns.tolist()
tree_text = export_text(tree, feature_names=tree_feature_names, max_depth=10)

In [None]:
# Show only the first 2000 characters of the text rendering
# (this is a character slice, not a line count).
print(tree_text[:2000])

**Check feature importance**

In [None]:
# One importance score per input column, in the same order as train_inputs.columns.
tree_importances = tree.feature_importances_

In [None]:
# Pair each feature name with its importance and rank from most to least important.
tree_importance_table = pd.DataFrame(
    {'feature': train_inputs.columns, 'importance': tree_importances}
)
tree_importance_df = tree_importance_table.sort_values(by='importance', ascending=False)

In [None]:
# The frame has one row per feature; show only the highest-ranked ones.
tree_importance_df.head(10)

In [None]:
# Bar chart of the ten most important features for the decision tree.
fig, ax = plt.subplots()
sns.barplot(data=tree_importance_df.head(10), x='importance', y='feature', ax=ax)
ax.set_title('Decision Tree Feature Importance');

In [None]:
from sklearn.ensemble import RandomForestRegressor

*Create a random forest regressor with default hyperparameters*

In [None]:
# Baseline random forest: default hyperparameters, all CPU cores (n_jobs=-1), fixed seed.
rf1 = RandomForestRegressor(n_jobs=-1, random_state=42)

In [None]:
# Train the baseline forest on the training split only.
rf1.fit(train_inputs, train_targets)

In [None]:
# Baseline forest predictions on the training rows.
rf1_train_preds = rf1.predict(train_inputs)

In [None]:
# RMSE on the training split. The square root is taken explicitly because the
# `squared=False` flag is deprecated in recent scikit-learn releases and later
# removed; np.sqrt(MSE) gives the same value on every version.
rf1_train_rmse = np.sqrt(mean_squared_error(train_targets, rf1_train_preds))

In [None]:
# Baseline forest predictions on the held-out validation rows.
rf1_val_preds = rf1.predict(val_inputs)

In [None]:
# RMSE on the validation split. The square root is taken explicitly because the
# `squared=False` flag is deprecated in recent scikit-learn releases and later
# removed; np.sqrt(MSE) gives the same value on every version.
rf1_val_rmse = np.sqrt(mean_squared_error(val_targets, rf1_val_preds))

In [None]:
# Side-by-side train vs. validation error for the baseline forest.
rmse_report = 'Train RMSE: {}, Validation RMSE: {}'
print(rmse_report.format(rf1_train_rmse, rf1_val_rmse))

# **Training the Best Model**

In [None]:
# Second forest with hand-picked hyperparameters: 500 trees, 7 candidate
# features per split, all CPU cores, fixed seed.
rf2_params = dict(
    n_estimators=500,
    max_features=7,
    random_state=42,
    n_jobs=-1,
)
rf2 = RandomForestRegressor(**rf2_params)

In [None]:
# Train the customised forest on the training split only.
rf2.fit(train_inputs, train_targets)

In [None]:
# Score the customised forest on both splits (the estimator's default .score
# metric) and display the pair.
rf2_train_score = rf2.score(train_inputs, train_targets)
rf2_val_score = rf2.score(val_inputs, val_targets)
rf2_train_score, rf2_val_score

In [None]:
# Customised forest predictions on the training rows.
rf2_train_preds = rf2.predict(train_inputs)

In [None]:
# RMSE on the training split. The square root is taken explicitly because the
# `squared=False` flag is deprecated in recent scikit-learn releases and later
# removed; np.sqrt(MSE) gives the same value on every version.
rf2_train_rmse = np.sqrt(mean_squared_error(train_targets, rf2_train_preds))

In [None]:
# Customised forest predictions on the held-out validation rows.
rf2_val_preds = rf2.predict(val_inputs)

In [None]:
# RMSE on the validation split. The square root is taken explicitly because the
# `squared=False` flag is deprecated in recent scikit-learn releases and later
# removed; np.sqrt(MSE) gives the same value on every version.
rf2_val_rmse = np.sqrt(mean_squared_error(val_targets, rf2_val_preds))

In [None]:
# Side-by-side train vs. validation error for the customised forest.
rmse_report = 'Train RMSE: {}, Validation RMSE: {}'
print(rmse_report.format(rf2_train_rmse, rf2_val_rmse))

In [None]:
# Pair each feature name with the forest's importance score and rank from
# most to least important.
rf2_importance_table = pd.DataFrame(
    {'feature': train_inputs.columns, 'importance': rf2.feature_importances_}
)
rf2_importance_df = rf2_importance_table.sort_values(by='importance', ascending=False)

In [None]:
# The frame has one row per feature; show only the highest-ranked ones.
rf2_importance_df.head(10)

In [None]:
# Plot only the ten most important features, with a title, mirroring the
# decision-tree importance chart. Plotting every row would draw one bar per
# feature in the frame.
plt.title('Random Forest Feature Importance')
sns.barplot(data=rf2_importance_df.head(10), x='importance', y='feature');
