In [None]:

import os
import pickle

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))



In [None]:
# Load the CarDekho vehicle CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../input/vehicle-dataset-from-cardekho/car data.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Overwrite 'Year' with (2021 - Year).
# NOTE(review): presumably this converts a model year into a vehicle age; the reference year 2021 is hard-coded.
# NOTE(review): the cell overwrites its own input, so running it a second time flips the values back
# to the original years -- re-run from the load cell rather than re-running this cell alone.
df['Year'] = 2021 - df['Year']

In [None]:
# With a negative n, head() returns every row EXCEPT the last |n|, so this displays
# all rows but the final five (it is not the same as tail(5)).
df.head(-5)

In [None]:
# One-hot view of Seller_Type, displayed for inspection only; the result is not assigned.
pd.get_dummies(df['Seller_Type'])

In [None]:
feature_metadata = {'seller_type_metadata':seller_type_metadata,
                    'fuel_type_metadata' : fuel_type_metadata,
                   'transmission_metadata' : transmission_metadata}

In [None]:
metadata_file = open('feature_metadata.pickle','wb')
pickle.dump(feature_metadata,metadata_file)

In [None]:
seller_type_metadata = {'Dealer' : 0, 'Individual' : 1}

In [None]:
df['Seller_Type'] = df['Seller_Type'].map(seller_type_metadata)

In [None]:
pd.get_dummies(df['Fuel_Type'])

In [None]:
fuel_type_metadata = {'Petrol' : 2, 'Diesel' : 1, 'CNG' : 0}

In [None]:
df['Fuel_Type'] = df['Fuel_Type'].map(fuel_type_metadata)

In [None]:
df.head()

In [None]:
pd.get_dummies(df['Transmission'])

In [None]:
transmission_metadata = {'Automatic' : 1, 'Manual' : 0}

In [None]:
df['Transmission'] = df['Transmission'].map(transmission_metadata)

In [None]:
# Model-ready frame: everything except the free-text car name.
final_df = df.drop(columns=['Car_Name'])

In [None]:
# Preview the frame after dropping Car_Name.
final_df.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def plot_distribution(df,feature):
    """Draw a density histogram with a KDE overlay for one column.

    Parameters
    ----------
    df : object indexable by column label (e.g. a pandas DataFrame)
        Source of the values; ``df[feature]`` is plotted.
    feature : hashable
        Column label to plot; also used as the figure title.

    Returns
    -------
    None
        The plot is drawn on a newly created 10x8 figure.
    """
    plt.figure(figsize=[10,8])
    plt.title(feature)
    # sns.distplot is deprecated and scheduled for removal; histplot with these
    # arguments is the documented replacement for its default histogram + KDE output.
    sns.histplot(df[feature], kde=True, stat="density", kde_kws=dict(cut=3))

In [None]:
# One distribution figure per column of the model-ready frame.
for feature in list(final_df):
    plot_distribution(final_df, feature)

In [None]:
def plot_box(df,feature):
    """Draw a horizontal box plot for one column.

    Parameters
    ----------
    df : object indexable by column label (e.g. a pandas DataFrame)
        Source of the values; ``df[feature]`` is plotted.
    feature : hashable
        Column label to plot; also used as the figure title.

    Returns
    -------
    None
        The plot is drawn on a newly created 10x8 figure.
    """
    plt.figure(figsize=[10,8])
    plt.title(feature)
    # Pass the values as `x=` explicitly: the first positional argument of
    # sns.boxplot means `x` in seaborn 0.11 but `data` in 0.12+, which changes
    # the orientation of the plot (and emits a FutureWarning on 0.11).
    sns.boxplot(x=df[feature])

In [None]:
# One box-plot figure per column of the model-ready frame.
for feature in list(final_df):
    plot_box(final_df, feature)

In [None]:
# pairplot creates its own figure (a PairGrid), so a preceding plt.figure(...) call
# would only leave an empty extra figure in the output. Use pairplot's `height`
# argument if larger panels are wanted.
sns.pairplot(final_df)

In [None]:
# Separate the regression target from the predictor columns.
y = final_df['Selling_Price']
x = final_df.drop(columns=['Selling_Price'])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# PCA keeps the directions of largest variance, so unscaled inputs let whichever
# column has the largest numeric range dominate the components. The predictors here
# mix small integer category codes (0/1/2) with raw numeric columns, so each column
# is standardised first. The pipeline still exposes fit_transform()/transform(),
# which is all the later cells call on `pca`.
pca = make_pipeline(StandardScaler(), PCA(n_components=3))

In [None]:
# Fit the projection on the training rows only and keep their transformed coordinates.
x_train_pca_components = pca.fit_transform(x_train)

In [None]:
# Display the transformed training array.
x_train_pca_components

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Seed the forest itself. The random_state given to RandomizedSearchCV only fixes
# which parameter combinations are sampled; without a seed here the bootstrap and
# feature sampling inside every forest differ between runs, so scores and the
# saved model are not reproducible.
regressor = RandomForestRegressor(random_state=42)

In [None]:
# Candidate forest sizes: 20 evenly spaced values from 100 to 2000, as plain Python ints.
n_estimators = np.linspace(100, 2000, 20).astype(int).tolist()

In [None]:
# Number of features considered at each split.
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for a
# RandomForestRegressor it meant "use all features", which is what 1.0 selects.
max_features = [1.0, 'sqrt']

# Maximum tree depth: 5, 10, ..., 50.
max_depth = [int(x) for x in np.linspace(5, 50, num = 10)]

# Minimum number of samples required to split an internal node.
min_samples_split = [2, 5, 10, 15, 25, 50, 75,100]

# Minimum number of samples required at a leaf node.
min_samples_leaf = [1, 2, 5, 7,10]

In [None]:
# Search space for the randomized search, keyed by estimator parameter name.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Randomized search used for the PCA experiment: 20 sampled candidates,
# 5-fold cross-validation, scored by negated MSE, on all available cores.
rf_random = RandomizedSearchCV(
    estimator=regressor,
    param_distributions=random_grid,
    n_iter=20,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

### Using Principal Component Analysis (PCA)

In [None]:
# Run the randomized search on the PCA-projected training data.
rf_random.fit(x_train_pca_components,y_train)

In [None]:
# Best mean cross-validated score. The search was built with
# scoring='neg_mean_squared_error', so this is a negated MSE: closer to zero is better.
rf_random.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score.
rf_random.best_params_

In [None]:
# Project the test rows with the transformer already fitted on the training rows (transform only, no refit).
x_test_pca_components = pca.transform(x_test)

In [None]:
# Predict on the projected test rows with the fitted search object.
predictions = rf_random.predict(x_test_pca_components)

In [None]:
plt.figure(figsize=[10,8])
# Distribution of the residuals (actual minus predicted).
# sns.distplot is deprecated and scheduled for removal; histplot with these
# arguments is the documented replacement for its default histogram + KDE output.
sns.histplot(y_test-predictions, kde=True, stat="density", kde_kws=dict(cut=3))
plt.title('Comparision')

In [None]:
plt.figure(figsize=[10,8])
# Actual values on the x-axis, model output on the y-axis; points near the
# diagonal are good predictions.
plt.scatter(y_test,predictions)
# Label the axes so the figure can be read without looking at the code.
plt.xlabel('Actual Selling_Price')
plt.ylabel('Predicted Selling_Price')
plt.title('Comparision')

In [None]:
from sklearn import metrics

In [None]:
# Test-set error of the PCA-based model; the MSE is computed once and reused for the RMSE.
mse = metrics.mean_squared_error(y_test, predictions)
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

### Using Correlation Matrix

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of final_df.
correlation_matrix = final_df.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(correlation_matrix, annot=True, cmap="RdYlGn")

In [None]:
# Second randomized search with the same settings, to be fitted on the
# original (non-PCA) features.
regressor_random = RandomizedSearchCV(
    estimator=regressor,
    param_distributions=random_grid,
    n_iter=20,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

In [None]:
# Run the randomized search on the training features without the PCA projection.
regressor_random.fit(x_train,y_train)

In [None]:
# Best mean cross-validated score. The search was built with
# scoring='neg_mean_squared_error', so this is a negated MSE: closer to zero is better.
regressor_random.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score.
regressor_random.best_params_

In [None]:
# Predict on the test rows; this rebinds `predictions`, replacing the PCA model's output.
predictions = regressor_random.predict(x_test)

In [None]:
plt.figure(figsize=[10,8])
# Distribution of the residuals (actual minus predicted).
# sns.distplot is deprecated and scheduled for removal; histplot with these
# arguments is the documented replacement for its default histogram + KDE output.
sns.histplot(y_test-predictions, kde=True, stat="density", kde_kws=dict(cut=3))
plt.title('Comparision')
# Save the figure next to the notebook.
plt.savefig('distribution.png')

In [None]:
plt.figure(figsize=[10,8])
# Actual values on the x-axis, model output on the y-axis; points near the
# diagonal are good predictions.
plt.scatter(y_test,predictions)
# Label the axes so the saved image can be read on its own.
plt.xlabel('Actual Selling_Price')
plt.ylabel('Predicted Selling_Price')
plt.title('Comparision')
plt.savefig('scatter.png')

In [None]:
# Test-set error of the model trained on the original features; the MSE is computed once and reused for the RMSE.
mse = metrics.mean_squared_error(y_test, predictions)
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

In [None]:
import pickle

In [None]:
# Persist the fitted search object. The context manager flushes and closes the
# file even if pickling raises; a bare open() left the handle open, so the file
# on disk could be incomplete until the kernel exits.
with open('model.pickle','wb') as model_file:
    pickle.dump(regressor_random,model_file)