In [None]:
#libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
#loading the raw CSV; `dataset` stays untouched and all later work happens on the copy `df`
DATA_PATH = '../input/vehicle-dataset-from-cardekho/car data.csv'

dataset = pd.read_csv(DATA_PATH)
df = dataset.copy()
df.head()

***
## **Data Analysis**
***

In [None]:
#shape of the dataset as a (number of rows, number of columns) tuple
df.shape

In [None]:
#distinct categories present in each categorical feature
categorical_features = ('Fuel_Type', 'Seller_Type', 'Transmission')
for feature in categorical_features:
    levels = df[feature].unique()
    print(feature, levels)

In [None]:
#count of missing values per column
missing_mask = df.isna()
missing_mask.sum()

***
## **Feature Engineering**
***

In [None]:
#converting 'Year' to the number of years elapsed up to the current calendar year
from datetime import date

#read the year from the untouched `dataset` frame rather than from `df` itself:
#the assignment is then idempotent, so re-running this cell cannot subtract twice
current_year = date.today().year
df['Year'] = current_year - dataset['Year']

In [None]:
#removing the 'Car_Name' column from the working frame
df = df.drop(columns=['Car_Name'])

In [None]:
#one hot encoding of the categorical features (first level of each dropped)
categorical_columns = ['Fuel_Type', 'Seller_Type', 'Transmission']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
df.head()

In [None]:
#pairwise correlation of all columns, drawn as an annotated heatmap
correlation_matrix = df.corr()

plt.figure(figsize=(8, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='BuPu', square=True, vmax=.8)

In [None]:
#outliers: one boxplot per numeric feature of interest
features_to_inspect = ('Present_Price', 'Kms_Driven')
for feature in features_to_inspect:
    df.boxplot(column=feature)
    plt.show()

In [None]:
#handling outliers
#since the features are not normally distributed I'll use the IQR rule:
#every value above the upper fence Q3 + 1.5*IQR is capped at that fence

for feature in ['Present_Price', 'Kms_Driven']:
    first_quartile = df[feature].quantile(0.25)
    third_quartile = df[feature].quantile(0.75)

    IQR = third_quartile - first_quartile
    Lower_range = first_quartile - (1.5*IQR)  #kept for inspection; only the upper tail is capped
    Upper_range = third_quartile + (1.5*IQR)

    #cap at the fence computed from the data instead of a hand-typed constant,
    #so the threshold cannot go stale when the dataset changes
    df[feature] = np.where(df[feature] > Upper_range, Upper_range, df[feature])

In [None]:
#checking outliers again after capping
capped_features = ('Present_Price', 'Kms_Driven')
for feature in capped_features:
    df.boxplot(column=feature)
    plt.show()

***
## **Feature Transformation**
***

In [None]:
#Q-Q plot to check the distribution
import scipy.stats as stat
import pylab

def plot_data(df, feature):
    """Show the distribution of one column and print its skewness.

    Draws two panels side by side: a histogram with a density curve on the
    left and a normal Q-Q (probability) plot on the right. After the figure
    is shown, the sample skewness of the column is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    feature : str
        Name of the column in ``df`` to plot.
    """
    values = df[feature]

    #two panels only, so allocate a 1x2 grid and address each axis explicitly
    fig, (ax_dist, ax_qq) = plt.subplots(1, 2, figsize=(15, 6))

    #distplot is deprecated in seaborn; prefer histplot when this seaborn has it
    if hasattr(sns, 'histplot'):
        sns.histplot(values, kde=True, stat='density', ax=ax_dist)
    else:
        sns.distplot(values, ax=ax_dist)
    ax_dist.set_title(feature)

    #probplot draws onto the Axes object passed as `plot`
    stat.probplot(values, dist='norm', plot=ax_qq)

    plt.show()
    print(values.skew())

In [None]:
#distribution, Q-Q plot and skewness of the kilometres driven
plot_data(df=df, feature='Kms_Driven')

***
## **Feature Scaling**
***

In [None]:
#minmaxscaler: rescale every predictor column; the target 'Selling_Price' is left as is
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()

target_column = 'Selling_Price'
features_to_scale = [column for column in df.columns if column != target_column]

#NOTE(review): the scaler is fitted on all rows before the train/test split further down,
#so test rows contribute to the fitted min/max -- consider fitting on the training rows only
scaled_features = pd.DataFrame(scaler.fit_transform(df[features_to_scale]),
                               columns=features_to_scale)
target_frame = df[[target_column]].reset_index(drop=True)

#target first, then the scaled predictors in their existing order
df = pd.concat([target_frame, scaled_features], axis=1)

***
## **Model Creation**
***

In [None]:
#splitting dataset into predictors X and target y, then 70% train / 30% test
from sklearn.model_selection import train_test_split

X=df.drop('Selling_Price', axis=1)
y=df['Selling_Price']

#fixed seed: without it the rows in each partition (and every metric below) change on each run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [None]:
#model: random forest regressor fitted on the training split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

#fixed seed: the forest's internal randomness is otherwise different on every run,
#which makes the reported errors irreproducible
reg = RandomForestRegressor(random_state=42)
reg.fit(X_train, y_train)

#predictions for the held-out test rows
y_pred = reg.predict(X_test)

In [None]:
#score: error metrics of the predictions against the test targets
from sklearn import metrics

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  #root of the MSE computed above

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)