# Approach


1) Each column of the datasets was checked for data inconsistencies.

2)Required actions were taken for specific columns where data inconsistencies were found.

3)Different regression algorithms were used to build different models.

4) __RandomForestRegressor__ gave us the best model, so this notebook contains only the random forest models.

5) Tuning of the hyperparameters was required for the random forest regressor to optimize the RMSLE value.

6) The best features were selected using VIF, RFE, forward elimination, backward elimination, random forest and extra trees techniques. The features selected using the extra trees technique gave us the best model.

# Importing Libraries

In [None]:
# Suppress the display of every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings("ignore")

# 'Pandas' is used for data manipulation and analysis
import pandas as pd 

# 'Numpy' is used for mathematical operations on large, multi-dimensional arrays and matrices
import numpy as np

# 'Matplotlib' is a data visualization library for 2D and 3D plots, built on numpy
import matplotlib.pyplot as plt

# 'Seaborn' is based on matplotlib; used for plotting statistical graphics
import seaborn as sns

# import various functions to perform regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor

#importing metrics for tabulating the result
from sklearn.metrics import mean_squared_log_error

# Default figure size (width 15, height 8) for every plot in this notebook
plt.rcParams.update({'figure.figsize': [15, 8]})

Importing the data

In [None]:
# Load the training and the test split of the car-prices dataset
df, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/car-prices-dataset/train.csv',
                     '../input/car-prices-dataset/test.csv')
)

# Understanding the data

In [None]:
# Preview the first five training rows
df.head(n=5)

From the above display we can see that:

1)The 'Levy' column contains '-' symbol.We need to look into this column.

2)In the 'Doors' column there are month names which we need to remove.

3) In the 'Mileage' column the values carry a 'km' suffix; we need to separate this 'km' from the number for model building purposes.

In [None]:
# Preview the first five test rows
df_test.head(n=5)

From the above display we can see that:

1)The 'Levy' column contains '-' symbol.We need to look into this column.

2)In the 'Doors' column there are month names which we need to remove.

3) In the 'Mileage' column the values carry a 'km' suffix; we need to separate this 'km' from the number for model building purposes.

4)We need to delete the 'Price' column as we need to predict it.

In [None]:
# Size of the training data as (number of rows, number of columns)
(len(df.index), len(df.columns))

We can see that there are 19237 records and 18 columns.

In [None]:
# Keep the 'Price' column aside as the target variable
y = df.loc[:, 'Price']

In [None]:
# Combine the train and test features so that every cleaning step below is
# applied to both datasets in one pass. The training rows come first.
# 'Price' is the target, so it is dropped from both frames before combining.
df = df.drop(columns=['Price'])
df_test = df_test.drop(columns=['Price'])
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# stacks the two frames and renumbers the rows 0..n-1 in a single step.
df_merge = pd.concat([df, df_test], ignore_index=True)

In [None]:
# Size of the merged dataset as (number of rows, number of columns)
(len(df_merge.index), len(df_merge.columns))

In [None]:
# Per-column summary: the dtype and the number of distinct values
info = pd.DataFrame({
    'DataTypes': df_merge.dtypes,
    'Unique_values': df_merge.nunique(),
})
info

We need to convert 'Mileage' column into float as we know that it is of float/integer datatype.

In [None]:
# Summary statistics of the object (string) columns
df_merge.describe(include=['object'])

In [None]:
# Summary statistics of the numeric columns
df_merge.describe(include=[np.number])

From the describe() function we can get the mean,count and quantiles values for numeric data and count,frequency of object type data. From the above displays it can be seen that there are no missing values.

# Rectifying the data

In [None]:
# Keep only the text before the first space (dropping the 'km' suffix),
# then convert the 'Mileage' column to a float dtype
mileage_text = df_merge['Mileage'].str.split(' ').str.get(0)
df_merge['Mileage'] = pd.to_numeric(mileage_text, downcast='float')

In [None]:
# Replace every 0 in 'Mileage' with the column mean.
# NOTE(review): the mean is taken over all merged rows, zeros and test rows included.
mileage = df_merge['Mileage']
df_merge['Mileage'] = np.where(mileage == 0.0, mileage.mean(), mileage)

In [None]:
# Distinct values of the 'Doors' column before cleaning
df_merge.Doors.unique()

In [None]:
# Clean the 'Doors' column: for '04-May' and '02-Mar' keep only the part
# before the dash ('04' / '02'); every other value is left unchanged
doors = df_merge['Doors']
is_04_may = doors == '04-May'
is_02_mar = doors == '02-Mar'
df_merge['Doors'] = np.where(is_04_may | is_02_mar, doors.str.split('-').str[0], doors)

In [None]:
# Distinct values of the 'Doors' column once it has been cleaned
df_merge.Doors.unique()

In [None]:
# Checking the unique values of the 'Levy' column before cleaning
# (it is still an object column containing the '-' placeholder at this point)
df_merge['Levy'].unique()

In [None]:
# 'Levy' is the tax amount: turn the '-' placeholder into '0' and convert
# the column to a float dtype
levy_text = df_merge['Levy'].replace('-', '0')
df_merge['Levy'] = pd.to_numeric(levy_text, downcast='float')

In [None]:
# Replace every 0 in 'Levy' with the column mean.
# NOTE(review): the mean is taken over all merged rows, zeros and test rows included.
levy = df_merge['Levy']
df_merge['Levy'] = np.where(levy == 0.0, levy.mean(), levy)

In [None]:
# Distinct values of the 'Engine volume' column
df_merge.loc[:, 'Engine volume'].unique()

In [None]:
# Some values carry a trailing ' Turbo' and some do not.
# Keep only the text before the first space and convert it to a float dtype.
engine_volume_text = df_merge['Engine volume'].str.split(' ').str.get(0)
df_merge['Engine volume'] = pd.to_numeric(engine_volume_text, downcast='float')

In [None]:
# Replace every 0 in 'Engine volume' with the column mean.
# NOTE(review): the mean is taken over all merged rows, zeros and test rows included.
engine_volume = df_merge['Engine volume']
df_merge['Engine volume'] = np.where(engine_volume == 0.0, engine_volume.mean(), engine_volume)

In [None]:
# Feature engineering: turn the production year into the car's age in years
import datetime as dt
currt_time = dt.datetime.now()
# NOTE(review): the result depends on the year the notebook is run in, and
# running this cell a second time converts the age back into a year.
current_year = currt_time.year
df_merge['Prod. year'] = current_year - df_merge['Prod. year']

In [None]:
# Preview the first five rows of the dataset after all the rectification
df_merge.head(n=5)

# Exploratory Data Analysis

In [None]:
# Heatmap of the missing-value mask: one cell per entry of df_merge.isnull()
sns.heatmap(df_merge.isnull(), cbar=False)
# plt.show has to be called; the bare name `plt.show` only references the
# function and never renders the figure.
plt.show()

We can see that there are no missing values

In [None]:
# Annotated correlation heatmap of the numeric columns.
# df_merge still holds object (string) columns at this point, and
# DataFrame.corr() raises on them in pandas >= 2.0, so the numeric columns
# are selected explicitly (older pandas dropped the others silently).
sns.heatmap(df_merge.select_dtypes(include=np.number).corr(), cbar=True, annot=True)


We can see that 'Engine volume' is highly correlated with the 'Cylinders' and 'Levy' columns.

In [None]:
# Distribution of the numeric variables: one histogram per column
df_merge.hist()
# Adjust the subplot spacing so that titles and axis labels do not overlap
plt.tight_layout()
plt.show()

We can see that 'Prod. year','Levy' and 'Engine volume' columns are right skewed.

In [None]:
# Check the shape of the target distribution.
# Shapiro-Wilk only tests whether the data look normally distributed; its
# p-value is never negative and says nothing about the direction of the skew.
# The sign of the sample skewness is therefore used to pick the message.
from scipy.stats import shapiro, skew
x = shapiro(y)  # normality test result: x[0] is the statistic, x[1] the p-value
target_skewness = skew(y)
if target_skewness < 0:
    print('Negatively skewed')
elif target_skewness > 0:
    print('Positively Skewed')
else:
    print('Symmetric (no skew)')
  

In [None]:
# Log-transform the target (natural log), intended to normalise the skewed
# 'Price' distribution. Predictions are mapped back with np.exp when the
# RMSLE is computed further down.
# NOTE(review): np.log gives -inf for a price of 0 - confirm the data has none.
y = np.log(y)

# Building the model

In [None]:
# Separate the numeric columns from the categorical (object) columns
num = df_merge.select_dtypes(include=[np.number])
categ = df_merge.select_dtypes(include=['object'])

In [None]:
# One-hot encode the categorical variables, dropping the first level of each
cat_dummies = pd.get_dummies(data=categ, drop_first=True)

In [None]:
# Final dataset: numeric columns followed by the dummy columns, side by side
df_final = pd.concat((num, cat_dummies), axis='columns')

In [None]:
# Size of the final dataset as (number of rows, number of columns)
(len(df_final.index), len(df_final.columns))

In [None]:
# Recover the training rows from the combined frame before model building.
# The training rows were stacked first and y holds one target per training
# row, so len(y) is the split point (instead of a hard-coded row count).
train_data = df_final.iloc[:len(y)]
train_data.shape

In [None]:
# Everything after the training rows is the test set. y holds one target per
# training row, so len(y) is the split point (instead of a hard-coded count).
test_data = df_final.iloc[len(y):]
test_data.shape

In [None]:
# Hold out 30% of the labelled rows for validation (fixed seed for repeatability)
X, Y = train_data, y

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=10
)


In [None]:
# Randomized Search CV: candidate values for each random forest hyperparameter

# Number of trees in random forest: 100, 200, ..., 1200
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]
# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' meant for regressors; the
# 'auto' alias itself is rejected by recent scikit-learn releases.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree: 5, 10, ..., 30
max_depth = [int(x) for x in np.linspace(5, 30, num=6)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [None]:
# Assemble the search space: hyperparameter name -> list of candidate values
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

print(random_grid)

In [None]:
# Base random forest with default hyperparameters; it is the estimator
# handed to the randomized search
rf_model = RandomForestRegressor()

In [None]:
# Randomized search: 10 sampled parameter settings, 5-fold cross-validation,
# scored by negative mean squared error, run in a single job
rf_random_model = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=random_grid,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=1,
    random_state=42,
    verbose=2,
)

In [None]:
# Run the hyperparameter search on the training split
rf_random_model.fit(X=X_train, y=y_train)

In [None]:
# Getting the best parameters: the setting with the best cross-validated
# score among those sampled by the search
rf_random_model.best_params_

#  Feature selection using  extra tree regressor

Since we need to find features that train the model so that it is neither underfitted nor overfitted, we use a feature selection technique. The best feature selection technique for this problem statement was the extra trees regressor.

In [None]:
# Fit an extra-trees regressor on the training split; its feature importances
# are used below to rank the features.
# (A stray `ExtraTreesRegressor()` line that built and discarded a second,
# unused estimator has been removed.)
reg = ExtraTreesRegressor()
reg.fit(X_train, y_train)

In [None]:
# Importance of each feature according to the fitted extra-trees model,
# labelled with the training column names
feat_importances = pd.Series(data=reg.feature_importances_, index=X_train.columns)

In [None]:
# Names of the 30 most important features, most important first
feat_importances.nlargest(30).index

In [None]:
# Random forest regressor configured with the tuned hyperparameters
mod4 = RandomForestRegressor(
    n_estimators=1000,
    max_features='sqrt',
    max_depth=25,
    min_samples_split=2,
    min_samples_leaf=1,
)

In [None]:
# Feature subset used for the first model.
# NOTE(review): 'ID' is part of the subset - confirm that an identifier column
# is meant to be used as a predictor.
X1 = train_data.loc[:, [
    'Airbags', 'Mileage', 'Prod. year', 'ID', 'Gear box type_Tiptronic',
    'Leather interior_Yes', 'Levy', 'Fuel type_Diesel', 'Engine volume',
    'Manufacturer_HYUNDAI', 'Fuel type_Hybrid', 'Color_White',
    'Color_Black', 'Drive wheels_Front', 'Model_FIT', 'Color_Grey',
    'Color_Silver', 'Cylinders', 'Wheel_Right-hand drive', 'Category_Sedan',
    'Manufacturer_TOYOTA', 'Category_Jeep', 'Gear box type_Variator',
    'Manufacturer_SSANGYONG', 'Fuel type_Petrol', 'Drive wheels_Rear',
    'Model_Prius',
]]
y1 = y

# Same 70/30 split and seed as the full-feature split
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.3, random_state=10
)

In [None]:
# Train the tuned forest on the selected features
model = mod4.fit(X=X1_train, y=y1_train)

In [None]:
# Predict the (log-scale) target for the held-out rows
y_predict = model.predict(X=X1_test)

In [None]:
# RMSLE on the held-out rows; both the targets and the predictions are on the
# log scale, so np.exp maps them back before the metric is applied
RMLSE = np.sqrt(
    mean_squared_log_error(y_true=np.exp(y1_test), y_pred=np.exp(y_predict))
)

In [None]:
# Displaying the RMSLE score of the first model (the variable is named RMLSE)
RMLSE

# Finding best features using random forest regressor

In [None]:
# Random forest with the tuned hyperparameters, used here to rank the features
mod3 = RandomForestRegressor(
    n_estimators=1000,
    max_features='sqrt',
    max_depth=25,
    min_samples_split=2,
    min_samples_leaf=1,
)

In [None]:
# Fit the forest on every feature of the training split
model_random = mod3.fit(X=X_train, y=y_train)

In [None]:
# Importance of each feature according to the fitted random forest,
# labelled with the training column names
feat_importances = pd.Series(data=model_random.feature_importances_, index=X_train.columns)

In [None]:
# Names of the 50 most important features, most important first
feat_importances.nlargest(50).index

In [None]:
# Feature subset used for the second model.
# NOTE(review): 'ID' is part of the subset - confirm that an identifier column
# is meant to be used as a predictor.
X2 = train_data.loc[:, [
    'Airbags', 'Mileage', 'Prod. year', 'ID', 'Gear box type_Tiptronic',
    'Leather interior_Yes', 'Levy', 'Fuel type_Diesel', 'Engine volume',
    'Manufacturer_HYUNDAI', 'Fuel type_Hybrid', 'Color_White',
    'Color_Black', 'Drive wheels_Front', 'Model_FIT', 'Color_Grey',
    'Color_Silver', 'Cylinders', 'Wheel_Right-hand drive', 'Category_Sedan',
    'Manufacturer_TOYOTA', 'Category_Jeep', 'Gear box type_Variator',
    'Manufacturer_SSANGYONG', 'Fuel type_Petrol', 'Drive wheels_Rear',
    'Model_Prius', 'Color_Blue', 'Category_Hatchback',
]]
Y2 = y

# Same 70/30 split and seed as the full-feature split
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, Y2, test_size=0.3, random_state=10
)

In [None]:
# Fit an unfitted copy of the tuned forest on the second feature set.
# Calling mod4.fit again would refit, in place, the very estimator that
# `model` refers to (fit returns the estimator itself), silently replacing
# the first model. clone() copies the hyperparameters only.
from sklearn.base import clone
model1 = clone(mod4).fit(X2_train, y2_train)

In [None]:
# Predict the (log-scale) target for the held-out rows of the second feature set
y_pred = model1.predict(X=X2_test)

In [None]:
# RMSLE of the second model; np.exp maps the log-scale targets and
# predictions back before the metric is applied
RMLSE1 = np.sqrt(
    mean_squared_log_error(y_true=np.exp(y2_test), y_pred=np.exp(y_pred))
)

In [None]:
# Displaying the RMSLE score of the second model
RMLSE1