## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

## Reading File

In [None]:
# Load the C-Class listings from the Kaggle input directory and preview the first rows.
data = pd.read_csv(
    '../input/used-car-dataset-ford-and-mercedes/cclass.csv'
)
data.head()

## Data Exploration

In [None]:
# Print a concise summary of the frame: column names, non-null counts and dtypes.
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

### Null Values

In [None]:
# Count the missing values in each column.
data.isna().sum()

### Duplicate Values

In [None]:
# Number of rows that are exact repeats of an earlier row.
data.duplicated().sum()

Duplicate rows add no information, so it is better to remove them.

In [None]:
# Keep only the first occurrence of each row, then confirm that no duplicates remain.
data = data.drop_duplicates()
data.duplicated().sum()

## EDA

In [None]:
# One box plot per numeric column, laid out on a 3x2 grid of subplots.
data.plot.box(subplots=True, layout=(3, 2), figsize=(15, 7))

There are a lot of outliers, but since these variables are related to each other, I am not going to remove them.

### Model

Let's check for model variable...

In [None]:
# Number of distinct non-null values in the 'model' column.
data['model'].nunique()

In [None]:
# The distinct values themselves, in order of first appearance.
data['model'].unique()

In [None]:
# How many rows carry each 'model' value, most frequent first.
data['model'].value_counts()

The model attribute is of no use, as it has only one value, which is the same for all observations.

### Year

Let's check for year attribute

In [None]:
# Number of distinct non-null values in the 'year' column.
data['year'].nunique()

In [None]:
# The distinct 'year' values, in order of first appearance.
data['year'].unique()

In [None]:
# How many rows fall in each year, most frequent first.
data['year'].value_counts()

In [None]:
# Bar chart of the number of rows per 'year' value on a 12x7 inch canvas.
fig, ax = plt.subplots(figsize=(12, 7))
sns.countplot(x='year', data=data, ax=ax)

In [None]:
# Scatter of price against year with a fitted linear regression line.
sns.regplot(x='year', y='price', data=data)

In [None]:
# Price distribution for each 'year' value, drawn on a 15x6 inch canvas.
fig, ax = plt.subplots(figsize=(15, 6))
sns.boxplot(x='year', y='price', data=data, ax=ax)

In [None]:
# Distribution of 'year': density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated and scheduled for removal; histplot with
# kde=True and stat='density' is its documented replacement.
sns.histplot(data=data, x='year', kde=True, stat='density')

### Transmission

Let's check for transmission

In [None]:
# Number of distinct non-null values in the 'transmission' column.
data['transmission'].nunique()

In [None]:
# How many rows carry each 'transmission' value, most frequent first.
data['transmission'].value_counts()

In [None]:
# Bar chart of the number of rows per transmission value.
sns.countplot(x='transmission', data=data)

In [None]:
# Price distribution for each transmission value.
sns.boxplot(x='transmission', y='price', data=data)

### Mileage

Let's check mileage

In [None]:
# Number of distinct non-null values in the 'mileage' column.
data['mileage'].nunique()

In [None]:
# Scatter of price against mileage with a fitted linear regression line.
sns.regplot(x='mileage', y='price', data=data)

In [None]:
# Distribution of 'mileage': density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated and scheduled for removal; histplot with
# kde=True and stat='density' is its documented replacement.
sns.histplot(data=data, x='mileage', kde=True, stat='density')

A trend is visible: as mileage decreases, price increases.

### Fuel Type

Let's observe fuel type

In [None]:
# Number of distinct non-null values in the 'fuelType' column.
data['fuelType'].nunique()

In [None]:
# The distinct 'fuelType' values, in order of first appearance.
data['fuelType'].unique()

In [None]:
# How many rows carry each 'fuelType' value, most frequent first.
data['fuelType'].value_counts()

In [None]:
# Bar chart of the number of rows per fuel type.
sns.countplot(data=data, x='fuelType')

In [None]:
# Price distribution for each fuel type.
sns.boxplot(x='fuelType', y='price', data=data)

In [None]:
# Rows whose fuelType is 'Petrol', restricted to five columns and ordered by year.
is_petrol = data['fuelType'] == 'Petrol'
shown_columns = ['year', 'price', 'engineSize', 'transmission', 'mileage']
data.loc[is_petrol, shown_columns].sort_values(by='year')

In the data-frame extract above, we can see that price is affected by year, engineSize and mileage, which also supports the decision not to delete the outliers. Let's go further.

### Engine Size

Let's check on engineSize

In [None]:
# Number of distinct non-null values in the 'engineSize' column.
data['engineSize'].nunique()

In [None]:
# Distribution of 'engineSize': density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated and scheduled for removal; histplot with
# kde=True and stat='density' is its documented replacement.
sns.histplot(data=data, x='engineSize', kde=True, stat='density')

In [None]:
# Bar chart of the number of rows per engine size value.
sns.countplot(x='engineSize', data=data)

In [None]:
# Price distribution for each engine size value, drawn on a 15x7 inch canvas.
fig, ax = plt.subplots(figsize=(15, 7))
sns.boxplot(x='engineSize', y='price', data=data, ax=ax)

## Feature engineering

In [None]:
# Drop the 'model' column, which the exploration found uninformative.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# safe to re-run: a second run no longer raises KeyError once the column is gone.
data = data.drop(columns='model', errors='ignore')

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Create the label encoder and collect the names of every object-dtype column.
le = LabelEncoder()
cols = data.select_dtypes(include='object').columns
cols

In [None]:
# Integer-encode every object-dtype column of `data`.
# The columns are selected here rather than read from the global `cols`, because
# `cols` is rebound to the numeric feature list further down; re-running this cell
# after that point would otherwise silently re-encode mileage/year/engineSize.
# Each column gets its own fitted encoder (kept in `encoders`) so the integer codes
# can be mapped back to the original labels with inverse_transform.
encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    encoders[column] = LabelEncoder().fit(data[column])
    data[column] = encoders[column].transform(data[column])

In [None]:
# Pairwise correlation between the columns of the frame.
data.corr()

In [None]:
from sklearn.feature_selection import SelectKBest,f_regression

In [None]:
# Heatmap of the pairwise correlations, each cell annotated to two decimals.
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2f')

In [None]:
# Score every predictor against price with a univariate F-test (k='all' keeps all
# of them; only the scores are used here).
predictors = data.drop('price', axis=1)
selector = SelectKBest(score_func=f_regression, k='all').fit(predictors, data['price'])

# Pass the column Index directly: wrapping it in a list would build a one-level
# MultiIndex whose labels are tuples instead of plain column names.
feature = pd.DataFrame(data=selector.scores_, index=predictors.columns)
feature.sort_values(by=0, ascending=False)

In [None]:
# Predictor columns used for modelling (note: this rebinds the earlier `cols`).
cols = [
    'mileage',
    'year',
    'engineSize',
    'fuelType',
]
cols

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score

In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split,
# and therefore every score reported below, reproducible across kernel restarts.
train_x, test_x, train_y, test_y = train_test_split(
    data[cols], data['price'], test_size=0.2, random_state=42
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fit a random forest regressor on the training split and score it on the hold-out set.
# random_state pins the bootstrap/feature sampling so the reported score is repeatable.
rfc = RandomForestRegressor(
    verbose=1,
    n_estimators=10000,
    max_depth=9,
    n_jobs=-1,
    random_state=42,
)
rfc.fit(train_x, train_y)
predict = rfc.predict(test_x)
# r2_score's signature is (y_true, y_pred) and R^2 is not symmetric, so the
# ground-truth prices must come first; the swapped order reports a different number.
print('R2 ', r2_score(test_y, predict) * 100)

In [None]:
from xgboost import XGBRegressor

In [None]:
# Fit a gradient-boosted tree regressor on the training split and score it on the
# hold-out set. NOTE(review): 100000 rounds at learning_rate=0.001 is slow to train;
# hyperparameters are left as the author chose them.
xgr = XGBRegressor(
    n_estimators=100000,
    learning_rate=0.001,
    n_jobs=-1,
    max_depth=5,
)
xgr.fit(train_x, train_y)
predict = xgr.predict(test_x)
# r2_score's signature is (y_true, y_pred) and R^2 is not symmetric, so the
# ground-truth prices must come first; the swapped order reports a different number.
print('R2 ', r2_score(test_y, predict) * 100)