# Importing Required Libraries and Data Set

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [1]:
# Load the cars dataset. The strings "??" and "????" are parsed as missing
# values, and the first column of the file becomes the row index.
data = pd.read_csv("/kaggle/input/toyata-cars-dataset/toyata.txt", na_values = ("??", "????"), index_col = 0 )
# Bare expression: rendered as the cell output in the notebook.
data

Dataset Description:

Price     : Price of the car (dependent variable).
Age       : Age of the car.
KM        : Distance the car has been driven, in kilometres.
FuelType  : Type of fuel the car uses (Petrol, CNG or Diesel).
HP        : Horsepower of the car.
MetColor  : Color type of the car.
Automatic : Gear system of the car (0: Manual, 1: Automatic).
CC        : Engine size of the car, in cubic centimetres.
Doors     : Number of doors of the car.
Weight    : Weight of the car.

In [1]:
# Dimensions of the frame as a (rows, columns) tuple.
data.shape

In [1]:
# Print pandas' summary of the frame (columns, non-null counts, dtypes).
data.info()

In [1]:
# Number of missing values in each column.
data.isnull().sum()

In [1]:
# Inspect the raw Doors column: first its distinct values, then its dtype.
print(data["Doors"].unique())
print("Doors Feature Data type :", data["Doors"].dtype)

In [1]:
# Lookup table from every Doors label handled here to its integer door count.
doors_map = {
    # digits stored as text
    **{"2": 2, "3": 3, "4": 4, "5": 5},
    # counts spelled out as words
    **{"three": 3, "four": 4, "five": 5},
}
# Series.map leaves NaN for any label that is not a key of the table.
data["Doors"] = data["Doors"].map(doors_map)

In [1]:
# Columns whose gaps are filled with the column mean.
for mean_filled in ("Age", "KM", "HP"):
    data[mean_filled] = data[mean_filled].fillna(data[mean_filled].mean())

# Columns whose gaps are filled with the most frequent value (first mode).
for mode_filled in ("MetColor", "FuelType"):
    data[mode_filled] = data[mode_filled].fillna(data[mode_filled].mode()[0])

In [1]:
# Display the frame after the preprocessing steps above.
data

# Detect Outliers:

a) Box and whisker plot (box plot).

b) Scatter plot.

c) Histogram.

d) Distribution Plot.

e) QQ plot.

## i) Univariate method:

a) Box and whisker plot (box plot).

b) Scatter plot.
    
## ii) Multivariate method:

a) Histogram.

b) Distribution Plot.

c) QQ plot.


### Univariate method:
    
#### a) Box and whisker plot (box plot)

In [1]:
# Box plot of KM; seaborn returns the Axes it drew on, so title it directly.
sns.boxplot(data['KM']).set_title("Box plot of KM")
plt.show()

In [1]:
# Box plot of HP; seaborn returns the Axes it drew on, so title it directly.
sns.boxplot(data['HP']).set_title("Box plot of HP")
plt.show()

### b) Scatter plot

In [1]:
# KM on the x-axis against Weight on the y-axis.
plt.scatter(x=data['KM'], y=data['Weight'])
plt.gca().set_title("Scatter plot KM vs Weight")
plt.show()

In [1]:
# KM on the x-axis against Price on the y-axis.
plt.scatter(x=data['KM'], y=data['Price'])
plt.gca().set_title("Scatter plot KM vs Price")
plt.show()

## Multivariate method:

### a) Histogram

In [1]:
# Histogram of Price with matplotlib's default binning.
plt.hist(x=data["Price"])
plt.gca().set_title("Histogram plot of Price")
plt.show()

In [1]:
# Histogram of KM with matplotlib's default binning.
plt.hist(x=data["KM"])
plt.gca().set_title("Histogram plot of KM")
plt.show()

### b) Distribution Plot

In [1]:
# ``sns.distplot`` is deprecated (since seaborn 0.11) and scheduled for
# removal; this notebook silences the warning via ``warnings.filterwarnings``.
# ``histplot`` with a KDE overlay on a density scale is the documented
# replacement (``cut=3`` keeps the KDE tails that distplot used to draw).
sns.histplot(x=data['KM'], kde=True, stat="density", kde_kws=dict(cut=3))
plt.title("Distribution Plot of KM")
plt.show()

In [1]:
# ``sns.distplot`` is deprecated (since seaborn 0.11) and scheduled for
# removal; this notebook silences the warning via ``warnings.filterwarnings``.
# ``histplot`` with a KDE overlay on a density scale is the documented
# replacement (``cut=3`` keeps the KDE tails that distplot used to draw).
sns.histplot(x=data['Age'], kde=True, stat="density", kde_kws=dict(cut=3))
plt.title("Distribution Plot of Age")
plt.show()

### c) Q-Q plot.

In [1]:
import statsmodels.api as sm

# Q-Q plot of Price against the default (normal) distribution; line='s'
# adds the standardized reference line.
sm.qqplot(data=data["Price"], line='s')
plt.gca().set_title("Normal Q-Q plot")
plt.show()

# Handling Outliers:

1. Deleting observations.

2. Transforming values.

3. Imputation.

## 1.Deleting observations.

In [1]:
# KM box plot drawn before any rows are dropped, for comparison.
sns.boxplot(data["KM"]).set_title("Box plot before Outlier removing")
plt.show()

def drop_outliers(data, KM):
    """Drop, in place, the rows of ``data`` that are IQR outliers in one column.

    A value counts as an outlier when it lies more than 1.5 * IQR above the
    75th percentile or more than 1.5 * IQR below the 25th percentile of the
    column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is modified in place.
    KM : str
        Name of the numeric column to screen. The parameter keeps its
        historical name, but any column label is accepted.

    Returns
    -------
    None
    """
    column = data[KM]
    # Both fences come from the same, unmodified column. Dropping the upper
    # tail first and then recomputing the percentiles would shift the lower
    # fence.
    q1, q3 = np.percentile(column, [25, 75])
    whisker = 1.5 * (q3 - q1)
    outlier_rows = data.index[(column > q3 + whisker) | (column < q1 - whisker)]
    data.drop(outlier_rows, inplace=True)

# Remove the outlier rows from ``data`` in place, then redraw the KM box
# plot for comparison with the one drawn before the removal.
drop_outliers(data, 'KM')

sns.boxplot(data["KM"]).set_title("Box plot After Outlier removing")
plt.show()


In [1]:
# Age box plot drawn before any rows are dropped, for comparison.
sns.boxplot(data["Age"]).set_title("Box plot before Outlier removing")
plt.show()

def drop_outliers(data, Age):
    """Drop, in place, the rows of ``data`` that are IQR outliers in one column.

    A value counts as an outlier when it lies more than 1.5 * IQR above the
    75th percentile or more than 1.5 * IQR below the 25th percentile of the
    column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is modified in place.
    Age : str
        Name of the numeric column to screen. The parameter keeps its
        historical name, but any column label is accepted.

    Returns
    -------
    None
    """
    column = data[Age]
    # Both fences come from the same, unmodified column. Dropping the upper
    # tail first and then recomputing the percentiles would shift the lower
    # fence.
    q1, q3 = np.percentile(column, [25, 75])
    whisker = 1.5 * (q3 - q1)
    outlier_rows = data.index[(column > q3 + whisker) | (column < q1 - whisker)]
    data.drop(outlier_rows, inplace=True)

# Remove the outlier rows from ``data`` in place, then redraw the Age box
# plot for comparison with the one drawn before the removal.
drop_outliers(data, 'Age')

sns.boxplot(data["Age"]).set_title("Box plot After Outlier removing")
plt.show()


# 2. Transforming values:

1) Scaling.

2) Log transformation.

3) Cube Root Normalization.

4) Box-Cox transformation.

### 1) Scaling.

In [1]:
from sklearn.preprocessing import StandardScaler
# Scaler instance used by the KM scaling cell that follows.
scaler = StandardScaler()

In [1]:
# KM distribution on its original scale.
plt.hist(x=data["KM"])
plt.gca().set_title("Histogram plot before scaling")
plt.show()

# The scaler expects a 2-D array, so KM is reshaped into a single column
# before being fitted and transformed; the result overwrites KM.
km_as_column = data["KM"].values.reshape(-1, 1)
data["KM"] = scaler.fit_transform(km_as_column)

# KM distribution after standardisation.
plt.hist(x=data["KM"])
plt.gca().set_title("Histogram Plot afrter Scaling")
plt.show()

### 2) Log transformation.

In [1]:
# ``sns.distplot`` is deprecated (since seaborn 0.11) and scheduled for
# removal; this notebook silences the warning via ``warnings.filterwarnings``.
# ``histplot`` with a KDE overlay on a density scale is the documented
# replacement (``cut=3`` keeps the KDE tails that distplot used to draw).
sns.histplot(x=data["Price"], kde=True, stat="density", kde_kws=dict(cut=3))
plt.title("Distribution plot before Log Transform")
plt.show()

# Overwrite Price with its natural logarithm.
data["Price"] = np.log(data["Price"])

sns.histplot(x=data["Price"], kde=True, stat="density", kde_kws=dict(cut=3))
plt.title("Distribution Plot afrter Transform")
plt.show()

### 3) Cube Root Normalization

In [1]:
# Age distribution before the transform.
plt.hist(x=data["Age"])
plt.gca().set_title("Histogram plot before Cube Root Normalization ")
plt.show()

# Overwrite Age with its cube root (power of one third).
cube_root_exponent = 1 / 3
data['Age'] = data['Age'] ** cube_root_exponent

# Age distribution after the transform.
plt.hist(x=data["Age"])
plt.gca().set_title("Histogram Plot afrter Cube Root Normalization")
plt.show()

### 4) Box-Cox transformation.

In [1]:
# Import the submodule explicitly: a bare ``import scipy`` does not guarantee
# that ``scipy.stats`` is loaded on every SciPy version. This still binds the
# top-level ``scipy`` name.
import scipy.stats

sns.boxplot(data['Price'])
plt.title("Box plot before Box - Transformation")
plt.show()

# With lmbda=None, boxcox fits the lambda itself and returns both the
# transformed values and that fitted lambda. Box-Cox requires strictly
# positive input.
data['Price'], fitted_lambda = scipy.stats.boxcox(data['Price'], lmbda=None)

sns.boxplot(data['Price'])
plt.title("Box plot After Box - Transformation")
plt.show()

# 3. Imputation:
1) Mean.

2) Median.

3) Zero

## 1) Mean

In [1]:
sns.boxplot(data['CC'])
plt.title("Box plot Before mean imputation")
plt.show()

# The 1.5 * IQR fences and the fill value are computed once, from the column
# as it stands before any replacement. Recomputing them per row while the
# column is being rewritten is quadratic and makes the result depend on row
# order.
cc = data['CC']
q1 = cc.quantile(0.25)
q3 = cc.quantile(0.75)
iqr = q3 - q1
lower_tail = q1 - 1.5 * iqr
upper_tail = q3 + 1.5 * iqr

# Replace every value outside the fences with the mean of the whole column.
is_outlier = (cc > upper_tail) | (cc < lower_tail)
data['CC'] = cc.mask(is_outlier, cc.mean())

sns.boxplot(data['CC'])
plt.title("Box plot after mean imputation")
plt.show()

## 2) Median

In [1]:
sns.boxplot(data['HP'])
# This cell imputes with the median, so the titles say so.
plt.title("Box plot Before median imputation")
plt.show()

# The 1.5 * IQR fences and the fill value are computed once, from the column
# as it stands before any replacement. Recomputing them per row while the
# column is being rewritten is quadratic and makes the result depend on row
# order.
hp = data['HP']
q1 = hp.quantile(0.25)
q3 = hp.quantile(0.75)
iqr = q3 - q1
lower_tail = q1 - 1.5 * iqr
upper_tail = q3 + 1.5 * iqr

# Replace every value outside the fences with the median of the whole column.
is_outlier = (hp > upper_tail) | (hp < lower_tail)
data['HP'] = hp.mask(is_outlier, hp.median())

sns.boxplot(data['HP'])
plt.title("Box plot after median imputation")
plt.show()

## 3) Zero

In [1]:
sns.boxplot(data['Age'])
# This cell imputes with zero, so the titles say so.
plt.title("Box plot Before zero imputation")
plt.show()

# The 1.5 * IQR fences are computed once, from the column as it stands
# before any replacement. Recomputing them per row while the column is being
# rewritten is quadratic and makes the result depend on row order.
age = data['Age']
q1 = age.quantile(0.25)
q3 = age.quantile(0.75)
iqr = q3 - q1
lower_tail = q1 - 1.5 * iqr
upper_tail = q3 + 1.5 * iqr

# Replace every value outside the fences with 0.
is_outlier = (age > upper_tail) | (age < lower_tail)
data['Age'] = age.mask(is_outlier, 0)

sns.boxplot(data['Age'])
plt.title("Box plot after zero imputation")
plt.show()