In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Data Description

|Feature                 |Description|
|------------------------|-------------------------------------------------------------------------|
|maker                   |normalized all lowercase|
|model                   |normalized all lowercase|
|mileage                 |in KM|
|manufacture_year        |year|
|engine_displacement     |in ccm|
|engine_power            |in kW|
|body_type               |almost never present, but I scraped only personal cars, no motorcycles or utility vehicles|
|color_slug              |also almost never present|
|stk_year                |year of the last emission control|
|transmission            |automatic or manual|
|door_count              |                   |
|seat_count              |                   |
|fuel_type               |gasoline, diesel, cng, lpg, electric|
|date_created            |when the ad was scraped|
|date_last_seen          |when the ad was last seen. Our policy was to remove all ads older than 60 days|
|price_eur               |list price converted to EUR|

<h2 style="color:blue" align="blue"> 1. Loading Required Libraries </h2>

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from scipy import stats

<h2 style="color:blue" align="blue"> 2. Read Data </h2>

In [None]:
# Load the raw CSV with pandas' inferred dtypes and preview the first rows.
# NOTE(review): the same file is read a second time further down into `df`
# with explicit dtypes, so `cars` and `df` are two independent copies.
cars = pd.read_csv("/kaggle/input/personal-cars-classifieds/all_anonymized_2015_11_2017_03.csv")
cars.head()

In [None]:
# Preview the last 5 rows of the raw frame
cars.tail()

In [None]:
# Dimensions of the raw frame as (rows, columns)
cars.shape

In [None]:
# Column labels read from the CSV header
cars.columns

In [None]:
# Print pandas' concise summary of the raw frame (columns and their inferred dtypes)
cars.info()

### Data Cleaning

In [None]:
# Re-read the CSV with an explicit dtype for every column instead of relying
# on pandas' type inference.
filename = "/kaggle/input/personal-cars-classifieds/all_anonymized_2015_11_2017_03.csv"

# Columns parsed as floating point numbers.
float_columns = [
    "mileage",              # km
    "manufacture_year",
    "engine_displacement",
    "engine_power",
    "price_eur",            # list price converted to EUR
]

# Columns kept as plain text.
text_columns = [
    "maker",                # brand name
    "model",
    "body_type",            # almost never present
    "color_slug",           # also almost never present
    "stk_year",
    "transmission",         # automatic or manual
    "door_count",
    "seat_count",
    "fuel_type",            # gasoline or diesel
    "date_created",         # when the ad was scraped
    "date_last_seen",       # when the ad was last seen
]

# Column name -> Python type mapping handed to read_csv.
dtypes = {
    **dict.fromkeys(text_columns, str),
    **dict.fromkeys(float_columns, float),
}

df = pd.read_csv(filename, dtype=dtypes)
print(f"Raw data has {df.shape[0]} rows, and   {df.shape[1]} columns")

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the typed frame `df`
df.describe()

<h2 style="color:blue" align="blue"> 3. EDA (Exploratory Data Analysis) </h2>

### a) Missing Values

In [None]:
# Number of missing values in each column of `df`
df.isnull().sum()

### b) Correlation Matrix

In [None]:
# Correlation heatmap of the numeric columns of `cars`.
# DataFrame.corr() is only defined for numeric data. `cars` also contains text
# columns (maker, model, dates, ...) and recent pandas versions raise on them
# instead of silently dropping them, so select the numeric columns explicitly.
numeric_corr = cars.select_dtypes(include="number").corr()

plt.figure(figsize=(15,14))
# `linewidths` is seaborn's documented keyword for the gap between cells.
sns.heatmap(numeric_corr, annot=True, cmap='viridis', fmt='.1f', linewidths=1, square=True)

plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.show()

### c) Find Outliers

In [None]:
# One box plot per column of `cars` on a shared axis, to eyeball outliers.
fig, ax = plt.subplots(figsize=(25, 13))
sns.boxplot(data=cars, ax=ax)

ax.set_title("Classified Ads for cars", fontsize=18, fontweight='bold')

# Same 12pt tick labels on both axes.
ax.tick_params(axis='both', labelsize=12)

plt.show()

## Detection of outliers based on Distributions
### i. Normally Distributed data :
- For normally distributed data, any data point lying outside the range $[\mu - 3\sigma,\ \mu + 3\sigma]$ is considered an outlier.

In [None]:
# Three side-by-side diagnostics for price_eur, to judge whether it is
# normally distributed and whether it has outliers:
# histogram + KDE, box plot, and a normal Q-Q plot.
fig, (ax_hist, ax_box, ax_qq) = plt.subplots(1, 3, figsize=(16, 4))

# histplot with a KDE overlay replaces the deprecated sns.distplot;
# stat="density" keeps the y-axis on the density scale distplot used.
sns.histplot(x=df["price_eur"], bins=30, kde=True, stat="density", ax=ax_hist)
ax_hist.set_title('Histogram')

sns.boxplot(y=df["price_eur"], ax=ax_box)
ax_box.set_title('Boxplot')

# Q-Q plot against the normal distribution. probplot does not skip NaN,
# so drop any missing prices before handing the values over.
stats.probplot(df["price_eur"].dropna(), dist="norm", plot=ax_qq)
# The label used to read 'RM quantiles'; there is no RM column in this data.
ax_qq.set_ylabel('price_eur quantiles')
plt.show()

### To find minimum and maximum boundary value

In [None]:
# outlier boundary value for normally distributed dataset
def min_max_boundary(data,col):
    """Return the three-sigma outlier bounds of one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that holds the column to analyse.
    col : str
        Name of the column in ``data``.

    Returns
    -------
    tuple
        ``(min_value, max_value)`` where ``min_value = mean - 3 * std`` and
        ``max_value = mean + 3 * std`` of ``data[col]``.
    """
    # Use the frame that was passed in. The previous version ignored `data`
    # and always read the notebook-global `df`.
    center = data[col].mean()
    spread = 3 * data[col].std()
    min_value = center - spread
    max_value = center + spread
    return min_value,max_value
# Display the (min_value, max_value) three-sigma bounds for the listing price
min_max_boundary(df,"price_eur")

### Removing outliers :

In [None]:
# Keep only the rows whose price lies strictly inside the three-sigma band
# (above the minimum boundary value and below the maximum boundary value).
# The previous condition, (price > 100) & (price < 50), can never be true:
# it emptied `df` and left the plots below with nothing to draw.
# NOTE: this reassigns `df`, so re-running the cell recomputes the band on the
# already-filtered data and trims it further.
price_min, price_max = min_max_boundary(df, "price_eur")
df = df[(df["price_eur"] > price_min) & (df["price_eur"] < price_max)]

# Plot df["price_eur"] again after removing the outliers.
fig, (ax_hist, ax_box, ax_qq) = plt.subplots(1, 3, figsize=(16, 4))

# histplot with a KDE overlay replaces the deprecated sns.distplot.
sns.histplot(x=df["price_eur"], bins=30, kde=True, stat="density", ax=ax_hist)
ax_hist.set_title('Histogram')

sns.boxplot(y=df["price_eur"], ax=ax_box)
ax_box.set_title('Boxplot')

stats.probplot(df["price_eur"], dist="norm", plot=ax_qq)
# The label used to read 'RM quantiles'; there is no RM column in this data.
ax_qq.set_ylabel('price_eur quantiles')
plt.show()

## ii. Skewed Distributed data :
- If a value does not lie within the range $[Q_1 - 1.5 \cdot \mathrm{IQR},\ Q_3 + 1.5 \cdot \mathrm{IQR}]$, where $Q_1$ and $Q_3$ are the 25th and 75th percentiles, the data point is considered an outlier.
- Here $\mathrm{IQR} = Q_3 - Q_1$.

In [None]:
# Three side-by-side diagnostics for engine_power, to look for outliers:
# histogram + KDE, box plot, and a normal Q-Q plot.
fig, (ax_hist, ax_box, ax_qq) = plt.subplots(1, 3, figsize=(16, 4))

# histplot with a KDE overlay replaces the deprecated sns.distplot;
# stat="density" keeps the y-axis on the density scale distplot used.
sns.histplot(x=df["engine_power"], bins=30, kde=True, stat="density", ax=ax_hist)
ax_hist.set_title('Histogram',fontsize=20)

sns.boxplot(y=df["engine_power"], ax=ax_box)
ax_box.set_title('Boxplot',fontsize=20)

# Q-Q plot against the normal distribution. probplot does not skip NaN,
# so drop any missing engine_power values before handing them over.
stats.probplot(df["engine_power"].dropna(), dist="norm", plot=ax_qq)
ax_qq.set_title("Q-Q plot",fontsize=20)
plt.show()

### To find minimum and maximum boundary value

In [None]:
#finding upper and lower boundary limit
def non_normal_outliers(data,col):
    """Describe the IQR-based outlier fences of one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that holds the column to analyse.
    col : str
        Name of the column in ``data``.

    Returns
    -------
    str
        A message containing the lower fence ``Q1 - 1.5 * IQR`` and the upper
        fence ``Q3 + 1.5 * IQR``, where ``IQR = Q3 - Q1``.
    """
    # Quartiles come from the frame that was passed in, not the global `df`.
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    IQR = q3 - q1
    # The lower fence sits below Q1 and the upper fence above Q3; the two
    # were previously swapped.
    lower_limit = q1 - (1.5*IQR)
    upper_limit = q3 + (1.5*IQR)
    # Single-line literal: the message used to be split across two source
    # lines inside the quotes, which is a syntax error.
    return "lower limit of dataset : {0},  upper limit of dataset {1}".format(lower_limit,upper_limit)

non_normal_outliers(df,"engine_power")

### Removing outliers :

In [None]:
# Keep only the rows whose engine_power lies strictly inside the IQR fences.
# `list1` is never defined in this notebook and non_normal_outliers() returns
# a formatted string, so the numeric fences are computed here directly.
power_q1 = df["engine_power"].quantile(0.25)
power_q3 = df["engine_power"].quantile(0.75)
power_iqr = power_q3 - power_q1
power_lower = power_q1 - 1.5 * power_iqr
power_upper = power_q3 + 1.5 * power_iqr

# Comparisons with NaN are False, so rows without engine_power are dropped too.
df = df.loc[(df["engine_power"] < power_upper) & (df["engine_power"] > power_lower)]

# Plot the dataset again after eliminating the outliers.
fig, (ax_hist, ax_box, ax_qq) = plt.subplots(1, 3, figsize=(16, 4))

# histplot with a KDE overlay replaces the deprecated sns.distplot.
sns.histplot(x=df["engine_power"], bins=30, kde=True, stat="density", ax=ax_hist)
ax_hist.set_title('Histogram')

sns.boxplot(y=df["engine_power"], ax=ax_box)
ax_box.set_title('Boxplot')

stats.probplot(df["engine_power"], dist="norm", plot=ax_qq)
ax_qq.set_ylabel('engine_power')
plt.show()

### d) Skew and Kurtosis

In [None]:
# Skewness of every numeric column.
# `cars` mixes text and numeric columns, and recent pandas versions no longer
# drop the non-numeric ones automatically, so restrict to numeric dtypes first.
cars.select_dtypes(include="number").skew()

In [None]:
# Skewness of the mileage column on its own
cars["mileage"].skew()

In [None]:
# Kurtosis of the mileage column
cars["mileage"].kurt()

<h2 style="color:blue" align="blue"> 4. Data Preprocessing </h2>

### a) maker

In [None]:
# Number of distinct values in the maker column
cars['maker'].nunique()

In [None]:
# Bar chart of how many ads exist per maker, followed by the exact counts.
plt.figure(figsize=(25,15))
# Pass the column as `x`: current seaborn releases treat the first positional
# argument as `data`, so a bare Series is not reliably read as the categories.
sns.countplot(x=cars['maker'])

plt.xlabel('Maker', fontsize=25, fontweight='bold')
plt.ylabel('count', fontsize=25, fontweight='bold')

plt.title('Maker Vs Count', fontsize=30, fontweight='bold')

plt.xticks(rotation=75, fontsize=20)
plt.yticks(fontsize=20)
plt.show()

print(cars['maker'].value_counts())

### b) model

In [None]:
# Number of distinct values in the model column
cars['model'].nunique()

In [None]:
# Bar chart of how many ads exist per model, followed by the exact counts.
plt.figure(figsize=(25,15))
# Pass the column as `x`: current seaborn releases treat the first positional
# argument as `data`, so a bare Series is not reliably read as the categories.
sns.countplot(x=cars['model'])

plt.xlabel('model', fontsize=25, fontweight='bold')
plt.ylabel('count', fontsize=25, fontweight='bold')

plt.title('model Vs Count', fontsize=30, fontweight='bold')

plt.xticks(rotation=75, fontsize=20)
plt.yticks(fontsize=20)
plt.show()

print(cars['model'].value_counts())

### c) door_count

In [None]:
# Number of distinct values in the door_count column
cars['door_count'].nunique()

In [None]:
# Bar chart of how many ads exist per door_count value, followed by the exact counts.
plt.figure(figsize=(20,12))
# Pass the column as `x`: current seaborn releases treat the first positional
# argument as `data`, so a bare Series is not reliably read as the categories.
sns.countplot(x=cars['door_count'])

plt.xlabel('door_count', fontsize=25, fontweight='bold')
plt.ylabel('count', fontsize=25, fontweight='bold')

plt.title('door_count Vs Count', fontsize=30, fontweight='bold')

plt.xticks(rotation=90, fontsize=20)
plt.yticks(fontsize=20)
plt.show()

print(cars['door_count'].value_counts())

### d) seat_count

In [None]:
# Number of distinct values in the seat_count column
cars['seat_count'].nunique()

In [None]:
# Bar chart of how many ads exist per seat_count value, followed by the exact counts.
plt.figure(figsize=(20,12))
# Pass the column as `x`: current seaborn releases treat the first positional
# argument as `data`, so a bare Series is not reliably read as the categories.
sns.countplot(x=cars['seat_count'])

plt.xlabel('seat_count', fontsize=25, fontweight='bold')
plt.ylabel('count', fontsize=25, fontweight='bold')

plt.title('seat_count Vs Count', fontsize=30, fontweight='bold')

plt.xticks(rotation=90, fontsize=20)
plt.yticks(fontsize=20)
plt.show()

print(cars['seat_count'].value_counts())

### e) fuel_type

In [None]:
# Number of distinct values in the fuel_type column
cars['fuel_type'].nunique()

In [None]:
# Bar chart of how many ads exist per fuel_type, followed by the exact counts.
plt.figure(figsize=(20,12))
# Pass the column as `x`: current seaborn releases treat the first positional
# argument as `data`, so a bare Series is not reliably read as the categories.
sns.countplot(x=cars['fuel_type'])

plt.xlabel('fuel_type', fontsize=25, fontweight='bold')
plt.ylabel('count', fontsize=25, fontweight='bold')

plt.title('fuel_type Vs Count', fontsize=30, fontweight='bold')

plt.xticks(rotation=90, fontsize=20)
plt.yticks(fontsize=20)
plt.show()

print(cars['fuel_type'].value_counts())