### Importing the required libraries ###

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


### Loading the data into the data frame ###

In [None]:
# Read the car dataset from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../input/cardataset/data.csv')
# Preview the first five rows (five is the default for head()).
df.head()

In [None]:
# Preview the last five rows (five is the default for tail()).
df.tail()

### Checking the types of data ###

#### Here we check the datatypes, because sometimes the MSRP (the price of the car) is stored as a string. In that case we would have to convert the string to an integer before we could plot the data on a graph. In this dataset the price is already in integer format, so there is nothing to worry about. ####

In [None]:
# Show the dtype pandas inferred for each column of the loaded frame.
df.dtypes

### Dropping irrelevant columns ###


#### This step is needed in almost every EDA, because a dataset often contains columns that are never used in the analysis; dropping them keeps the data frame focused on the relevant features. ####

In [None]:
# Remove the columns that are not used in the rest of the analysis.
df = df.drop(
    columns=[
        'Engine Fuel Type',
        'Market Category',
        'Vehicle Style',
        'Popularity',
        'Number of Doors',
        'Vehicle Size',
    ]
)
# Preview the remaining columns.
df.head()

### Renaming the columns- ###


#### In this instance, most of the column names are confusing to read, so I renamed them to shorter, clearer names. ####

In [None]:
# Shorter column labels used throughout the rest of the notebook.
df = df.rename(
    columns={
        "Engine HP": "HP",
        "Engine Cylinders": "Cylinders",
        "Transmission Type": "Transmission",
        "Driven_Wheels": "Drive Mode",
        "highway MPG": "MPG-H",
        "city mpg": "MPG-C",
        "MSRP": "Price",
    }
)
# Preview the frame with the new column names.
df.head()

### Dropping the duplicate rows- ###


#### This is often a handy thing to do, because a huge dataset usually has some duplicate rows that can distort the analysis, so here I remove all duplicate rows. ####

In [None]:
# (rows, columns) of the frame at this point, for comparison with later steps.
df.shape

In [None]:
# Select every row that repeats an earlier row; DataFrame.duplicated()
# marks all occurrences except the first by default.
duplicate_rows_df = df[df.duplicated()]
# Report the row count rather than the (rows, columns) shape tuple, so the
# printed value matches the "number of duplicate rows" label.
print("number of duplicate rows: ", len(duplicate_rows_df))

In [None]:
# Keep only the first occurrence of each repeated row.
df = df.drop_duplicates(keep='first')
# Preview the de-duplicated frame.
df.head()

### Dropping the missing or null values ###


#### Dropping rows is not always the best approach, because many people instead replace the missing values with the mean (average) of that column. ####

In [None]:
# Count the missing entries in each column (isna is the same check as isnull).
missing_per_column = df.isna().sum()
print(missing_per_column)

#### Here we can see that both Cylinders and HP have missing values. Therefore we drop the rows that contain missing values. ####

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how='any')
# Non-null count per column after the drop.
df.count()

In [None]:
# Re-count the missing entries per column after the rows were dropped.
remaining_missing = df.isna().sum()
print(remaining_missing)

### Detecting outliers ###


#### It is good to detect and remove outliers, because outliers are one of the primary reasons for a less accurate model. ####


In [None]:
# Horizontal box plot of the Price column to expose extreme values.
price_values = df['Price']
sns.boxplot(x=price_values)

In [None]:
# Horizontal box plot of the HP column to expose extreme values.
hp_values = df['HP']
sns.boxplot(x=hp_values)

In [None]:
# Horizontal box plot of the Cylinders column to expose extreme values.
cylinder_values = df['Cylinders']
sns.boxplot(x=cylinder_values)

#### The technique I'm going to use now is called the interquartile range (IQR) score technique. ####

In [None]:
# Per-column first and third quartiles. numeric_only=True restricts the
# computation to numeric columns: the frame still holds string columns
# (e.g. Make, Model), and pandas >= 2.0 raises a TypeError on them because
# numeric_only now defaults to False.
Q1 = df.quantile(0.25, numeric_only=True)
Q3 = df.quantile(0.75, numeric_only=True)
# Interquartile range of each numeric column.
IQR = Q3 - Q1
# NOTE: this cell only reports the IQR; no rows are filtered here.
print(IQR)

### Data visualization using scatter and histogram-###

### 1. Histogram- ###


#### A histogram (drawn here as a bar chart of counts) is one of the best ways to see the total number of cars in the dataset for each manufacturer. ####

In [None]:
# Number of rows per manufacturer, limited to the 40 most frequent makes.
make_counts = df['Make'].value_counts().nlargest(40)
# Draw the counts as a bar chart and label the axes it was drawn on.
ax = make_counts.plot.bar(figsize=(10, 5))
ax.set_title("Number of cars by make")
ax.set_ylabel('Number of cars')
ax.set_xlabel('Make');


### Scatter plot- ###


#### A scatter plot helps to find the correlation between two variables. ####

In [None]:
# Scatter of horsepower (x) against price (y) on a dedicated figure.
fig, ax = plt.subplots(figsize=(10, 6))
horsepower = df['HP']
price = df['Price']
ax.scatter(horsepower, price)
ax.set(xlabel='HP', ylabel='Price')
plt.show()

## Heatmap- ##


In [None]:
plt.figure(figsize=(10,5))
# Correlate the numeric columns only. The frame still contains string
# columns (e.g. Make, Model), and DataFrame.corr() raises on them in
# pandas >= 2.0; selecting the numeric columns first works on every version.
c = df.select_dtypes(include="number").corr()
# Annotated heatmap of the pairwise correlation coefficients.
sns.heatmap(c, cmap="BrBG", annot=True )

#### A heatmap is one of the best ways to find the relationships between the features. ####





#### To be continued....... ####