In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Lift the column display limit so wide DataFrames are rendered in full
pd.set_option('display.max_columns', None)

In [None]:
# Location of the car price CSV under the Kaggle input directory
data_path = '/kaggle/input/car-price-prediction/CarPrice_Assignment.csv'

# Load data into DataFrame

In [None]:
# Read the CSV at data_path into the working DataFrame `df`
df = pd.read_csv(data_path)

In [None]:
# Preview the first five rows of the loaded data
df.head()

In [None]:
# (rows, columns) of the loaded frame
df.shape

There are 205 rows and 26 columns in the dataset.

In [None]:
# Print each column's name, non-null count and dtype, plus the frame's memory usage
df.info()

From the output above, we can see that the columns have mixed data types.

| DataType | Count
| :--- | ---:
| Numerical - Int | 8
| Numerical - Float | 8
| Categorical | 10

In [None]:
# Number of distinct values in each column (missing values are not counted)
df.nunique()

From the counts above, the 'car_ID' column has a unique value for every row, so it acts only as a row identifier and does not help with inference; it can be dropped.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical columns
df.describe()

The summary above hints that a few columns contain outliers.

In [None]:
# Summary (count, unique, top, freq) for the string-typed (object) columns
df.describe(include='object')

In [None]:
# Count rows that are exact duplicates of an earlier row
df.duplicated().sum()

No row is duplicated in the dataset.

Now, let's first drop the 'car_ID' column.

In [None]:
# Drop the row-identifier column.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='car_ID', errors='ignore')

In [None]:
# Re-check (rows, columns) after dropping 'car_ID'
df.shape

Let's check the 'CarName' column and see if we can extract the brand name from it.

In [None]:
# Look at the first ten car names to see how they are formatted
df['CarName'].head(10)

Let's split the 'CarName' values on whitespace and take the first word as the brand name.

In [None]:
# The brand is the first whitespace-separated token of the car name.
# The vectorised .str accessor replaces the per-row lambda and yields NaN for a
# missing or empty name instead of raising.
df['CarBrand'] = df['CarName'].str.split().str[0]

In [None]:
# Frequency of each extracted brand; scan the list for misspelled or differently-cased variants
df["CarBrand"].value_counts()

In [None]:
# Canonical brand name -> variant spellings that should be folded into it
brand_aliases = {
    'toyota': ('toyouta',),
    'nissan': ('Nissan',),
    'mazda': ('maxda',),
    'volkswagen': ('vokswagen', 'vw'),
    'porsche': ('porcshce',),
}

# Flatten into the variant -> canonical lookup used to clean the 'CarBrand' column
brand_map = {
    alias: brand
    for brand, aliases in brand_aliases.items()
    for alias in aliases
}

In [None]:
# Normalise brand spellings: names listed in brand_map are replaced, all others pass through unchanged
df['CarBrand'] = df['CarBrand'].map(lambda brand: brand_map.get(brand, brand))

In [None]:
# Re-check the brand frequencies after applying brand_map
df['CarBrand'].value_counts()

Now we can remove the 'CarName' column.

In [None]:
# Drop the raw car name now that the brand has been extracted into 'CarBrand'.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='CarName', errors='ignore')

In [None]:
# Preview the first five rows after the column changes
df.head()

In [None]:
# Count missing values in each column
df.isna().sum()

From the output above, it is clear that there are no missing values in any column.

In [None]:
# Preview only the string-typed (object) columns
df.select_dtypes(include='object').head()

## Plots for columns

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Distribution of the target variable
sns.histplot(data=df, x='price')

In [None]:
# Horizontal box plot of price to make extreme values visible
sns.boxplot(data=df, x='price')

In [None]:
# Summary statistics of price, to compare the mean against the median (50%)
df['price'].describe()

The plots and summary above show that the mean is greater than the median, so the price distribution is right-skewed; the skew is caused by a few high-priced outliers.

In [None]:
# List the column names currently in the frame
df.columns

In [None]:
# Number of cars at each symboling level.
# The column is passed by keyword: in seaborn >= 0.12 the only positional argument
# is `data`, so a bare Series is no longer bound to `x`.
sns.countplot(x='symboling', data=df)

In [None]:
# Price distribution for each symboling level
sns.boxplot(data=df, x='symboling', y='price')

The plot shows that car price varies across the different values of symboling.

In [None]:
# Price distribution for each fuel type
sns.boxplot(data=df, x='fueltype', y='price')

Cars with fuel type 'gas' tend to have lower prices than 'diesel' cars.

In [None]:
# Price distribution for each aspiration type
sns.boxplot(data=df, x='aspiration', y='price')

In [None]:
# Correlation heat map of the numeric features.
# Numeric columns are selected explicitly: DataFrame.corr() in pandas >= 2.0 no longer
# skips string columns by default and raises on them instead.
plt.figure(figsize=(12, 8))
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True)

In [None]:
# Columns treated as categorical: string (object) columns plus int64 columns.
# select_dtypes is used instead of `df.dtypes.isin([...])` because isin performs a
# hash-based lookup and numpy dtype objects do not hash like their string names,
# so that mask is not reliable.
# NOTE(review): this also picks up every int64 column, including any that are really
# continuous measurements rather than categories — confirm this is intended.
cat_cols = df.select_dtypes(include=['object', 'int64']).columns
cat_cols

In [None]:
# Columns stored as float64, used here as the numerical features
numerical_cols = df.select_dtypes(include='float64').columns
numerical_cols

## Plots for categorical features

In [None]:
# Report how many columns will get a box plot in the grid below
print(f'Total categorical columns: {len(cat_cols)}')

In [None]:
# Grid layout for the subplot figure: 4 panels per row, with enough rows to hold every column in cat_cols
total_cols = len(cat_cols)
ncols = 4
nrows = -(-total_cols // ncols)  # integer ceiling division
print(total_cols, nrows, ncols)

In [None]:
# Draw one price box plot per column in cat_cols on a grid that shares the y axis.
# squeeze=False keeps `axs` two-dimensional even when the grid has a single row,
# so flattening it is always valid; ravel() walks the grid row by row.
fig, axs = plt.subplots(ncols=ncols, nrows=nrows, sharey=True, figsize=(20, 20), squeeze=False)
flat_axes = axs.ravel()
for ax, col in zip(flat_axes, cat_cols):
    sns.boxplot(y='price', x=col, data=df, ax=ax)
    # Many-valued columns produce overlapping tick labels; rotate them to stay legible
    ax.tick_params(axis='x', rotation=90)
# Hide the trailing panels that have no column assigned to them
for ax in flat_axes[len(cat_cols):]:
    ax.set_visible(False)
plt.tight_layout()
plt.show()