In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file found under the Kaggle input directory,
# then print one path per line.
input_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Exploratory Data Analysis
In this notebook we perform an exploratory data analysis of a used-car catalog dataset.


In [None]:
# Load the used-car catalog. pd.read_csv already returns a DataFrame, so no
# further conversion is needed.
D = pd.read_csv('../input/usedcarscatalog/cars.csv')
# Work on an independent copy so the cleaning steps further down never alter
# the raw frame `D`. Re-wrapping with pd.DataFrame(data=D) does not copy the
# underlying data.
df = D.copy()
# Display the first 5 rows of the data
df.head()

In [None]:
# Summary statistics of the dataset, as a first sanity check before using it
df.describe()

In [None]:
# Number of missing (NaN) entries in each column of the dataset
missing_values_count = df.isna().sum()
missing_values_count

In [None]:
# According to the original author's run, `engine_capacity` is the only column
# with missing values (10 of them) - confirm against the output above.
# Compute the column mean so it can be used as the replacement value.
mean = df.loc[:, 'engine_capacity'].mean()
mean

In [None]:
# Replace the missing engine capacities with the column mean.
# The result is assigned back to the column rather than calling an in-place
# method on `df['engine_capacity']`: that selection can be a temporary object,
# so an in-place update may never reach `df` (always the case under pandas
# Copy-on-Write).
df['engine_capacity'] = df['engine_capacity'].fillna(mean)
# Visual check: the missingno matrix should show no gaps for engine_capacity
import missingno as msno
msno.matrix(df)


In [None]:
# Box plot of the price distribution for each transmission type.
# The original author's reading of this plot: automatic cars tend to be more
# expensive than mechanical ones.
import seaborn as sns
sns.boxplot(data=df, x="transmission", y="price_usd")

In [None]:
# Group engine capacity into three equal-width ranges so prices can be compared
# per capacity group.
# Series.min()/.max() are vectorised and skip NaN; the Python built-ins iterate
# element by element and give order-dependent results when NaN is present.
bins = np.linspace(df['engine_capacity'].min(),
                   df['engine_capacity'].max(), 4)  # 4 edges -> 3 bins
# Labels for the three bins, from the lowest range to the highest
group_names = ['Low', 'Medium', 'High']
# include_lowest=True keeps the minimum value inside the first bin
df['engine_binned'] = pd.cut(df['engine_capacity'],
                             bins, labels=group_names,
                             include_lowest=True)

# Box plot of the price distribution for each engine-capacity group
sns.boxplot(x="engine_binned", y="price_usd", data=df)

In [None]:
# Show the raw engine capacity next to the group it was assigned to, as a
# quick check of the binning
df.loc[:, ['engine_capacity', 'engine_binned']].head()

We drop the `feature_0` … `feature_9` columns because the dataset gives no description of what they are, and we cannot draw conclusions from features whose meaning is unknown.

In [None]:
# feature_0 ... feature_9 are undocumented, so remove them all in one call.
# errors='ignore' together with re-assignment keeps this cell safe to re-run:
# a second run finds the columns already gone instead of raising KeyError.
undocumented_columns = [f'feature_{i}' for i in range(10)]
df = df.drop(columns=undocumented_columns, errors='ignore')

# Correlation

Using pandas' built-in `corr` function, we can see the correlation between the numeric variables in our dataset. A correlation value close to 1 means that the two variables are positively correlated, a value close to -1 means that they are negatively correlated, and a value close to 0 means there is little or no linear relationship.

In [None]:
# Pairwise correlation matrix of the dataset.
# `engine_binned` is categorical (and other columns may hold text), which makes
# a bare df.corr() raise on pandas >= 2.0. Older pandas versions silently kept
# only the numeric and boolean columns, so select that subset explicitly.
corr = df.select_dtypes(include=['number', 'bool']).corr()
corr

In [None]:
# Heat map of the correlation matrix
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(8, 8))
# Pin the colour scale to [-1, 1] so the midpoint of the diverging colormap
# corresponds to zero correlation, and add a colorbar so colours can be read
# as values.
im = plt.matshow(corr, cmap='RdBu', vmin=-1, vmax=1, fignum=fig.number)
plt.colorbar(im, label='correlation')
# Label every row and column with its variable name
tick_positions = range(len(corr.columns))
plt.xticks(tick_positions, corr.columns, rotation='vertical')
plt.yticks(tick_positions, corr.columns);


In [None]:
## Examples of positive and negative correlation

In [None]:
# Price against odometer reading. The original author reads a negative
# correlation here: the higher the odometer value, the lower the price.
df.plot.scatter(x='odometer_value', y='price_usd', figsize=(6, 6))

In [None]:
# Price against production year. The original author reads a positive
# correlation here: the newer the car, the higher the price.
df.plot.scatter(x='year_produced', y='price_usd', figsize=(6, 6))