# Exploratory Data Analysis

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 6, 'display.max_rows', 6, 'display.max_colwidth', 12)

## Introduction

## Summary Statistics

### How to do it...

In [2]:
fueleco = pd.read_csv('../data/vehicles.csv.zip', low_memory=False)
fueleco.head()

Unnamed: 0,barrels08,barrelsA08,charge120,...,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,...,0,0,0
1,29.964545,0.0,0.0,...,0,0,0
2,12.207778,0.0,0.0,...,0,0,0
3,29.964545,0.0,0.0,...,0,0,0
4,17.347895,0.0,0.0,...,0,0,0


In [3]:
fueleco.mean() # doctest: +SKIP

TypeError: can only concatenate str (not "int") to str

In [4]:
fueleco.std() # doctest: +SKIP

TypeError: could not convert string to float: 'Rear-Wheel Drive'

In [5]:
fueleco.quantile([0, .25, .5, .75, 1]) # doctest: +SKIP

TypeError: numpy boolean subtract, the `-` operator, is not supported, use the bitwise_xor, the `^` operator, or the logical_xor function instead.

In [6]:
fueleco.describe()  # doctest: +SKIP

Unnamed: 0,barrels08,barrelsA08,charge120,...,phevCity,phevHwy,phevComb
count,39101.00...,39101.00...,39101.0,...,39101.00...,39101.00...,39101.00...
mean,17.442712,0.219276,0.0,...,0.094703,0.094269,0.094141
std,4.580230,1.143837,0.0,...,2.279478,2.191115,2.226500
...,...,...,...,...,...,...,...
50%,17.347895,0.000000,0.0,...,0.000000,0.000000,0.000000
75%,20.115000,0.000000,0.0,...,0.000000,0.000000,0.000000
max,47.087143,18.311667,0.0,...,97.000000,81.000000,88.000000


In [7]:
fueleco.describe(include=object)  # doctest: +SKIP

Unnamed: 0,drive,eng_dscr,fuelType,...,createdOn,modifiedOn,startStop
count,37912,23431,39101,...,39101,39101,7405
unique,7,545,14,...,195,68,2
top,Front-Wh...,(FFS),Regular,...,Tue Jan ...,Tue Jan ...,N
freq,13653,8827,25620,...,34199,29438,5176


### How it works...

### There's more...

In [8]:
fueleco.describe().T    # doctest: +SKIP

Unnamed: 0,count,mean,std,...,50%,75%,max
barrels08,39101.0,17.442712,4.580230,...,17.347895,20.115,47.087143
barrelsA08,39101.0,0.219276,1.143837,...,0.000000,0.000,18.311667
charge120,39101.0,0.000000,0.000000,...,0.000000,0.000,0.000000
...,...,...,...,...,...,...,...
phevCity,39101.0,0.094703,2.279478,...,0.000000,0.000,97.000000
phevHwy,39101.0,0.094269,2.191115,...,0.000000,0.000,81.000000
phevComb,39101.0,0.094141,2.226500,...,0.000000,0.000,88.000000


## Column Types

### How to do it...

In [9]:
fueleco.dtypes

barrels08     float64
barrelsA08    float64
charge120     float64
               ...   
phevCity        int64
phevHwy         int64
phevComb        int64
Length: 83, dtype: object

In [10]:
fueleco.dtypes.value_counts()

float64    32
int64      27
object     23
bool        1
Name: count, dtype: int64

### How it works...

### There's more...

In [11]:
fueleco.select_dtypes('int64').describe().T

Unnamed: 0,count,mean,std,...,50%,75%,max
city08,39101.0,18.077799,6.970672,...,17.0,20.0,150.0
cityA08,39101.0,0.569883,4.297124,...,0.0,0.0,145.0
co2,39101.0,72.538989,163.252019,...,-1.0,-1.0,847.0
...,...,...,...,...,...,...,...
phevCity,39101.0,0.094703,2.279478,...,0.0,0.0,97.0
phevHwy,39101.0,0.094269,2.191115,...,0.0,0.0,81.0
phevComb,39101.0,0.094141,2.226500,...,0.0,0.0,88.0


In [12]:
np.iinfo(np.int8)

iinfo(min=-128, max=127, dtype=int8)

In [13]:
np.iinfo(np.int16)

iinfo(min=-32768, max=32767, dtype=int16)

In [14]:
fueleco[['city08', 'comb08']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39101 entries, 0 to 39100
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   city08  39101 non-null  int64
 1   comb08  39101 non-null  int64
dtypes: int64(2)
memory usage: 611.1 KB


In [15]:
(fueleco
  [['city08', 'comb08']]
  .assign(city08=fueleco.city08.astype(np.int16),
          comb08=fueleco.comb08.astype(np.int16))
  .info()
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39101 entries, 0 to 39100
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   city08  39101 non-null  int16
 1   comb08  39101 non-null  int16
dtypes: int16(2)
memory usage: 152.9 KB


In [None]:
fueleco.make.nunique()

In [None]:
fueleco.model.nunique()

In [None]:
fueleco[['make']].info()

In [None]:
(fueleco
    [['make']]
    .assign(make=fueleco.make.astype('category'))
    .info()
)

In [None]:
fueleco[['model']].info()

In [None]:
(fueleco
    [['model']]
    .assign(model=fueleco.model.astype('category'))
    .info()
)

## Categorical Data

### How to do it...

In [None]:
fueleco.select_dtypes(object).columns

In [None]:
fueleco.drive.nunique()

In [None]:
fueleco.drive.sample(5, random_state=42)

In [None]:
fueleco.drive.isna().sum()

In [None]:
fueleco.drive.isna().mean() * 100

In [None]:
fueleco.drive.value_counts()

In [None]:
top_n = fueleco.make.value_counts().index[:6]
(fueleco
   .assign(make=fueleco.make.where(
              fueleco.make.isin(top_n),
              'Other'))
   .make
   .value_counts()
)

In [None]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 8))
top_n = fueleco.make.value_counts().index[:6]
(fueleco     # doctest: +SKIP
   .assign(make=fueleco.make.where(
              fueleco.make.isin(top_n),
              'Other'))
   .make
   .value_counts()
   .plot.bar(ax=ax)
)
fig.savefig('/tmp/c5-catpan.png', dpi=300)     # doctest: +SKIP

In [None]:
import seaborn as sns
fig, ax = plt.subplots(figsize=(10, 8))
top_n = fueleco.make.value_counts().index[:6]
sns.countplot(y='make',     # doctest: +SKIP
  data= (fueleco
   .assign(make=fueleco.make.where(
              fueleco.make.isin(top_n),
              'Other'))
  )
)
fig.savefig('/tmp/c5-catsns.png', dpi=300)    # doctest: +SKIP

### How it works...

In [None]:
fueleco[fueleco.drive.isna()]

In [None]:
fueleco.drive.value_counts(dropna=False)

### There's more...

In [None]:
fueleco.rangeA.value_counts()

In [None]:
(fueleco
 .rangeA
 .str.extract(r'([^0-9.])')
 .dropna()
 .apply(lambda row: ''.join(row), axis=1)
 .value_counts()
)

In [None]:
set(fueleco.rangeA.apply(type))

In [None]:
fueleco.rangeA.isna().sum()

In [None]:
(fueleco
  .rangeA
  .fillna('0')
  .str.replace('-', '/')
  .str.split('/', expand=True)
  .astype(float)
  .mean(axis=1)
)

In [None]:
(fueleco
  .rangeA
  .fillna('0')
  .str.replace('-', '/')
  .str.split('/', expand=True)
  .astype(float)
  .mean(axis=1)
  .pipe(lambda ser_: pd.cut(ser_, 10))
  .value_counts()
)

In [None]:
(fueleco
  .rangeA
  .fillna('0')
  .str.replace('-', '/')
  .str.split('/', expand=True)
  .astype(float)
  .mean(axis=1)
  .pipe(lambda ser_: pd.qcut(ser_, 10))
  .value_counts()
)

In [None]:
(fueleco
  .city08
  .pipe(lambda ser: pd.qcut(ser, q=10))
  .value_counts()
)

## Continuous Data

### How to do it...

In [None]:
fueleco.select_dtypes('number')

In [None]:
fueleco.city08.sample(5, random_state=42)

In [None]:
fueleco.city08.isna().sum()

In [None]:
fueleco.city08.isna().mean() * 100

In [None]:
fueleco.city08.describe()

In [None]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 8))
fueleco.city08.hist(ax=ax)
fig.savefig('/tmp/c5-conthistpan.png', dpi=300)     # doctest: +SKIP

In [None]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 8))
fueleco.city08.hist(ax=ax, bins=30)
fig.savefig('/tmp/c5-conthistpanbins.png', dpi=300)     # doctest: +SKIP

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))
sns.distplot(fueleco.city08, rug=True, ax=ax)
fig.savefig('/tmp/c5-conthistsns.png', dpi=300)     # doctest: +SKIP

### How it works...

### There's more...

In [None]:
fig, axs = plt.subplots(nrows=3, figsize=(10, 8))
sns.boxplot(fueleco.city08, ax=axs[0])
sns.violinplot(fueleco.city08, ax=axs[1])
sns.boxenplot(fueleco.city08, ax=axs[2])
fig.savefig('/tmp/c5-contothersns.png', dpi=300)     

In [None]:
from scipy import stats
stats.kstest(fueleco.city08, cdf='norm')

In [None]:
from scipy import stats
fig, ax = plt.subplots(figsize=(10, 8))
stats.probplot(fueleco.city08, plot=ax)
fig.savefig('/tmp/c5-conprob.png', dpi=300)    

## Comparing Continuous Values across Categories

### How to do it...

In [None]:
mask = fueleco.make.isin(['Ford', 'Honda', 'Tesla', 'BMW'])
fueleco[mask].groupby('make').city08.agg(['mean', 'std'])

In [None]:
g = sns.catplot(x='make', y='city08', 
  data=fueleco[mask], kind='box')
g.ax.figure.savefig('/tmp/c5-catbox.png', dpi=300)     

### How it works...

### There's more...

In [None]:
mask = fueleco.make.isin(['Ford', 'Honda', 'Tesla', 'BMW'])
(fueleco
  [mask]
  .groupby('make')
  .city08
  .count()
)

In [None]:
g = sns.catplot(x='make', y='city08', 
  data=fueleco[mask], kind='box')
sns.swarmplot(x='make', y='city08',    # doctest: +SKIP
  data=fueleco[mask], color='k', size=1, ax=g.ax)
g.ax.figure.savefig('/tmp/c5-catbox2.png', dpi=300)    # doctest: +SKIP  

In [None]:
g = sns.catplot(x='make', y='city08', 
  data=fueleco[mask], kind='box',
  col='year', col_order=[2012, 2014, 2016, 2018],
  col_wrap=2)
g.axes[0].figure.savefig('/tmp/c5-catboxcol.png', dpi=300)    # doctest: +SKIP  

In [None]:
g = sns.catplot(x='make', y='city08', # doctest: +SKIP  
  data=fueleco[mask], kind='box',
  hue='year', hue_order=[2012, 2014, 2016, 2018])
g.ax.figure.savefig('/tmp/c5-catboxhue.png', dpi=300)    # doctest: +SKIP  

In [None]:
mask = fueleco.make.isin(['Ford', 'Honda', 'Tesla', 'BMW'])
(fueleco
  [mask]
  .groupby('make')
  .city08
  .agg(['mean', 'std'])
  .style.background_gradient(cmap='RdBu', axis=0)
)

## Comparing Two Continuous Columns

### How to do it...

In [None]:
fueleco.city08.cov(fueleco.highway08)

In [None]:
fueleco.city08.cov(fueleco.comb08)

In [None]:
fueleco.city08.cov(fueleco.cylinders)

In [None]:
fueleco.city08.corr(fueleco.highway08)

In [None]:
fueleco.city08.corr(fueleco.cylinders)

In [None]:
import seaborn as sns
fig, ax = plt.subplots(figsize=(8,8))
corr = fueleco[['city08', 'highway08', 'cylinders']].corr()
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr, mask=mask,
    fmt='.2f', annot=True, ax=ax, cmap='RdBu', vmin=-1, vmax=1,
    square=True)
fig.savefig('/tmp/c5-heatmap.png', dpi=300, bbox_inches='tight')

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
fueleco.plot.scatter(x='city08', y='highway08', alpha=.1, ax=ax)
fig.savefig('/tmp/c5-scatpan.png', dpi=300, bbox_inches='tight')

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
fueleco.plot.scatter(x='city08', y='cylinders', alpha=.1, ax=ax)
fig.savefig('/tmp/c5-scatpan-cyl.png', dpi=300, bbox_inches='tight')

In [None]:
fueleco.cylinders.isna().sum()

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
(fueleco
 .assign(cylinders=fueleco.cylinders.fillna(0))
 .plot.scatter(x='city08', y='cylinders', alpha=.1, ax=ax))
fig.savefig('/tmp/c5-scatpan-cyl0.png', dpi=300, bbox_inches='tight')

In [None]:
res = sns.lmplot(x='city08', y='highway08', data=fueleco)
res.fig.savefig('/tmp/c5-lmplot.png', dpi=300, bbox_inches='tight')

### How it works...

In [None]:
fueleco.city08.corr(fueleco.highway08*2)

In [None]:
fueleco.city08.cov(fueleco.highway08*2)

### There's more...

In [None]:
res = sns.relplot(x='city08', y='highway08',
   data=fueleco.assign(
       cylinders=fueleco.cylinders.fillna(0)),
   hue='year', size='barrels08', alpha=.5, height=8)
res.fig.savefig('/tmp/c5-relplot2.png', dpi=300, bbox_inches='tight')

In [None]:
res = sns.relplot(x='city08', y='highway08',
  data=fueleco.assign(
  cylinders=fueleco.cylinders.fillna(0)),
  hue='year', size='barrels08', alpha=.5, height=8,
  col='make', col_order=['Ford', 'Tesla'])
res.fig.savefig('/tmp/c5-relplot3.png', dpi=300, bbox_inches='tight')

In [None]:
fueleco.city08.corr(fueleco.barrels08, method='spearman')

## Comparing Categorical and Categorical Values

### How to do it...

In [None]:
def generalize(ser, match_name, default):
    seen = None
    for match, name in match_name:
        mask = ser.str.contains(match)
        if seen is None:
            seen = mask
        else:
            seen |= mask
        ser = ser.where(~mask, name)
    ser = ser.where(seen, default)
    return ser

In [None]:
makes = ['Ford', 'Tesla', 'BMW', 'Toyota']
data = (fueleco
   [fueleco.make.isin(makes)]
   .assign(SClass=lambda df_: generalize(df_.VClass,
    [('Seaters', 'Car'), ('Car', 'Car'), ('Utility', 'SUV'),
     ('Truck', 'Truck'), ('Van', 'Van'), ('van', 'Van'),
     ('Wagon', 'Wagon')], 'other'))
)

In [None]:
data.groupby(['make', 'SClass']).size().unstack()

In [None]:
pd.crosstab(data.make, data.SClass)

In [None]:
pd.crosstab([data.year, data.make], [data.SClass, data.VClass])

In [None]:
import scipy.stats as ss
import numpy as np
def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x,y)
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2/n
    r,k = confusion_matrix.shape
    phi2corr = max(0, phi2-((k-1)*(r-1))/(n-1))
    rcorr = r-((r-1)**2)/(n-1)
    kcorr = k-((k-1)**2)/(n-1)
    return np.sqrt(phi2corr/min((kcorr-1),(rcorr-1)))

In [None]:
cramers_v(data.make, data.SClass)

In [None]:
fig, ax = plt.subplots(figsize=(10,8))
(data
 .pipe(lambda df_: pd.crosstab(df_.make, df_.SClass))
 .plot.bar(ax=ax)
)
fig.savefig('/tmp/c5-bar.png', dpi=300, bbox_inches='tight')

In [None]:
res = sns.catplot(kind='count',
   x='make', hue='SClass', data=data)
res.fig.savefig('/tmp/c5-barsns.png', dpi=300, bbox_inches='tight')

In [None]:
fig, ax = plt.subplots(figsize=(10,8))
(data
 .pipe(lambda df_: pd.crosstab(df_.make, df_.SClass))
 .pipe(lambda df_: df_.div(df_.sum(axis=1), axis=0))
 .plot.bar(stacked=True, ax=ax)
)
fig.savefig('/tmp/c5-barstacked.png', dpi=300, bbox_inches='tight')

### How it works...

In [None]:
cramers_v(data.make, data.trany)

In [None]:
cramers_v(data.make, data.model)

## Using the Pandas Profiling Library

### How to do it...

In [None]:
import pandas_profiling as pp
pp.ProfileReport(fueleco)

### How it works...

In [None]:
report = pp.ProfileReport(fueleco)
report.to_file('/tmp/fuel.html')