# Data Loading

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Choose where to read the dataset from: the raw GitHub URL when the notebook
# runs on Google Colab, otherwise the repository-relative local path.
try:
    import google.colab  # noqa: F401 -- imported only to detect the Colab runtime
except ImportError:
    # Only "module not available" means we are not on Colab. A bare `except:`
    # would also hide unrelated problems such as KeyboardInterrupt.
    data_file = '../data/preprocessed/adult.feather'
else:
    data_file = 'https://raw.githubusercontent.com/sesise0307/pydata2021-eda/main/data/preprocessed/adult.feather'

df = pd.read_feather(data_file)

In [None]:
# Preview the first rows of the loaded DataFrame
df.head()

# Matplotlib

![Matplotlib](../image/matplotlib.svg)

- [GitHub](https://github.com/matplotlib/matplotlib)
- [Documentation](https://matplotlib.org/stable/contents.html)

Matplotlib is a comprehensive library for creating static, animated, and interactive visualizations in Python.

It is the basic building block of higher-level Python plotting tools such as `seaborn` and the plotting API of `pandas`.

So, it is essential to know `matplotlib` if you're using Python for data analysis.

In this tutorial, I assume that you already have some experience with `matplotlib`.

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Set default figure size for all figures created below
plt.rcParams['figure.figsize'] = (10, 5)

In [None]:
# Display Matplotlib's current runtime configuration (including the figsize set above)
plt.rcParams

## Scatter Plot

In [None]:
# Basic scatter plot: "hours_per_week" on x, "fake_income" on y
plt.scatter(x=df['hours_per_week'], y=df['fake_income'])

In [None]:
# Let's add x and y labels with units
plt.scatter(x=df['hours_per_week'], y=df['fake_income'])

# The trailing semicolon keeps the text repr of the last call out of the cell output
plt.xlabel('Hours per Week [h]')
plt.ylabel('Fake Income [dollar]');

In [None]:
# Shrink the markers (s=5) and color each point by "age" with the 'winter' colormap.
# Available colormaps can be found here: https://matplotlib.org/stable/tutorials/colors/colormaps.html
plt.scatter(
    x=df['hours_per_week'],
    y=df['fake_income'],
    s=5,
    c=df['age'],
    cmap='winter',
)

plt.ylabel('Fake Income [dollar]')
plt.xlabel('Hours per Week [h]');

In [None]:
# Same age-colored scatter, now with a color bar so the colors can be read
plt.scatter(
    df['hours_per_week'],
    df['fake_income'],
    c=df['age'],
    cmap='winter',
    s=5,
)

plt.ylabel('Fake Income [dollar]')
plt.xlabel('Hours per Week [h]')

plt.colorbar();

In [None]:
# Add a dashed red horizontal line at the median "fake_income"
median_fake_income = df['fake_income'].median()

plt.scatter(
    x=df['hours_per_week'],
    y=df['fake_income'],
    c=df['age'],
    cmap='winter',
    s=5,
)

plt.xlabel('Hours per Week [h]')
plt.ylabel('Fake Income [dollar]')

plt.colorbar()

plt.axhline(y=median_fake_income, ls='--', color='r');

In [None]:
# Give the median line a legend entry that shows the formatted median value
median_fake_income = df['fake_income'].median()

plt.scatter(
    x=df['hours_per_week'],
    y=df['fake_income'],
    c=df['age'],
    cmap='winter',
    s=5,
)

plt.xlabel('Hours per Week [h]')
plt.ylabel('Fake Income [dollar]')

plt.colorbar()

plt.axhline(
    y=median_fake_income,
    label=f'Median Fake Income: ${median_fake_income:,.0f}',
    ls='--',
    color='r',
)
plt.legend();

In [None]:
# How about average income by "hours_per_week" bins?
# Cut "hours_per_week" into 5-hour bins and label every bin with its midpoint.
hpw_bins = np.arange(0, 101, 5)                    # bin edges 0, 5, ..., 100
hpw_labels = (hpw_bins[:-1] + hpw_bins[1:]) / 2    # midpoints 2.5, 7.5, ..., 97.5
df['hpw_cut'] = pd.cut(df['hours_per_week'], labels=hpw_labels, bins=hpw_bins)

df.head()[['hours_per_week', 'hpw_cut']]

In [None]:
# Mean "fake_income" within each "hours_per_week" bin.
# "hpw_cut" is a categorical column (it comes from `pd.cut`), so `observed=False`
# is stated explicitly: it keeps every bin in the result, silences the
# FutureWarning newer pandas emits for categorical groupers, and keeps the
# output stable if the default changes.
average_income_by_hours_per_week = (
    df
    .groupby('hpw_cut', observed=False)['fake_income']
    .mean()
)

average_income_by_hours_per_week.head()

In [None]:
# Line plot of the binned averages: bin labels on x, mean "fake_income" on y.
# The trailing semicolon hides the list-of-lines repr, like the other plotting cells.
plt.plot(
    average_income_by_hours_per_week.index,
    average_income_by_hours_per_week,
    marker='x'
);

In [None]:
# Let's plot the two figures altogether:
# the age-colored scatter, the median line and the binned-average line on one axes.
median_fake_income = df['fake_income'].median()

plt.scatter(
    x=df['hours_per_week'],
    y=df['fake_income'],
    c=df['age'],
    cmap='winter',
    s=5,
)

plt.xlabel('Hours per Week [h]')
plt.ylabel('Fake Income [dollar]')

plt.colorbar()

plt.axhline(
    y=median_fake_income,
    label=f'Median fake income: ${median_fake_income:,.0f}',
    ls='--',
    color='r',
)

plt.plot(
    average_income_by_hours_per_week.index,
    average_income_by_hours_per_week,
    label='Avg. Fake Income by Hours per Week Bins',
    color='orange',
    marker='X',
    lw=2,
)

plt.legend();

## Histogram

In [None]:
# Histogram of "fake_income" with Matplotlib's default binning
plt.hist(df['fake_income'])

In [None]:
# IPython help syntax: show the documentation of plt.hist
?plt.hist

In [None]:
# Attention! Histogram is heavily affected by selection of bins
# Three panels side by side: (subplot position, title, bins passed to plt.hist).
# `bins=None` is what plt.hist uses when no bins are given, i.e. the default binning.
panels = [
    (131, 'Default bins', None),
    (132, 'With bin size 5,000', np.arange(0, 400000, 5000)),
    (133, 'With bin size 100', np.arange(0, 400000, 100)),
]

plt.figure(figsize=(20, 5))

for position, panel_title, panel_bins in panels:
    plt.subplot(position)
    plt.title(panel_title)
    plt.hist(df['fake_income'], bins=panel_bins)

In [None]:
# Hours per Week by Income Level
# One semi-transparent, density-normalized histogram per income level on shared bins.
for income_level, legend_label in [('<=50K', 'Income <= 50K'), ('>50K', 'Income > 50K')]:
    plt.hist(
        df.loc[df['income'] == income_level, 'hours_per_week'],
        bins=hpw_bins,
        label=legend_label,
        density=True,
        alpha=0.7,
    )

plt.legend();

In [None]:
# It's difficult to compare more than two categories with a histogram:
# here one density-normalized histogram per "race" value is overlaid on the same axes.
for race in df['race'].unique():
    race_hours = df.loc[df['race'] == race, 'hours_per_week']
    plt.hist(race_hours, bins=hpw_bins, density=True, alpha=0.7, label=str(race))

plt.legend();

Solution?
- KDE (Kernel Density Estimation)
- ECDF (Empirical Cumulative Distribution Function)

Both will be tackled with `seaborn` below.

## Boxplot

In [None]:
# Single boxplot of "fake_income"
plt.boxplot(df['fake_income']);

In [None]:
# Single boxplot of "age"
plt.boxplot(df['age']);

What if we want to plot multiple boxplots of "fake_income" by "age_group"?

Matplotlib requires some preprocessing of the data for that.

In [None]:
# Split "fake_income" into one Series per age group (groups in sorted order),
# which is the list-of-sequences input passed to plt.boxplot afterwards.
age_groups = sorted(df['age_group'].unique())

fake_incomes_by_age_group = [
    df.loc[df['age_group'] == group_name, 'fake_income']
    for group_name in age_groups
]

# Peek at the first group: its label, then the first few incomes
print(age_groups[0])
fake_incomes_by_age_group[0].head()

In [None]:
# One box per age group, labelled with the group names.
# NOTE(review): recent Matplotlib releases appear to rename `labels` to `tick_labels` —
# confirm against the Matplotlib version this notebook targets before changing it.
plt.boxplot(fake_incomes_by_age_group, labels=age_groups);

Do we have an easier way than this?

Yes! With `pandas` and `seaborn`. 

Both will be introduced below.

# Pandas

![Pandas](../image/pandas.svg)

- [GitHub](https://github.com/pandas-dev/pandas)
- [Documentation](https://pandas.pydata.org/docs/index.html)
- [Visualization User Guide](https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html)

`pandas` is a Python package that provides fast, flexible, and expressive data structures
designed to make working with "relational" or "labeled" data both easy and intuitive.
It aims to be the fundamental high-level building block for doing practical, real world data analysis in Python.

> `pandas` is used mainly for manipulating DataFrame, but it also supports handy methods for creating decent looking plots with one line of code.

## Data Manipulation + Plotting

In [None]:
# Pure Matplotlib
# Step 1: aggregate with pandas. "hpw_cut" is categorical (built with `pd.cut`),
# so `observed=False` is explicit: all bins are kept and newer pandas versions
# do not emit a FutureWarning about the changing default.
average_income_by_hours_per_week = (
    df
    .groupby('hpw_cut', observed=False)['fake_income']
    .mean()
)

# Step 2: hand index and values to Matplotlib.
# The trailing semicolon hides the list-of-lines repr, like the other plotting cells.
plt.plot(
    average_income_by_hours_per_week.index,
    average_income_by_hours_per_week,
    marker='x'
);

In [None]:
# With Pandas
# Aggregate and plot in one method chain. The `.plot(...)` call was commented
# out, so this cell only displayed the Series and never drew the pandas
# counterpart of the Matplotlib plot; it is restored here.
# `observed=False` keeps all "hpw_cut" bins and avoids the categorical-grouper
# FutureWarning in newer pandas versions.
(
    df
    .groupby('hpw_cut', observed=False)
    ['fake_income']
    .mean()
    .plot(marker='x')
);

Supported Plot Type:

![Pandas Plot Kind](../image/pandas_plot_kind.png)

Source: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.plot.html

In [None]:
# The same data with a bar chart
# `observed=False` is explicit because "hpw_cut" is categorical: every bin keeps
# its bar and newer pandas versions do not warn about the changing default.
(
    df
    .groupby('hpw_cut', observed=False)
    ['fake_income']
    .mean()
    .plot(kind='bar')
);

In [None]:
# The other way to plot a bar chart: the `.plot.bar()` accessor instead of `kind='bar'`
# `observed=False` is explicit because "hpw_cut" is categorical: every bin keeps
# its bar and newer pandas versions do not warn about the changing default.
(
    df
    .groupby('hpw_cut', observed=False)
    ['fake_income']
    .mean()
    .plot
    .bar()
);

## Scatter Plot

In [None]:
# Pure Matplotlib
# Reference figure for the pandas comparison: age-colored scatter with a color bar.
plt.scatter(
    x=df['hours_per_week'],
    y=df['fake_income'],
    s=5,
    cmap='winter',
    c=df['age'],
)

plt.ylabel('Fake Income [dollar]')
plt.xlabel('Hours per Week [h]')

plt.colorbar();

In [None]:
# Pandas version
# Columns are referred to by name; the styling arguments match the Matplotlib call.
df.plot.scatter(
    x='hours_per_week',
    y='fake_income',
    s=5,
    c='age',
    cmap='winter',
);

The x-ticks do not appear with Pandas.

This seems to be a Pandas bug, according to [this](https://stackoverflow.com/questions/43121584/matplotlib-scatterplot-x-axis-labels).

A workaround is to create the axes explicitly and pass them in via `ax`:

In [None]:
# Create the figure and axes first, then let pandas draw onto that axes
fig, ax = plt.subplots()
df.plot.scatter(
    ax=ax,
    x='hours_per_week',
    y='fake_income',
    c='age',
    cmap='winter',
    s=5,
);

In [None]:
# Same pandas scatter on explicitly created axes ...
fig, ax = plt.subplots()
df.plot.scatter(
    ax=ax,
    x='hours_per_week',
    y='fake_income',
    c='age',
    cmap='winter',
    s=5,
)

# Fine tuning with Matplotlib
plt.ylabel('Fake Income [dollar]')
plt.xlabel('Hours per Week [h]');

## Histogram

In [None]:
# Hours per Week by Income Level: Pure Matplotlib
# Both histograms use the same bins, density normalization and transparency.
low_income_hours = df[df['income'] == '<=50K']['hours_per_week']
high_income_hours = df[df['income'] == '>50K']['hours_per_week']

plt.hist(low_income_hours, label='Income <= 50K',
         bins=hpw_bins, density=True, alpha=0.7)
plt.hist(high_income_hours, label='Income > 50K',
         bins=hpw_bins, density=True, alpha=0.7)

plt.legend();

In [None]:
# Pandas version: group "hours_per_week" by "income" and call the histogram
# plotting method on the grouped column, with the same bins/density/alpha as above
(
    df
    .groupby('income')['hours_per_week']
    .plot
    .hist(bins=hpw_bins, density=True, alpha=0.7)
);

In [None]:
# Grouped pandas histogram of "hours_per_week" by "income"
hours_by_income = df.groupby('income')['hours_per_week']
hours_by_income.plot.hist(bins=hpw_bins, density=True, alpha=0.7)

# Fine tuning with Matplotlib
plt.legend();

## Boxplot

In [None]:
# Pure Matplotlib
# Build one "fake_income" Series per age group (sorted), then draw one box per group.
age_groups = sorted(df['age_group'].unique())

fake_incomes_by_age_group = [
    df[df['age_group'] == group_name]['fake_income']
    for group_name in age_groups
]

plt.boxplot(fake_incomes_by_age_group, labels=age_groups);

In [None]:
# Pandas version: no manual splitting, the grouping column is passed via `by`
df.boxplot(column='fake_income', by='age_group');

# Seaborn

![Seaborn](../image/seaborn.svg)

- [GitHub](https://github.com/mwaskom/seaborn)
- [Documentation](https://seaborn.pydata.org/index.html)

`Seaborn` is a library for making statistical graphics in Python.

It builds on top of `matplotlib` and integrates closely with `pandas` data structures.

`Seaborn` helps you explore and understand your data.
Its plotting functions operate on `dataframes` and `arrays` containing whole datasets
and internally perform the necessary semantic mapping and statistical aggregation to produce informative plots.
Its dataset-oriented, declarative API lets you focus on what the different elements of your plots mean,
rather than on the details of how to draw them.

> This is my go-to library for starting data visualizations.

[Example Gallery](https://seaborn.pydata.org/examples/index.html)

![Example Gallery](../image/seaborn_gallery.png)

## [Figure Style](https://seaborn.pydata.org/tutorial/aesthetics.html#seaborn-figure-styles)

![Seaborn Figure Styles](../image/seaborn_figure_styles.png)

> Seaborn style has an impact on Matplotlib figures too.
>
> Actually, it modifies Matplotlib's default figure parameters for styling.

In [None]:
import seaborn as sns

In [None]:
# Draw the same histogram once per style so the styles can be compared
styles = ['default', 'darkgrid', 'whitegrid', 'dark', 'white', 'ticks']

for style in styles:
    # 'default' is handled separately: it restores the original settings
    # instead of selecting a seaborn style
    if style != 'default':
        sns.set_style(style)
    else:
        sns.reset_orig()  # Restore Matplotlib default

    plt.figure(figsize=(8, 2))
    plt.hist(df['fake_income'], bins=range(0, 400000, 10000))
    plt.title(style, fontsize=20)

In [None]:
# I personally prefer the "whitegrid" style
# (the style loop above ended on 'ticks', so this selects the style for the following cells)
sns.set_style('whitegrid')

## Functions

![Seaborn Functions](../image/seaborn_functions.png)

- Figure-level functions: `relplot`, `displot`, `catplot`
  - Interface with `matplotlib` through a seaborn object, usually a `FacetGrid`, that manages the figure
- Axis-level functions: `scatterplot`, `lineplot`, `histplot`, `kdeplot`, ...
  - Plot data onto a single `matplotlib.pyplot.Axes` object

In [None]:
# One figure holding two axes, each titled to show which object is which
fig, axes = plt.subplots(nrows=1, ncols=2)

left_ax, right_ax = axes
left_ax.set_title('This is "ax_0"')
right_ax.set_title('This is "ax_1"')

plt.suptitle('This is "Figure"', fontsize=20);

In [None]:
# Functions within a module share a lot of underlying code and offer similar features.
# They are designed to facilitate switching between different visual representations as you explore a dataset.
# For instance: the two calls below take identical arguments; only the function changes.

plt.subplot(1, 2, 1)
sns.histplot(data=df, x='age', hue='income', multiple='stack')

plt.subplot(1, 2, 2)
sns.kdeplot(data=df, x='age', hue='income', multiple='stack');

## relplot

### scatterplot

In [None]:
# Pure Matplotlib
# Reference figure for the seaborn comparison: age-colored scatter with a color bar.
plt.scatter(
    df['hours_per_week'],
    df['fake_income'],
    c=df['age'],
    s=5,
    cmap='winter',
)

plt.xlabel('Hours per Week [h]')
plt.ylabel('Fake Income [dollar]')

plt.colorbar();

In [None]:
# Seaborn version of the scatter above: x/y are column names looked up in `data`;
# `c`, `cmap` and `s` carry the same values as in the Matplotlib call
sns.scatterplot(data=df, x='hours_per_week', y='fake_income',
                c=df['age'], cmap='winter', s=5);

In [None]:
# Seaborn scatterplot is more suitable for representing
# categorical variables with hue and style
# (here: `hue` is set to "race" and `style` to "sex")
sns.scatterplot(data=df, x='hours_per_week', y='fake_income',
                hue='race', style='sex', s=30, alpha=0.7);

In [None]:
# Same hue/style scatter as in the previous cell
sns.scatterplot(
    data=df,
    x='hours_per_week',
    y='fake_income',
    hue='race',
    style='sex',
    alpha=0.7,
    s=30,
)

# We can always fine-tune the figure with Matplotlib
plt.ylabel('Fake Income [dollar]')
plt.xlabel('Hours per Week [h]');

### lineplot

In [None]:
# Line plot of "fake_income" over raw "hours_per_week", with hue="race" and style="sex"
sns.lineplot(data=df, x='hours_per_week', y='fake_income',
             hue='race', style='sex');

In [None]:
# Same plot with the binned "hpw_cut" column (created earlier with pd.cut) on the x-axis
sns.lineplot(data=df, x='hpw_cut', y='fake_income',
             hue='race', style='sex');

In [None]:
# Same plot with "age_group" on the x-axis
sns.lineplot(data=df, x='age_group', y='fake_income',
             hue='race', style='sex');

### relplot (Figure-level)

In [None]:
# Axes-level call kept (commented out) for comparison with the figure-level call below:
# sns.lineplot(data=df, x='age_group', y='fake_income',
#              hue='race', style='sex');
# relplot with kind='line': "sex" moves from `style` to `col`
sns.relplot(data=df, x='age_group', y='fake_income',
            hue='race', kind='line', col='sex');

In [None]:
# Hue by sex and col by race
# (`height=3` is passed to keep the individual panels small)
sns.relplot(data=df, x='age_group', y='fake_income',
            hue='sex', kind='line', col='race', height=3);


In [None]:
# Keep the object returned by relplot so the figure can be adjusted afterwards
g = sns.relplot(
    data=df,
    x='age_group',
    y='fake_income',
    hue='sex',
    col='race',
    kind='line',
    height=3,
)

# Fine tuning with FacetGrid methods
g.set_xticklabels(rotation=30);

In [None]:
# Adjust col_wrap
# (`col_wrap=3` with a larger `height=5` than in the previous cells)
sns.relplot(data=df, x='age_group', y='fake_income',
            hue='sex', kind='line', col='race', height=5, col_wrap=3);

In [None]:
# Facet on two variables at once: col="race" and row="sex" (no hue)
sns.relplot(data=df, x='age_group', y='fake_income',
            kind='line', col='race', row='sex');

In [None]:
# To scatterplot
# (only `kind` changes from 'line' to 'scatter'; the faceting arguments stay the same)
sns.relplot(data=df, x='age_group', y='fake_income',
            kind='scatter', col='race', row='sex');

In [None]:
# x='age' instead of 'age_group'
# (everything else is unchanged from the previous cell)
sns.relplot(data=df, x='age', y='fake_income',
            kind='scatter', col='race', row='sex');

In [None]:
# Combine three grouping variables: col="race", row="income", hue="sex"
sns.relplot(data=df, x='age', y='fake_income',
            kind='scatter', col='race', hue='sex', row='income', height=3.5);

## displot

### histplot

In [None]:
# Hours per Week by Income Level: Pure Matplotlib
# Reference figure for the seaborn comparison: two overlaid density histograms.
income_panels = (
    ('<=50K', 'Income <= 50K'),
    ('>50K', 'Income > 50K'),
)
for income_level, legend_label in income_panels:
    hours = df.loc[df['income'] == income_level, 'hours_per_week']
    plt.hist(hours, bins=hpw_bins, label=legend_label, density=True, alpha=0.7)

plt.legend();

In [None]:
# Histogram with Seaborn
# A single call with hue="income" replaces the two plt.hist calls;
# the bins and alpha are the same as in the Matplotlib version
sns.histplot(data=df, x='hours_per_week', hue='income', bins=hpw_bins,
             stat='density', alpha=0.7, common_norm=False);

### kdeplot

In [None]:
# KDE of "hours_per_week", one curve per "income" value
sns.kdeplot(data=df, x='hours_per_week', hue='income');

In [None]:
# Plot hist and kde altogether
# Both calls draw onto the same axes, so the KDE curves overlay the histogram.
sns.histplot(
    data=df,
    x='hours_per_week',
    hue='income',
    bins=hpw_bins,
    stat='density',
    common_norm=False,
    alpha=0.7,
)

sns.kdeplot(data=df, hue='income', x='hours_per_week');

In [None]:
# Bins should be adjusted as well as "bw_adjust" of the KDE plot
# Bin edges are shifted by 2.5 (half of the 5-hour bin width used for hpw_bins),
# and the KDE is drawn with bw_adjust=0.7
sns.histplot(data=df, x='hours_per_week', hue='income', bins=hpw_bins - 2.5,
             stat='density', alpha=0.7, common_norm=False)

sns.kdeplot(data=df, x='hours_per_week', hue='income', bw_adjust=0.7);

### ecdfplot

In [None]:
# ECDF of "hours_per_week", one curve per "income" value
sns.ecdfplot(data=df, x='hours_per_week', hue='income');

In [None]:
# Three ways to compare "hours_per_week" across "race", side by side.
# The two histogram panels share these arguments; the second one adds `multiple='fill'`.
race_hist_kws = dict(
    data=df,
    x='hours_per_week',
    hue='race',
    bins=hpw_bins,
    stat='density',
    alpha=0.7,
    common_norm=False,
)

plt.figure(figsize=(20, 5))

plt.subplot(1, 3, 1)
plt.title('Difficult to compare\nmany categories with a histogram', fontsize=15)
sns.histplot(**race_hist_kws)

plt.subplot(1, 3, 2)
plt.title('A bit better by giving "multiple=fill"', fontsize=15)
sns.histplot(**race_hist_kws, multiple='fill')  # Modify multiple option to 'fill'

plt.subplot(1, 3, 3)
plt.title('ecdfplot is the best for a comparison', fontsize=15)
sns.ecdfplot(data=df, x='hours_per_week', hue='race');

### displot (Figure-level)

In [None]:
# Axes-level call kept (commented out) for comparison with the figure-level call below:
# sns.histplot(data=df, x='hours_per_week', hue='income', bins=hpw_bins,
#              stat='density', alpha=0.7, common_norm=False);
sns.displot(
    data=df,
    x='hours_per_week',
    hue='income',
    kind='hist',
    bins=hpw_bins,
    stat='density',
    alpha=0.7,
    common_norm=False,
);

In [None]:
# Same displot call, additionally faceted with col="race" (col_wrap=3, height=3)
sns.displot(data=df, x='hours_per_week', hue='income', bins=hpw_bins,
            stat='density', alpha=0.7, common_norm=False, kind='hist',
            col='race', col_wrap=3, height=3);

## catplot

### boxplot

In [None]:
# Pure Matplotlib
# Reference figure for the seaborn comparison: one box per (sorted) age group.
age_groups = sorted(df['age_group'].unique())

fake_incomes_by_age_group = [
    df.loc[df['age_group'] == group_name, 'fake_income']
    for group_name in age_groups
]

plt.boxplot(fake_incomes_by_age_group, labels=age_groups);

In [None]:
# Pandas version: the grouping column is passed via `by`
df.boxplot(column='fake_income', by='age_group');

In [None]:
# Seaborn version: grouping column on x, value column on y
sns.boxplot(data=df, x='age_group', y='fake_income');

In [None]:
# Seaborn version (+ coloring by "sex")
# Only `hue='sex'` is added to the previous call
sns.boxplot(data=df, x='age_group', y='fake_income', hue='sex');

### violinplot

In [None]:
# Violin plot with the same x / y / hue arguments as the boxplot above
sns.violinplot(data=df, x='age_group', y='fake_income', hue='sex');

In [None]:
# Same violin plot with `split=True` added
sns.violinplot(data=df, x='age_group', y='fake_income', hue='sex',
               split=True);

### boxenplot

In [None]:
# Boxen plot with the same x / y / hue arguments
sns.boxenplot(data=df, x='age_group', y='fake_income', hue='sex');

### pointplot

In [None]:
# Point plot with the same x / y / hue arguments
sns.pointplot(data=df, x='age_group', y='fake_income', hue='sex');

### barplot

In [None]:
# Bar plot with the same x / y / hue arguments
sns.barplot(data=df, x='age_group', y='fake_income', hue='sex');

### stripplot

In [None]:
# Strip plot of "age" per "education" level, split by "sex" (dodge=True);
# small, semi-transparent markers (size=3, alpha=0.3)
sns.stripplot(data=df, x='age', y='education', hue='sex',
              dodge=True, alpha=0.3, size=3);

In [None]:
# Same strip plot with "fake_income" on the x-axis instead of "age"
sns.stripplot(data=df, x='fake_income', y='education', hue='sex',
              dodge=True, alpha=0.3, size=3);

In [None]:
# Same strip plot with "race" on the y-axis instead of "education"
sns.stripplot(data=df, x='fake_income', y='race', hue='sex',
              dodge=True, alpha=0.3, size=3);

### catplot (Figure-level)

In [None]:
# Axes-level call kept (commented out) for comparison with the figure-level call below:
# sns.boxplot(data=df, x='age_group', y='fake_income', hue='sex')

# catplot with kind='box' takes the same data / x / y / hue arguments
sns.catplot(data=df, x='age_group', y='fake_income', hue='sex', kind='box');

In [None]:
# Same catplot call, additionally faceted with col="race" (col_wrap=3, height=4)
sns.catplot(data=df, x='age_group', y='fake_income', hue='sex', kind='box',
            col='race', col_wrap=3, height=4);

## Heatmap

In [None]:
# Average fake_income by education and race?
# Pivot into a table: "education" as rows, "race" as columns, mean "fake_income" as values.

avg_fake_income = df.pivot_table(
    index='education',
    columns='race',
    values='fake_income',
    aggfunc='mean',
)
avg_fake_income

In [None]:
# Heatmap of the averages; each cell is annotated with its value, formatted without decimals
plt.figure(figsize=(10, 8))
sns.heatmap(avg_fake_income, cmap='Blues', annot=True, fmt='.0f');

In [None]:
# Same pivot as for the averages, but counting the "fake_income" entries per cell
count_fake_income = df.pivot_table(
    index='education',
    columns='race',
    values='fake_income',
    aggfunc='count',
)
count_fake_income

In [None]:
# Heatmap of the counts; each cell is annotated with its value, formatted without decimals
plt.figure(figsize=(10, 8))
sns.heatmap(count_fake_income, cmap='Blues', annot=True, fmt='.0f');

## Joint Plot

Draw a plot of two variables with bivariate and univariate graphs.

Supported kinds are { “scatter” | “kde” | “hist” | “hex” | “reg” | “resid” }.

In [None]:
# Joint plot of "age" vs "fake_income" with hue="sex"; no `kind` is given here
sns.jointplot(data=df, x='age', y='fake_income', hue='sex', alpha=0.3, s=10);

In [None]:
# Same variables with kind='hist'
sns.jointplot(data=df, x='age', y='fake_income', kind='hist');

## Pair Plot

Plot pairwise relationships in a dataset.

Supported `kind`s are {‘scatter’, ‘kde’, ‘hist’, ‘reg’}.

Supported `diag_kind`s are {‘auto’, ‘hist’, ‘kde’, None}.

In [None]:
# Pairwise plots of four columns, colored by "sex";
# `plot_kws` carries the marker transparency and size settings.
sns.pairplot(
    data=df,
    vars=['age', 'capital_gain', 'hours_per_week', 'fake_income'],
    hue='sex',
    plot_kws={'alpha': 0.3, 's': 10},
);