In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which data files are present in the input directory.
input_listing = os.listdir("../input")
print(input_listing)

# Any results you write to the current directory are saved as output.

In [3]:
%%javascript
// Replace OutputArea's _should_scroll hook with a function that always returns false,
// whatever line count it is given.
// NOTE(review): presumably this stops long cell outputs from being placed in a
// scrollable box — confirm against the notebook front-end in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

# Purpose of this notebook

Organize the drawing methods (APIs) of popular Python data-visualization libraries — `matplotlib`, `pandas` and `seaborn` — and show the usage of each type of graph.

reference:
- [Matplotlib official tutorial for Artists](https://matplotlib.org/users/artists.html)
- [Pandas official tutorial for Visualization](https://pandas.pydata.org/pandas-docs/stable/visualization.html)

# Dataset
**Iris dataset** has four features - 
- sepal_length
- sepal_width
- petal_length
- petal_width

and the classification result - **species**

In [4]:
# Load the Iris table and give its five columns readable names.
# NOTE(review): read_csv treats the first row of the file as a header by default; if
# iris.data.csv has no header row, the first sample is lost when the columns are
# renamed below — confirm against the file (header=None would keep it).
iris_df = pd.read_csv('../input/iris-dataset/iris.data.csv')
iris_df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
# astype returns a new Series rather than converting in place, so the result has to be
# assigned back for the column to actually become categorical.
iris_df['species'] = iris_df['species'].astype('category')
iris_df.head()

In [5]:
# Number of samples per species.
iris_df["species"].value_counts()

To create multilevel categorical data, add size as new column

In [7]:
# Per-species mean and variance of sepal_length, each a one-column DataFrame indexed by species.
sepal_length_by_species = iris_df.loc[:, ["species", "sepal_length"]].groupby("species")
specie_speal_mean = sepal_length_by_species.mean()
specie_speal_var = sepal_length_by_species.var()

In [20]:
# Score every flower by how far its sepal_length lies from the mean of its own species.
# The dtype is given explicitly: a Series built from an index alone has no data to infer
# a dtype from (DeprecationWarning on pandas 1.x, object dtype on pandas 2.x).
flower_size = pd.Series(index=iris_df.index, dtype=float)
for species in ["Iris-virginica", "Iris-versicolor", "Iris-setosa"]:
    species_mask = iris_df.species == species
    centered = iris_df.loc[species_mask, "sepal_length"] - specie_speal_mean.loc[species, "sepal_length"]
    # NOTE(review): the deviation is scaled by the variance, not the standard deviation,
    # so this is not a conventional z-score — confirm that is intended.
    flower_size.loc[species_mask] = centered / specie_speal_var.loc[species, "sepal_length"]

In [23]:
# Bucket the per-flower score into an ordered S < M < L categorical column.
# pd.cut uses right-closed bins, so the buckets are
#   S: score <= -1.5,   M: -1.5 < score <= 1.5,   L: score > 1.5
# Every non-missing score therefore gets a label, including values exactly at -1.5 or 1.5.
# The result is already categorical, so no separate astype("category") step is needed.
iris_df["flower_size"] = pd.cut(
    flower_size.astype(float),  # tolerate an object-dtype score Series
    bins=[-np.inf, -1.5, 1.5, np.inf],
    labels=["S", "M", "L"],
)
iris_df.head()

# Libraries

## matplotlib

Matplotlib is an *object-oriented* drawing library, which means a figure is composed of several objects, such as axes, labels, lines, grids, etc. Matplotlib also provides an easy-to-use, MATLAB-like module, *pyplot*, which automatically constructs the required figure objects.

Matplotlib has three core layers:
- **`backend_bases.FigureCanvas`**: the canvas for drawing
- **`backend_bases.Renderer`**: describes how to draw something on the `FigureCanvas`
- **`artist.Artist`**: describes objects such as axes, lines, and rectangles, which tell the `Renderer` what should be drawn on the `FigureCanvas`

`Artist` can be divided into two categories:
- **primitives**: basic component, such as `Line2D`, `Rectangle`, `Text` ....
- **container**: the place to put primitives, such as `Figure`(whole drawing area), `Axes`(a graph). Containers may also contain some default primitives. For example `Axis` contain some `Text` and `Line` objects

Hence, the basic step for drawing is:
1. initialize `Figure`
2. initialize `Axes`
3. draw or set something by or to the `Axes`

## matplotlib.pyplot

**pyplot** is a module of convenience functions that act on the current figure and axes, hence
```python
plt.title('Title') # set title to the axes
```
**artist object** is the object to describe what to draw in the canvas, hence
```python
axes.title # get the 'title' element of axes, which is instance of Text Class
axes.title.set_text('Title') # set text to the 'title' element of axes
axes.set_title('Title') # set text to the 'title' element of axes
```

In [5]:
# initialize a Figure with figure size (8, 2);
# if this line is omitted, pyplot automatically initializes a default figure
fig = plt.figure(figsize=(8, 2))
print('Figure:', fig)

# initialize an Axes, located in the 1st position of the "current figure" split into 1 row x 2 cols,
# and make this Axes the current axes
ax1 = plt.subplot(1, 2, 1)
print('Axes1:', ax1)
# get the current axes with gca (get current axes)
print('Current Axes is ax1:', plt.gca() == ax1)

# draw a line on the "current axes"
# plt.sca(ax1): use "sca" (set current axes) to plot on other axes
plt.plot([0, 1], [0, 1], 'r-')

# use the Figure object directly to initialize a new Axes
ax2 = fig.add_subplot(1, 2, 2)
print('Axes2:', ax2)
print('Current Axes is ax2:', plt.gca() == ax2)

# draw directly through the Axes object
ax2.plot([0, 1], [1, 0], 'b--')

plt.show()

## pandas.DataFrame.plot

`pandas.DataFrame.plot` is a simplified wrapper around `matplotlib.pyplot`. `pandas.DataFrame.plot` is more convenient for plotting several columns of data in a single graph.
```python
pandas.DataFrame.plot(
    x=None, # the column name or position of x data, None means use index as x. Must be set in 'scatter', 'hexbin'
    y=None, # the column name or position of y data. None means use all column. Must be set in 'scatter', 'hexbin' and 'pie'
    kind='line', # See Below
    ax=None, # Axes object
    subplots=False, # Make separate subplots for each column
    figsize=(w, h), # plt.figure(figsize=(w, h))
    use_index=True, # Use index as xticks
    title='title',  # plt.title('title') = axe.set_title('title')
    grid=False,  # plt.grid() = axe.grid()
    legend=True, # plt.legend() = axe.legend()
    logx=False,  # plt.semilogx(x, y) = axe.semilogx(x, y)
    logy=False, # plt.semilogy(x, y) = axe.semilogy(x, y)
    loglog=False, # plt.loglog(x, y) = axe.loglog(x, y) both x, y axis is log
    xticks=[], # plt.xticks([]) = axe.set_xticks([])
    yticks=[], # plt.yticks([]) = axe.set_yticks([])
    xlim=(x_min, x_max), # plt.xlim(x_min, x_max) = axes.set_xlim(x_min, x_max)
    ylim=(y_min, y_max), # plt.ylim(y_min, y_max) = axes.set_ylim(y_min, y_max)
)
```

Example to choose different kind of graph
```python
# line(default)
df.plot(kind='line') = plt.plot(df.index, df.values) = axe.plot(df.index, df.values)
# hist
df.plot(kind='hist') = df.plot.hist() = plt.hist(df.values) = axe.hist(df.values)
```

In [6]:
# Draw the same line plot twice: once through pandas, once through matplotlib directly.
plt.figure(figsize=(20, 5))
# pandas
ax1 = plt.subplot(121)
iris_df.plot(ax=ax1, title='plot by pandas.DataFrame.plot')
# matplotlib
ax2 = plt.subplot(122)
# Only the first four columns are drawn, so the legend is built from those same four
# names; passing every column name would hand legend() more labels than plotted lines.
feature_df = iris_df.iloc[:, :4]
ax2.plot(feature_df.index, feature_df)
ax2.set_title('plot by matplotlib')
ax2.legend(feature_df.columns)

plt.show()

## pandas.plotting

## seaborn

**seaborn** is another visualization library that provides a high-level API on top of matplotlib. seaborn is useful for categorical data and can draw more complex graphs that include some analytic results.

# Graph
There are two type of features(variables):
- **continuous**: cost, distance, age
- **categorical**: species, age bracket

we can divide graph to several types depends on the usage:
- For data analysis
    - **univariate**: reveal the property of a single continuous/discrete variable, e.g. the distribution of the length of the sepal
    - **bivariate  with one categorical variable**: reveal the property of single variable corresponding to some categories
    - **bivariate**: reveal the relationship between two variables
    - **trivariate with one categorical variable**:
    - **trivariate with two categorical variables**:
    - **multivariate**:
    - **time sequence**: the variable varies with timestamp or progress
- For model analysis
    - **confusion matrix**: 

## Univariate

### Distribution
plot the approximate distribution(by gaussian kernel) of specific variables
- `pd.DataFrame.plot(kind='kde', y='')`
- `pd.DataFrame.plot(kind='density', y='')`: the same as above
- `pd.DataFrame.plot.kde(y='')`
- `sns.kdeplot(kernel='gau')`: could set more details about approximation method
- `sns.distplot(kde=True, hist=True, rug=False)`: combines a KDE curve, a histogram and a rug plot

In [7]:
# Two side-by-side density estimates of sepal_length.
fig = plt.figure(figsize=(20, 5))

# left: pandas' KDE plot
ax1 = fig.add_subplot(1, 2, 1)
ax1.set_title('df.DataFrame.plot.kde')
iris_df.plot.kde(ax=ax1, y='sepal_length')

# right: seaborn's histogram + KDE with rug marks
# NOTE(review): distplot is deprecated in newer seaborn releases — confirm the installed version.
ax2 = fig.add_subplot(1, 2, 2)
ax2.set_title('sns.distplot')
sepal_length = iris_df['sepal_length']
sns.distplot(sepal_length, rug=True, ax=ax2)

### Histogram
histogram is another method to present the distribution of data, which divides data to several intervals or bins.
- `plt.hist(histtype='bar')`
- `matplotlib.axes.Axes.hist(histtype='bar')`
- `pd.DataFrame.plot(kind='hist', y='')`
- `pd.DataFrame.plot.hist(y='')`
- `pd.DataFrame.hist(column='')`
- draw the data by `np.histogram`: useful when combining multiple data sets

In [8]:
# Four ways to draw a 15-bin histogram of sepal_length, in the first four slots of a 3x2 grid.
fig = plt.figure(figsize=(20, 10))
sepal_length = iris_df['sepal_length']

# 1) Axes.hist with filled bars; ticks are placed on the bin edges it returns
ax1 = fig.add_subplot(3, 2, 1)
n_in_bins, bins_loc, _ = ax1.hist(x=sepal_length, bins=15, label=['sepal_length'], histtype='bar')
ax1.set_xticks(bins_loc)
ax1.legend()
ax1.set_title("plt.hist(histtype='bar')")

# 2) Axes.hist drawn as an unfilled step outline
ax2 = fig.add_subplot(3, 2, 2)
_, bins_loc, _ = ax2.hist(x=sepal_length, bins=15, label=['sepal_length'], histtype='step')
ax2.legend()
ax2.set_title("plt.hist(histtype='step')")

# 3) counts computed by np.histogram, drawn as bars anchored on each bin's left edge
ax3 = fig.add_subplot(3, 2, 3)
n_in_bins, bins_loc = np.histogram(sepal_length, bins=15)
bin_width = bins_loc[1] - bins_loc[0]
plt.bar(bins_loc[:-1], n_in_bins, width=bin_width, align='edge', label='sepal_length')
plt.xticks(bins_loc)
plt.legend()
plt.title('np.histogram & plt.bar')

# 4) the same counts drawn as a step line
ax4 = fig.add_subplot(3, 2, 4)
left_edges = bins_loc[:-1]
plt.step(left_edges, n_in_bins, where='post', label='sepal_length')
plt.xticks(bins_loc)
plt.legend()
plt.title('np.histogram & plt.step')

### count

In [9]:
# Bar chart of the number of rows per species.
sns.countplot(data=iris_df, x='species')

### Box
Box graph reveals the quartiles（四分位數） of the data. 
1. The box extends from Q1 to Q3, with a line at the median(Q2).
2. The top and bottom lines (whiskers) stand for the approximate range of the data, calculated from Q1 and Q3. ([reference](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.boxplot.html))
3. The data points above the top line or below the bottom line, drawn as circle markers, are outliers.


- `plt.boxplot()`
- `matplotlib.axes.Axes.boxplot()`
- `pd.DataFrame.plot(kind='box')`
- `pd.DataFrame.plot.box()`
- `pd.DataFrame.boxplot()`
- `sns.boxplot()`

In [10]:
# Box plots of the four measurements: matplotlib on the left, pandas on the right.
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

plt.figure(figsize=(20, 5))
ax1 = plt.subplot(221)
ax1.set_title('plt.boxplot')
# Select the measurement columns by name. Slicing off "the last column" only works while
# species is the final column; once flower_size has been appended, that slice would pull
# in the species strings and no longer match the four labels.
plt.boxplot(iris_df[feature_cols].values, labels=feature_cols)

ax2 = plt.subplot(222)
ax2.set_title('df.DataFrame.plot.box')
iris_df.plot.box(ax=ax2)

## Bivariate with One Categorical Variable

### Distribution

In [11]:
# One KDE curve of sepal_length per species, all drawn on a shared axes.
fig = plt.figure(figsize=(10, 5))
ax = fig.gca()
plt.xlabel('sepal_length')
species_groups = iris_df.groupby('species')
for specie, data in species_groups:
    data.plot.kde(ax=ax, y='sepal_length', label=specie)

### Histogram

In [12]:
# One histogram panel per species, laid out in a single row with a shared x axis.
# The figure size is passed to pandas directly. A grouped histogram needs several axes,
# so handing it one pre-made Axes makes pandas warn and clear that figure.
iris_df.hist(column='sepal_length', by='species', alpha=0.5, sharex=True, layout=(1, 3), figsize=(20, 5))
plt.show()

In [13]:
# Overlay one semi-transparent sepal_length histogram per species on a single axes.
fig, ax = plt.subplots(figsize=(10, 5))
species_groups = iris_df.groupby('species')
for specie, data in species_groups:
    data.hist(ax=ax, column='sepal_length', label=specie, alpha=0.5)
ax.legend()

### Box

In [14]:
# Box plot of sepal_length for each species.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=iris_df, x='species', y='sepal_length', ax=ax)

### Bar

In [15]:
# Bar plot of sepal_length for each species.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=iris_df, x='species', y='sepal_length', ax=ax)

### Others
- `stripplot`:
- `swarmplot`:
- `violinplot`:
- `lvplot`: letter-value plot; the boxes indicate quartiles, octiles and so on

In [16]:
# Four seaborn views of sepal_length per species on a 2x2 grid.
# seaborn renamed lvplot to boxenplot. Use the new name when the installed version has it
# and fall back to lvplot otherwise, so the cell runs on either side of the rename.
letter_value_plot = getattr(sns, 'boxenplot', None) or sns.lvplot

plt.figure(figsize=(20, 12))
ax1 = plt.subplot(221)
ax1.set_title('sns.stripplot')
sns.stripplot(x='species', y='sepal_length', data=iris_df, ax=ax1)

ax2 = plt.subplot(222)
ax2.set_title('sns.swarmplot')
sns.swarmplot(x='species', y='sepal_length', data=iris_df, ax=ax2)

ax3 = plt.subplot(223)
ax3.set_title('sns.violinplot')
sns.violinplot(x='species', y='sepal_length', data=iris_df, ax=ax3)

ax4 = plt.subplot(224)
ax4.set_title('sns.lvplot')
letter_value_plot(x='species', y='sepal_length', data=iris_df, ax=ax4)

## Bivariate

### Scatter
- reveal the relationship between two features

In [17]:
# Scatter plot of sepal_width against sepal_length for all samples.
fig, ax = plt.subplots(figsize=(10, 4))
iris_df.plot.scatter(ax=ax, x='sepal_length', y='sepal_width')

In [18]:
# Scatter plus fitted regression line for the Iris-setosa rows only, on fixed axis limits.
fig, ax = plt.subplots(figsize=(10, 4))
ax.set_xlim(4, 8)
ax.set_ylim(2, 4.5)
setosa_df = iris_df[iris_df.species == 'Iris-setosa']
sns.regplot(data=setosa_df, x='sepal_length', y='sepal_width', color='red', label='Iris-setosa', ax=ax)
ax.legend()

In [19]:
# Joint plot of sepal_length vs sepal_width.
sns.jointplot(data=iris_df, x='sepal_length', y='sepal_width')

### distribution

In [20]:
# Two-dimensional KDE of petal_length vs petal_width: contour lines (left) and shaded (right).
# NOTE(review): the two data Series are passed positionally and `shade` is used, which matches
# the older seaborn call signature — confirm against the installed seaborn version.
fig = plt.figure(figsize=(20, 5))
petal_length = iris_df['petal_length']
petal_width = iris_df['petal_width']

ax1 = fig.add_subplot(1, 2, 1)
sns.kdeplot(petal_length, petal_width, cbar=True, ax=ax1)

ax2 = fig.add_subplot(1, 2, 2)
sns.kdeplot(petal_length, petal_width, shade=True, cbar=True, ax=ax2)

### Correlation matrix

In [21]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
# Non-numeric columns (species, flower_size) are dropped explicitly before corr():
# recent pandas versions raise on string columns instead of silently skipping them.
sns.heatmap(iris_df.select_dtypes(include='number').corr(), annot=True, cmap='YlGnBu')

In [22]:
from pandas.plotting import scatter_matrix
# Grid of pairwise scatter plots over the numeric columns.
# The figure size is passed to scatter_matrix directly. The grid needs many axes, so
# handing it one pre-made Axes makes pandas warn and clear that figure.
_ = scatter_matrix(iris_df, figsize=(10, 10))

## Trivariate with One Categorical Variable

### Scatter

In [23]:
# Sepal scatter coloured by species, drawn once through pandas and once through matplotlib.
plt.figure(figsize=(20, 4))

# species name -> marker colour (iteration order decides the drawing/legend order)
color_dict = {
    'Iris-versicolor': 'blue',
    'Iris-virginica': 'yellow',
    'Iris-setosa': 'red',
}

# pandas
ax1 = plt.subplot(1, 2, 1)
for specie in color_dict:
    color = color_dict[specie]
    one_species = iris_df[iris_df['species'] == specie]
    one_species.plot.scatter(x='sepal_length', y='sepal_width', ax=ax1, color=color, label=specie)

# matplotlib
ax2 = plt.subplot(1, 2, 2)
for specie, color in color_dict.items():
    specie_df = iris_df[iris_df['species'] == specie]
    ax2.scatter(specie_df['sepal_length'], specie_df['sepal_width'], color=color, label=specie)
ax2.legend()
ax2.set_xlabel('sepal_length')
ax2.set_ylabel('sepal_width')

plt.show()

In [24]:
# Regression plot of sepal_width on sepal_length, one colour per species.
sns.lmplot(data=iris_df, x='sepal_length', y='sepal_width', hue='species')

In [25]:
# Same regression plot, with each species additionally placed in its own column.
sns.lmplot(data=iris_df, x='sepal_length', y='sepal_width', hue='species', col='species')

In [26]:
# Pairwise plots of the dataset's variables, coloured by species.
sns.pairplot(data=iris_df, hue="species")

## pointplot
pointplot is used to reveal the relationship when there are multi-level categorical data

In [25]:
# sepal_length per flower_size bucket (ordered S, M, L), one line per species.
size_order = ["S", "M", "L"]
sns.pointplot(data=iris_df, x="flower_size", y="sepal_length", hue='species', order=size_order)