In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Agenda
## Mastering Matplotlib
<ol>
    <li>
        <a href="#1">Visualization with Matplotlib</a>
    </li>
    <li>
        <a href="#2">Introducing the mechanism of action competition</a>        
    </li>
    <li>
        <a href="#3">How humans see data</a>        
    </li>
    <li>
        <a href="#5">Applied Matplotlib</a>        
    </li>
    <li>
        <a href="#6">Exercises</a>        
    </li>
</ol>

## Resources
[Python data science handbook](https://tanthiamhuat.files.wordpress.com/2018/04/pythondatasciencehandbook.pdf?fbclid=IwAR0Du6YVLl32yYexqQbgiHWu4AnwWgrAWdYLRMn-ZPnHa4HVf_sc4zEBSao)

[Matplotlib docs](https://matplotlib.org/)

[How humans see data](https://www.youtube.com/watch?v=fSgEeI2Xpdc&t=3s)

[Graphical perception and graphical methods for analyzing scientific data](http://snoid.sv.vt.edu/~npolys/projects/safas/1695272.pdf)

[Exercises](https://www.w3resource.com/graphics/matplotlib/)

[Further learning](https://github.com/rfordatascience/tidytuesday)


# Visualization with Matplotlib

A visualization library for matrix plotting, built on NumPy, that is the basis for plotting in Pandas and Seaborn.

Matplotlib has powerful internals that give you fine-tuned control of graphics.

In [None]:
plt.style.available

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*'.
# Prefer the new name and fall back to the old one on earlier releases.
style_name = 'seaborn-v0_8-colorblind'
if style_name not in plt.style.available:
    style_name = 'seaborn-colorblind'  # name used before Matplotlib 3.6
plt.style.use(style_name)

In [None]:
x = np.linspace(0, 10, 100)

fig = plt.figure()
plt.plot(x, np.sin(x), '-')
plt.plot(x, np.cos(x), '--');

## Two coding styles
- Stateful.
- Object-oriented.

### Stateful plotting
Using `matplotlib.pyplot` gives MATLAB-like tools that keep track of the current figure and axes.

In [None]:
plt.subplot(2, 1, 1)
plt.plot(x, np.sin(x))
plt.scatter(x[::10], np.sin(x[::10]), marker='x')

plt.subplot(2, 1, 2)
plt.plot(x, np.cos(x));
plt.plot(x, -np.cos(x));

In [None]:
plt.gcf();

In [None]:
plt.gca();

### Object-oriented
This approach is better for keeping track of plots. 

There are two types of objects:
- `Figure`
- `Axes`

The `Axes` object uses methods to make plots on the `Figure`.

Get familiar with `plt.subplots()`.

In [None]:
fig, ax = plt.subplots(figsize=(10, 5))
ax.arrow(x=0.2, y=0.7, dx=0.4, dy=-0.1, head_width=0.05, head_length=0.05)
ax.arrow(x=0.2, y=0.3, dx=0.4, dy=0.1, head_width=0.05, head_length=0.05)
ax.text(x=0.1, y=0.8, s='ax1', fontsize=16)
ax.text(x=0.1, y=0.2, s='ax2', fontsize=16)
ax.text(x=0.7, y=0.5, s='fig', fontsize=16)
ax.set_title('Axes and figures are "Many-to-One"', fontsize=24);

<div class="d-block text-center">
<div class="jumbotron">
    <h2>Idea --> Abstract patterns --> Functions --> Trial, error, docs..</h2>
</div>
        </div>

## Line plots

Use `ax.plot`.

<div class="d-block py-5">
<div class="alert alert-info">
    <h3>Tip: use <code>np.arange</code> or <code>np.linspace</code> to quickly produce the domain, <code>x</code>.</h3>
</div>
    </div>

In [None]:
fig, ax = plt.subplots()
x = [50, 100]
y = [50, 25]
ax.plot(x, y)
ax.set_xlim(left=0)

Other Parameters
----------------

**kwargs : `.Line2D` properties, optional
    *kwargs* are used to specify properties like a line label (for
    auto legends), linewidth, antialiasing, marker face color.
    Example::

    >>> plot([1, 2, 3], [1, 2, 3], 'go-', label='line 1', linewidth=2)
    >>> plot([1, 2, 3], [1, 4, 9], 'rs', label='line 2')
---
_Scroll me_
<div style="overflow: scroll; max-height: 230px;">


    If you make multiple lines with one plot command, the kwargs
    apply to all those lines.

    Here is a list of available `.Line2D` properties:

    Properties:
    agg_filter: a filter function, which takes a (m, n, 3) float array and a dpi value, and returns a (m, n, 3) array
    alpha: float or None
    animated: bool
    antialiased or aa: bool
    clip_box: `.Bbox`
    clip_on: bool
    clip_path: Patch or (Path, Transform) or None
    color or c: color
    contains: callable
    dash_capstyle: {'butt', 'round', 'projecting'}
    dash_joinstyle: {'miter', 'round', 'bevel'}
    dashes: sequence of floats (on/off ink in points) or (None, None)
    data: (2, N) array or two 1D arrays
    drawstyle or ds: {'default', 'steps', 'steps-pre', 'steps-mid', 'steps-post'}, default: 'default'
    figure: `.Figure`
    fillstyle: {'full', 'left', 'right', 'bottom', 'top', 'none'}
    gid: str
    in_layout: bool
    label: object
    linestyle or ls: {'-', '--', '-.', ':', '', (offset, on-off-seq), ...}
    linewidth or lw: float
    marker: marker style
    markeredgecolor or mec: color
    markeredgewidth or mew: float
    markerfacecolor or mfc: color
    markerfacecoloralt or mfcalt: color
    markersize or ms: float
    markevery: None or int or (int, int) or slice or List[int] or float or (float, float)
    path_effects: `.AbstractPathEffect`
    picker: float or callable[[Artist, Event], Tuple[bool, dict]]
    pickradius: float
    rasterized: bool or None
    sketch_params: (scale: float, length: float, randomness: float)
    snap: bool or None
    solid_capstyle: {'butt', 'round', 'projecting'}
    solid_joinstyle: {'miter', 'round', 'bevel'}
    transform: `matplotlib.transforms.Transform`
    url: str
    visible: bool
    xdata: 1D array
    ydata: 1D array
    zorder: float
</div>

In [None]:
?mpl.axes._subplots.Axes.plot

In [None]:
fig

In [None]:
ax

In [None]:
dir(fig)

In [None]:
fig.axes


In [None]:
ax.clear()

In [None]:
ax.set_title('Updated plot')

In [None]:
ax.scatter(x, y, marker='x', color='red', alpha=0.9)
ax.plot(x, y, linestyle='--', color='chartreuse', linewidth=10, alpha=0.2, 
        dash_capstyle='round', label='My dashed line')
fig

In [None]:
ax.set_xlim(70, 80)
ax.set_xlabel('X axis')
ax.set_ylabel('Y axis')

ax.legend()

In [None]:
fig

### Adding an Axes to an existing figure

`fig.add_subplot`

In [None]:
ax2 = fig.add_subplot(2, 1, 2)
ax2.plot([0, 50], [0, 50])
fig

In [None]:
ax.clear()

In [None]:
fig.clear()

In [None]:
ax = fig.add_subplot(1, 1, 1)
ax.plot(x, y)
fig

In [None]:
ax2 = fig.add_subplot(2, 2, 2)
ax2.plot([0, 50], [0, 50])
fig

In [None]:
ax3 = fig.add_subplot(3, 3, 3)
ax3.plot([0, 50], [0, 50])
fig

In [None]:
fig, axes = plt.subplots(3, 3)

In [None]:
axes[1, 2].plot(x, y)
axes[1, 2].set_title('my title')
fig.tight_layout()
fig

In [None]:
fig = plt.figure()
ax1 = plt.subplot(2, 2, 1)
ax2 = plt.subplot(2, 2, 3)
ax3 = plt.subplot(1, 2, 2)

ax1.plot(x, y)
ax2.scatter(x, y)
ax3.scatter(1, 1, marker='o')

fig.tight_layout()

In [None]:
ax.clear()

In [None]:
plt.tight_layout()
fig

In [None]:
fig.axes

## Scatter plots

In [None]:
# Draw five random points per marker style so the legend doubles as a
# marker reference chart.
rng = np.random.RandomState(0)

markers = ['o', '.', ',', 'x', '+', 'v', '^', '<', '>', 's', 'd']
fig, ax = plt.subplots()
for marker in markers:
    # Interpolate the loop variable; a literal {0} would label every
    # series "marker='0'".
    ax.scatter(rng.rand(5), rng.rand(5), marker=marker,
               label=f"marker='{marker}'")

# Legend and limits only need to be set once, after all series exist.
ax.legend(numpoints=1)
ax.set_xlim(0, 1.8);  # extra room on the right so the legend does not cover points


In [None]:
rng = np.random.RandomState(0)
x = rng.beta(2, 5, 100)
y = rng.beta(2, 10, 100)
colors = rng.beta(1, 1, 100)
sizes = [1e3 * i * j for i, j in zip(x, y)]
plt.scatter(x, y, c=colors, s=sizes, alpha=0.3, cmap='viridis')
plt.colorbar(); # show color scale

In [None]:
from sklearn.datasets import load_iris
iris = load_iris()
features = iris.data.T
plt.scatter(features[0], features[1], alpha=0.2,
s=100*features[3], c=iris.target, cmap='viridis')
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1]);

## Visualizing errors

In [None]:
fig, ax = plt.subplots()

x = np.linspace(0, 10, 50)
dy = 0.8
y = np.sin(x) + dy * np.random.randn(50)

plt.errorbar(x, y, yerr=dy, fmt='.k')

In [None]:
fig, ax = plt.subplots()

x = np.linspace(0, 2 * np.pi, 100)
y1 = np.sin(x)
y2 = np.cos(x)

ax.plot(x, y1, c='r')
ax.plot(x, y2, c='g')


In [None]:
ax.fill_between(x, y1, y2, color='grey', alpha=0.2)
fig

## Density and contour plots

In [None]:
f = lambda x, y: np.sin(x) ** 10 + np.cos(10 + y * x) * np.cos(x) 

x = np.linspace(0, 5, 50)
y = np.linspace(0, 5, 40)

X, Y = np.meshgrid(x, y)

Z = f(X, Y)

plt.contour(X, Y, Z, colors='red')

In [None]:
from sklearn.datasets import make_regression

X, y = make_regression(n_features=2, noise=10,
                      random_state=42, bias=10)

# add bias
X = np.c_[np.ones(X.shape[0]), X]

df = pd.DataFrame(dict(
    bias=X[:, 0],
    x1=X[:, 1],
    x2=X[:, 2],
    y=y
))
df.head()

In [None]:
def h(X, w):
    """Linear hypothesis: return the matrix product ``X @ w``."""
    return X @ w


def loss(h, y):
    """Element-wise squared error between predictions ``h`` and targets ``y``.

    Note that the parameter ``h`` holds prediction *values* here and shadows
    the hypothesis function of the same name inside this body.
    """
    return (h - y) ** 2


def cost(h, y):
    """Mean of the squared errors between predictions ``h`` and targets ``y``."""
    return loss(h, y).mean()


def d_cost(X, w, y):
    """Return ``(X @ w - y) @ X``, the direction used for the weight update.

    This is the gradient of half the *summed* squared error. It is
    proportional to, but not equal to, the gradient of ``cost`` (which takes
    the mean); the missing constant factor is absorbed by the learning rate.
    """
    return (h(X, w) - y) @ X


def update(X, w, y, alpha=0.01):
    """Take one gradient-descent step and return the new weights.

    ``alpha`` is the learning rate multiplying ``d_cost(X, w, y)``.
    """
    return w - alpha * d_cost(X, w, y)

In [None]:
w = np.random.rand(3)
d_cost(X, w, y)

In [None]:
# Run a fixed number of gradient-descent steps (100; there is no convergence
# test), recording the weights and the cost after every step.
# Starts from whatever `w` currently holds, so re-running this cell continues
# from the previous result rather than restarting.
list_w = []
list_cost = []
for i in range(100):
    w = update(X, w, y, alpha=0.001)
    list_w.append(w)
    list_cost.append(cost(h(X, w), y))

In [None]:
w1, w2, w3 = zip(*list_w)
plt.scatter(w2, w3, c=list_cost, cmap='RdYlGn_r')

In [None]:
pd.DataFrame(dict(true=y, pred=h(X, w))).head(10)

In [None]:
# Evaluate the fitted plane on a regular grid spanning the two real features
# (columns 1 and 2 of X; column 0 is the constant bias term added above).
x1_grid = np.linspace(X[:, 1].min(), X[:, 1].max(), 50)
x2_grid = np.linspace(X[:, 2].min(), X[:, 2].max(), 50)
X_mesh, Y_mesh = np.meshgrid(x1_grid, x2_grid)
# Element-wise prediction at every grid point; h(X_mesh, Y_mesh) would instead
# matrix-multiply the two coordinate grids.
Z = w[0] + w[1] * X_mesh + w[2] * Y_mesh

In [None]:
plt.contour(X_mesh, Y_mesh, Z, colors='red')

# Competition data
The data comes from the [Connectivity Map](https://clue.io/), a project from the Broad Institute of MIT and Harvard, which is the world’s largest "perturbation-driven gene expression dataset".

_What is perturbation-driven gene expression?_
- When gene expression is measured after treating the cell with a drug or other perturbation.

The dataset combines cell gene expression and cell viability data. The data is based on a new technology that measures simultaneously (within the same samples) human cells’ responses to drugs in a pool of 100 different cell types (thus solving the problem of identifying ex-ante, which cell types are better suited for a given drug). In addition, you will have access to MoA annotations for more than 5,000 drugs in this dataset.


## Motivation
To advance drug development through improvements to "Mechanism of action" prediction algorithms. A successful MoA algorithm will predict a compound’s MoA given its cellular signature, thus helping scientists advance the drug discovery process.

_What is a mechanism of action?_
- The biological cause-and-effect relationships between a drug, its target, and a disease.

Your task is to use the training dataset to develop an algorithm that automatically labels each case in the test set as one or more MoA classes.

Your solution will be evaluated using the log-loss metric.

In [None]:
df_train = pd.read_csv('../input/lish-moa/train_features.csv', index_col=[0])
df_train

In [None]:
df_train.index.duplicated().any()

In [None]:
pd.Series(i.split('-')[0] for i in df_train.columns).value_counts()

# What does this data mean?
- `sig_id` is the sample id.
- `cp_type` indicates if the sample is treated with a _compound_ or with a _control perturbation_ (`ctl_vehicle`).
- `cp_time` treatment duration.
- `cp_dose` high or low dosage.
- `g-x` is gene expression. Notice the genes are _unlabeled_.
- `c-x` is [cell viability](https://www.cellsignal.com/contents/_/synopsis-of-cell-proliferation-metabolic-status-and-cell-death/cell-viability-and-survival#:~:text=Cell%20viability%20is%20a%20measure,as%20during%20a%20drug%20screen.) - a measure of living cells in a sample.

In [None]:
df_train['cp_type'].value_counts()

In [None]:
df_train['cp_dose'].value_counts()

In [None]:
df_train['cp_time'].value_counts()

In [None]:
df_train_target = pd.read_csv('../input/lish-moa/train_targets_scored.csv',
                              index_col=[0])
df_train_target

In [None]:
df_train_target_non = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv',
                                  index_col=[0])
df_train_target_non

## What does the target data mean?
Each column is a MoA target. It means the treatment was intended to perturb that target.

In [None]:
df_train_target_long = df_train_target.melt(var_name='target', value_name='response',
                                            ignore_index=False)
df_train_target_long.head()

In [None]:
df_train_target_long['response'].value_counts()

### Do controls have response?

In [None]:
df_train_target_ctl = df_train[['cp_type']].join(df_train_target).query("cp_type == 'ctl_vehicle'")
df_train_target_ctl.head()

In [None]:
df_train_target_ctl.melt(id_vars='cp_type', var_name='target', value_name='response',
                         ignore_index=False)['response'].value_counts()

# Visualization is communication
Help others solve analytical problems quickly and accurately. 

Visualizations are quick-to-understand encodings of data.

Efficient comparisons.

# What is better?

In [None]:
x = np.linspace(0, 2 * np.pi, 100)
y = np.sin(x)
df = pd.DataFrame({'x': x, 'y': y})
df

In [None]:
df.plot(kind='scatter', x='x', y='y')

## Pre-attentive processing
It took 250 ms to understand the graph.

The best visualizations make info available _pre-attentively_.

# The elements of graphing data

The three visual operations of pattern perception:
1. Detection.
2. Assembly.
3. Estimation.
    - Discrimination.
    - Ranking.
    - Ratioing

## There are seven ways to encode values in graphical objects

Ask yourself, what is the most important thing to communicate?

Rank everything you want to communicate by importance, encode those things into graphical objects by going down this ranked-list:
1. Position along a common scale.
2. Position on identical but non-aligned scales.
3. Length.
4. Angle, slope
5. Area.
6. Volume, Density, Color saturation.
7. Hue.

# Back to MoA data - EDA
We want to understand our data. Our goal is to use gene expression, cell viability, and dosage features to predict the MoA target. 

How do our features relate to each other? Any correlations?

How do our features relate to the targets?

Do the targets cluster with each other?

## Feature correlations

In [None]:
cols_genes = [i for i in df_train.columns if i.startswith('g-')]
cols_cv = [i for i in df_train.columns if i.startswith('c-')]

In [None]:
df_genes = df_train[cols_genes].melt(var_name='gene', value_name='expression',
                                     ignore_index=False)
df_genes

---

aside

In [None]:
import sys
sys.getsizeof(df_train[cols_genes])

In [None]:
sys.getsizeof(df_genes)

---

# Clustering genes

In [None]:
import umap

In [None]:
def filter_genes(df, var=1):
    '''Keep only the columns (genes) whose spread exceeds a threshold.

    Despite its name, ``var`` is compared against each column's standard
    deviation (``df.std()``), not its variance. Columns whose standard
    deviation is strictly greater than ``var`` are returned; all rows are kept.
    '''
    is_variable = df.std() > var
    return df.loc[:, is_variable]

In [None]:
df_train[cols_genes].std().plot(kind='kde')

In [None]:
df_genes = filter_genes(df_train[cols_genes], 1.5)
df_genes.shape

In [None]:
# Fit a UMAP embedding on the variance-filtered gene columns.
# NOTE(review): UMAP documents 'yule' among its metrics for binary data, while
# df_genes appears to hold continuous expression values — confirm the metric
# choice. No random_state is passed, so the embedding can differ between runs.
mapper = umap.UMAP(metric='yule')
mapper.fit(df_genes)

In [None]:
embeddings = mapper.transform(df_genes)

In [None]:
df_emb = pd.DataFrame(embeddings)
df_emb

In [None]:
df_emb.plot(kind='scatter', x=0, y=1)

<div id="6"></div>

# Exercises

[Click here](https://www.machinelearningplus.com/python/101-pandas-exercises-python/)