In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1.Basic Plots

We'll be using the Kaggle Heart Disease UCI dataset as an example. You can find it here: https://www.kaggle.com/ronitf/heart-disease-uci

This section isn't meant to be ground-breaking, but will introduce you to the common syntax used in pandas, matplotlib and seaborn.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

df = pd.read_csv("../input/heart-disease-uci/heart.csv")

## Bar Plots

Comparing discrete data. Available through either pandas or matplotlib.

In [None]:
df.head()

In [None]:
chest_pain = df.groupby(by="cp").median().reset_index()
chest_pain.head()

In [None]:
chest_pain.plot.bar(x="cp", y="age");

In [None]:
chest_pain.plot.bar(x="cp");

In [None]:
fig, ax = plt.subplots()
ax.bar(chest_pain["cp"], chest_pain["age"], label="age")
ax.set_xlabel("cp")
ax.legend();

In [None]:
fig, ax = plt.subplots()
ax.bar(chest_pain["cp"], chest_pain["age"], label="age", 
       edgecolor="k", color=["red", "blue", "green", "black"])
ax.set_xlabel("cp")
ax.set_xticks(chest_pain["cp"]);

In [None]:
# Mean age per chest-pain type, with error bars drawn in white.
# x and y are passed by keyword: seaborn >= 0.12 only accepts `data` positionally,
# so the positional form raises a TypeError there. Keywords work on older releases too.
# NOTE: `errcolor` is deprecated from seaborn 0.13 in favour of err_kws={"color": ...};
# it is kept here so the cell still runs on older seaborn versions.
ax = sb.barplot(x="cp", y="age", errcolor="w", capsize=0.1, data=df)

## Scatter Plots


In [None]:
df.plot.scatter("age", "trestbps");

In [None]:
fig, ax = plt.subplots()
ax.scatter(df["age"], df["trestbps"], marker="^", 
           s=60, c=df["age"], edgecolors="k", alpha=0.5)
ax.set_xlabel("age")
ax.set_ylabel("trestbps");

In [None]:
# Same scatter via seaborn, coloured by age.
# x and y must be keywords: seaborn >= 0.12 rejects them as positional arguments.
sb.scatterplot(x="age", y="trestbps", hue="age", s=30, edgecolor="none", data=df);

## Line Plots


In [None]:
ages = df.groupby("age").median().reset_index()
ages.head()

In [None]:
ages.plot.line("age", "chol");

In [None]:
ages.plot.line("age", ["chol", "trestbps"]);

In [None]:
fig, ax = plt.subplots()
ax.plot(ages["age"], ages["trestbps"], ls=":", lw=1.7)
ax.set_xlabel("age")
ax.set_ylabel("trestbps");

In [None]:
# Same line plot via seaborn.
# x and y must be keywords: seaborn >= 0.12 rejects them as positional arguments.
sb.lineplot(x="age", y="trestbps", data=ages);

### Recap

* bar
* scatter
* line

Doesn't get more fundamental than that.

Hopefully you're now familiar with the basic syntax, so we can move onto more specialised visualisations. 

Now, every plot shown here is a good way to visualise data. There is one unfortunately common way of visualising data which should ideally never be used. Have a guess what it is, and I'll rant about it in the next lecture.

# 2.Plotting methods

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("../input/heart-disease-uci/heart.csv")
df.head(2)

In [None]:
# Can pass axes to pandas for more flexibility
fig, axes = plt.subplots(ncols=2)
df.plot.scatter(x="age", y="chol", ax=axes[0])
df.plot.scatter(x="age", y="trestbps", ax=axes[1])
fig.tight_layout();

In [None]:
# If you dont, and you get an axis, you can get the figure easily
fig2 = axes[0].get_figure()

In [None]:
fig2.savefig("plots.png", bbox_inches="tight", transparent=True)

In [None]:
# You can easily change your style, recommended via a context manager (ie the with statement)
# The style only applies to figures created inside the `with` block.
with plt.style.context("default"):
    # Three side-by-side panels sharing one y axis; the first panel is twice as wide
    # as the others and there is no horizontal gap between panels (wspace=0).
    fig, axes = plt.subplots(ncols=3, sharey=True, 
                             gridspec_kw={"width_ratios": [2, 1, 1], "wspace": 0})
    y = "age"
    xs = ["chol", "trestbps", "thalach"]
    # One column of `xs` per panel, always plotted against age.
    for x, ax in zip(xs, axes):
        ax.scatter(df[x], df[y])
        ax.set_xlabel(x)
    # The y axis is shared, so only the left-most panel gets a label.
    axes[0].set_ylabel(y)
    # Save the same figure as both a raster (PNG) and a vector (PDF) file.
    fig.savefig("output.png", dpi=100, bbox_inches="tight")
    fig.savefig("output.pdf", dpi=100, bbox_inches="tight")

### Recap
* Pandas is great for quick plots
* matplotlib is more verbose, but has more control
* You can mix and match. 

# 3.Visualising 1D Distribution

We'll be using the Kaggle Heart Disease UCI dataset as an example. You can find it here: https://www.kaggle.com/ronitf/heart-disease-uci


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

df = pd.read_csv("../input/heart-disease-uci/heart.csv")
df.head()

## Histograms

The most common of all ways to show a distribution

In [None]:
df.age.plot.hist(bins=30);

In [None]:
plt.hist(df.age, bins=30);

In [None]:
fig, ax = plt.subplots()
ax.hist(df.trestbps, bins=20, histtype="step", label="trestbps")
ax.hist(df.thalach, bins=20, histtype="stepfilled", label="thalach", alpha=0.3, edgecolor="w")
ax.legend();

## Box Plots

Shows the median, quartiles, IQR and outliers. IQR is the inter-quartile range = (75% - 25%). Outliers are points more than 1.5 * IQR beyond the quartiles.

In [None]:
df[["trestbps", "thalach"]].plot.box();

In [None]:
df[["trestbps", "thalach"]].describe()

In [None]:
plt.boxplot(df[["trestbps", "thalach"]].to_numpy());

In [None]:
sb.boxplot(data=df[["trestbps", "thalach"]]);

In [None]:
sb.boxplot(x="cp", y="trestbps", data=df);

In [None]:
df.groupby("cp").boxplot(column="trestbps", sharex=True, layout=(1, 4), grid=False)
plt.tight_layout();

## Violin Plots

Like box plots... but better. Not a staple of pandas at the moment (though it has been requested):

In [None]:
fig, ax = plt.subplots()
ax.violinplot(df[["trestbps", "thalach"]].to_numpy());

In [None]:
# Control the bandwidth to control the smoothing.
fig, ax = plt.subplots()
ax.violinplot(df[["trestbps", "thalach"]].to_numpy(), bw_method=0.2);

In [None]:
sb.violinplot(data=df[["trestbps", "thalach"]], inner="quartile", bw=0.2);

## Bee swarm plots

If you want to go fancy, these can be fun for presentations, but less concise than other plots.

In [None]:
sb.swarmplot(data=df[["trestbps", "thalach"]], size=3);

In [None]:
sb.violinplot(data=df[["trestbps", "thalach"]], inner=None);
sb.swarmplot(data=df[["trestbps", "thalach"]], size=2, color="k", alpha=0.5);

Often combining these plots is the right choice. You can see here it reveals structure in the violin plot for trestbps that the KDE is smoothing over.

### Recap

* histogram
* box
* violin
* bee swarm

# 4.Visualising 2D data

For this example, we'll use the Meteor Impact data from NASA, available here:
https://www.kaggle.com/nasa/meteorite-landings

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np

df = pd.read_csv("../input/meteorite-landings/meteorite-landings.csv")
df.head()

In [None]:
df = df.dropna(subset=["reclong", "reclat"])
df = df[df.reclong < 300]

## 2D Histograms

The staple

In [None]:
plt.hist2d(df.reclong, df.reclat, bins=200, vmax=4)
plt.colorbar();

## 2D Hex plots

In [None]:
# Hexagonal 2D histogram of landing positions.
# The resolution of a hexbin is set by `gridsize` (number of hexagons across the x axis).
# Unlike hist2d, `bins` here does NOT set the resolution: an integer `bins` discretises
# the per-hexagon counts into that many colour levels, which is not what we want.
plt.hexbin(df.reclong, df.reclat, gridsize=200, vmax=4, lw=0.0)
plt.colorbar();

In [None]:
df.plot.hexbin(x="reclong", y="reclat", vmax=2, gridsize=100, linewidth=0.25);

## Contour

It's easiest to show these plots using data with broader shapes, so let's make some.

In [None]:
spacing = np.linspace(0, 10, 200)
X, Y = np.meshgrid(spacing, spacing)
Z = (np.sin(X) + np.cos(Y) + 2 * np.arcsinh(X * Y))**2

plt.contour(X, Y, Z, levels=20)
plt.colorbar();

In [None]:
c = plt.contour(X, Y, Z, levels=20)
plt.clabel(c, inline=True, fmt="%0.1f")
plt.colorbar();

In [None]:
c = plt.contourf(X, Y, Z, levels=20)
plt.colorbar();

In [None]:
plt.contourf(X, Y, Z, levels=10)
c = plt.contour(X, Y, Z, levels=10, colors="black")
plt.clabel(c, inline=True, fmt="%0.1f");

## KDE

To look at KDE, let me quickly draw some samples from the above surface

In [None]:
# This is called rejection sampling, a way to brute force sample any surface, so long
# as you're fine waiting for slow code
np.random.seed(0)  # fix the random draw so the sampled points are the same on every run
n = 50000
# Propose n points uniformly over the [0, 10] x [0, 10] square
xs, ys = np.random.uniform(0, 10, n), np.random.uniform(0, 10, n)
# Height of the surface at each proposed point, rescaled so the largest sampled height is 1
zs = (np.sin(xs) + np.cos(ys) + 2 * np.arcsinh(xs * ys))**2
zs /= zs.max()
# Keep each proposal with probability equal to its rescaled height
passed = np.random.uniform(0, 1, n) < zs
xs, ys = xs[passed], ys[passed]
plt.scatter(xs, ys, s=1, alpha=0.2);

In [None]:
# Bivariate KDE of the sampled points with the default bandwidth.
# x and y are passed by keyword: seaborn >= 0.12 rejects a positional second array.
sb.kdeplot(x=xs, y=ys);

In [None]:
# A large bandwidth over-smooths the density.
# x and y are keyword-only in seaborn >= 0.12, and the deprecated `bw` argument is
# replaced by `bw_method`, which is what seaborn maps `bw` onto since 0.11.
sb.kdeplot(x=xs, y=ys, bw_method=2.0);

In [None]:
# A small bandwidth keeps more of the fine structure (and more noise).
# x and y are keyword-only in seaborn >= 0.12, and the deprecated `bw` argument is
# replaced by `bw_method`, which is what seaborn maps `bw` onto since 0.11.
sb.kdeplot(x=xs, y=ys, bw_method=0.2);

## Jointplots

Joint plots allow you to swap out the interior and marginal plots to get something that perfectly suits your needs.

In [None]:
sb.jointplot(data=df, x="reclong", y="reclat");

In [None]:
sb.jointplot(data=df, x="reclong", y="reclat", kind="hex", 
             gridsize=100, vmax=3, linewidth=0, marginal_kws={"bins": 100});

In [None]:
sb.jointplot(x=xs, y=ys, kind="kde");

In [None]:
sb.jointplot(x=xs, y=ys, kind="hex", gridsize=20, cmap="magma");

In [None]:
sb.pairplot(data=df[["reclat", "reclong", "mass"]]);

### Recap

* plt.hist2d
* plt.hexbin
* df.plot.hexbin
* plt.contour
* plt.contourf
* sb.kdeplot
* sb.jointplot

# 5.2D Dists continued!

## Pandas styling for tables!

What if instead of having a ton of data to bin, we simply have a 2D table in pandas?

In [None]:
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.normal(size=(6,6)), columns=[x for x in "ABCDEF"])
df

In [None]:
def neg_red(x):
    """Return the CSS text-colour rule for a single cell value.

    Negative values are coloured red; everything else (zero and positive) is gray.
    """
    if x < 0:
        return "color: red"
    return "color: gray"
df.style.applymap(neg_red)

In [None]:
def gold_max(xs):
    """Build a CSS style table that highlights the largest value in ``xs``.

    ``xs`` is the pandas object handed over by ``Styler.apply``. The result has the
    same shape and labels, holding a gold background rule wherever a cell equals
    the overall maximum and an empty string everywhere else.
    """
    highlight = "background-color: #c78f2e"
    # Maximum over every value, regardless of row or column.
    peak = xs.to_numpy().max()
    # Boolean mask of the cells equal to the maximum, translated into CSS strings.
    return xs.eq(peak).replace({True: highlight, False: ""})
df.style.apply(gold_max, axis=None)

In [None]:
df.style.background_gradient(cmap="magma")

In [None]:
df.abs().style.bar(align="left", width=90)

In [None]:
head = {"selector": "th", "props": [("text-align", "center")]}
df.style.set_table_styles([head]).bar(align="mid", color=['red', 'green'], vmin=-3, vmax=3)

In [None]:
df.style.bar(align="mid", width=50).apply(gold_max, axis=None).applymap(neg_red)

# 6.Visualising ND data

Unfortunately, human brains are not the best at visualising higher dimensions. To understand high dimensional data is its own course, and libraries like `sklearn` have many algorithms for finding structure in high dimensional data and showing you those relationships in lower dimensional plots.

For this section, we're going to use the heart data again (https://www.kaggle.com/ronitf/heart-disease-uci), plus we're going to make some data ourselves.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from sklearn import manifold

df = pd.read_csv("../input/heart-disease-uci/heart.csv")
target = df["target"].to_numpy()
df = df[["age", "sex", "cp", "trestbps", "chol", "thalach"]]
df.head()

## Scatter Matrix

This is the easiest and most commonly used way of visualising ND data. It works by breaking it down to many 2D surfaces.

In [None]:
pd.plotting.scatter_matrix(df);

## Correlation Matrix

A very quick view of how much each parameter (column) is correlated with another. 

In [None]:
df.corr()

In [None]:
ax = sb.heatmap(df.corr(), annot=True, fmt="0.2f", square=True);

## What about more complicated methods?

So let's generate a 4-dimensional spiral and see what we can do

In [None]:
np.random.seed(0)
n = 2000
t = np.linspace(0, 20, n)
x = t * np.sin(t) + 0.2 * np.random.normal(size=n)
y = t * np.cos(t) + 0.2 * np.random.normal(size=n)
z = np.log(t + 1) * np.sin(np.sqrt(t)) + 0.1 * np.random.normal(size=n)
a = np.log(t + 1) * np.cos(np.sqrt(t)) + 0.1 * np.random.normal(size=n)

df2 = pd.DataFrame({"x": x, "y": y, "z": z, "a": a})
data = df2.to_numpy()

In [None]:
pd.plotting.scatter_matrix(df2);

In [None]:
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.scatter(x, y, z, c=t, s=3+a)

So, it's an interesting pattern, and we can see some clear relationships on some of the slices, but not all of them.

## Manifold Learning

There are lots of techniques we could use, but I think manifold learning is the best. See https://scikit-learn.org/stable/modules/manifold.html for all of the easy to use `sklearn` implementations. Note that the next section closely follows the sklearn example, so I'd recommend having a look at that link if you want to know more.

In [None]:
methods = [
    ("LLE", manifold.LocallyLinearEmbedding(n_neighbors=20, method='standard')),
    ("LTSA", manifold.LocallyLinearEmbedding(n_neighbors=20, method='ltsa')),
    ("Hessian LLE", manifold.LocallyLinearEmbedding(n_neighbors=20, method='hessian')),
    ("Modified LLE", manifold.LocallyLinearEmbedding(n_neighbors=20, method='modified')),
    ("Isomap", manifold.Isomap()),
    ("MDS", manifold.MDS(n_init=1)),
    ("SE", manifold.SpectralEmbedding()),
    ("t-SNE", manifold.TSNE()),
]

fig, axes = plt.subplots(nrows=2, ncols=4)
for (name, method), ax in zip(methods, axes.flatten()):
    print(f"Running {name}")
    Y = method.fit_transform(data)
    ax.scatter(Y[:, 0], Y[:, 1], c=t)
    ax.set_title(name)
    ax.set_xticklabels([])
    ax.set_yticklabels([])

In [None]:
# Run every manifold method on the heart-disease features, one panel per method,
# colouring each point by its diagnosis (`target`).
fig, axes = plt.subplots(nrows=2, ncols=4)
for (name, method), ax in zip(methods, axes.flatten()):
    print(f"Running {name}")
    try:
        Y = method.fit_transform(df.to_numpy())
        ax.scatter(Y[:, 0], Y[:, 1], c=target)
    except Exception as exc:
        # Best effort: a failing method leaves its panel empty and the loop carries on,
        # but the reason is reported so the failure can be diagnosed.
        print(f"Method {name} failed: {exc!r}")
    ax.set_title(name)
    # The embedded coordinates have no physical meaning, so hide the tick labels.
    ax.set_xticklabels([])
    ax.set_yticklabels([])

### Recap

* scatter_matrix
* sb.pairplot
* correlation matrix
* Manifold learning (or other ways of compressing dimensionality)

# 7.Extra Practise

For an optional bit of "on your own" practise, let's take this Kaggle Pokemon dataset (https://www.kaggle.com/abcsds/pokemon) and try to answer a few questions visually:

1. What does the Attack vs Defense distribution look like? 
    1. Advanced: Break it down by Type
2. What are the top 10 pokemon by summed attributes?
    1. Can you modify the load in code and do this for a different generation, or only for "Mega" pokemon?
3. What are the distributions of battle stats (HP, attack through to speed)?
    1. Even better, what are the distributions, by type of pokemon, for the above question

Download the data, and this code should get you started. Remember that there are a thousand ways to answer these questions. If your solution looks completely different to mine, that's fine! I'm going to quickly restrict the data and only look at Gen1 Pokemon without the "Mega" label for simplicity.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sb
import numpy as np

df = pd.read_csv("../input/pokemon/Pokemon.csv")
# Filtering on the data. Don't get caught up on this, we'll cover it very soon!
# Each condition needs its own parentheses: `&` binds tighter than `==`, so without
# them Generation is compared against a boolean mask instead of the number, which
# stops selecting anything as soon as you change the generation to something other than 1.
df = df[(df.Generation == 1) & ~df.Name.str.contains("Mega")]
df.head()

## Pokemon Attack and Defense Correlation

In [None]:
# This is the most basic plot we could make
df.plot.scatter("Attack", "Defense", s=5);

In [None]:
# To do it by type we have a few options we can use. The easiest is to use one of seaborn's
# million different plotting functions, so here's a new one for you:

# x and y must be keywords: seaborn >= 0.12 rejects them as positional arguments.
# fit_reg=False turns off the regression line, leaving a scatter coloured by type.
sb.lmplot(x="Attack", y="Defense", hue="Type 1", data=df, fit_reg=False);

In [None]:
# Or we could define our own colour palette and use a function you'll see in the next chapter

# Get all the types
types = df["Type 1"].unique()
# Get colours for each type: sample the tab20 colormap at evenly spaced points, one per type
cs = matplotlib.cm.tab20(np.linspace(0, 1, len(types)))

fig, ax = plt.subplots(figsize=(12, 6))
# One scatter call per type, so each type gets its own colour and its own legend entry
for c, t in zip(cs, types):
    df2 = df[df["Type 1"] == t]
    ax.scatter(df2["Attack"], df2["Defense"], color=c, label=t)
# Three calls written as one tuple expression; the trailing semicolon hides the cell output
ax.legend(), ax.set_xlabel("Attack"), ax.set_ylabel("Defense");

In [None]:
# Another way would be to use GroupBy, or add a new column to make the above code more efficient.
# But don't want to get too ahead of myself

## Strongest Pokemon by Summed Stats

In [None]:
# So technically this is correct, but its a bit hard to read
df.plot.bar("Name", "Total");

In [None]:
# So heres a way of doing it, again using functions from the next chapter.
# We sort by Total, take the top 20 rows, and plot
df.sort_values("Total", ascending=False).iloc[:20, :].plot.bar("Name", "Total");

## Distribution of Battle stats

In [None]:
# You might notice that the default scatter_matrix you tried might throw an error,
# complaining about numpy boolean subtract (depending on your versions of pandas and numpy)
# You can either turn Legendary into a number (using astype) if that happens, or not plot it

columns = df.columns[5:-2]
pd.plotting.scatter_matrix(df[columns]);

In [None]:
# To get separate colours, lets try adding that new column. If you want to know how it all works
# keep watching, its coming soon!

# Using the types and cs from previous code
mapping = {t: c for t, c in zip(types, cs)}
pd.plotting.scatter_matrix(df[columns], color=df["Type 1"].map(mapping));

In [None]:
# To get the histograms broken down too, we'd normally move away from scatter_matrix
# and do it ourselves