In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from scipy import stats
# `scipy.integrate.trapz` was removed in SciPy 1.14; fall back to its
# replacement `trapezoid` under the same local name so later cells keep working.
try:
    from scipy.integrate import trapz
except ImportError:
    from scipy.integrate import trapezoid as trapz


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Plot theme used by the relational-plot sections that follow.
sns.set(style='darkgrid')

# Seed NumPy's global RNG once, early, so the randomly generated examples
# further down are reproducible under "Restart & Run All".
SEED = 42
np.random.seed(SEED)

# seaborn's 'tips' example dataset, used throughout the notebook.
tips = sns.load_dataset('tips')

In [None]:
# Preview the first five rows of the tips data.
tips.head(n=5)

# Data Exploration and Insights

## Visualising statistical relationships.

In [None]:
# Tip against total bill; colour encodes smoker status, marker style encodes meal time.
sns.relplot(
    data=tips, x='total_bill', y='tip',
    hue='smoker', style='time',
);

In [None]:
# Same scatter, with the numeric party size mapped to colour.
sns.relplot(data=tips, x='total_bill', y='tip', hue='size');

In [None]:
# Three semantics at once: colour (smoker), marker style (time) and
# marker size (party size, scaled into the 15-200 range).
sns.relplot(
    data=tips, x='total_bill', y='tip',
    hue='smoker', style='time',
    size='size', sizes=(15, 200),
);

### Emphasising continuity with lineplots.

In [None]:
# Synthetic random-walk series indexed by an integer time step.
df = pd.DataFrame({
    'time': np.arange(500),
    'value': np.cumsum(np.random.randn(500)),
})
sns.relplot(data=df, x='time', y='value', kind='line').fig.autofmt_xdate();

In [None]:
# Two-dimensional random walk; sort=False connects the points in row order
# rather than ordering them by x first.
df = pd.DataFrame(
    np.cumsum(np.random.randn(500, 2), axis=0),
    columns=["x", "y"],
)
sns.relplot(data=df, x="x", y="y", kind="line", sort=False);

In [None]:
# Same data with sort=True, for comparison with the unsorted plot.
sns.relplot(data=df, x="x", y="y", kind="line", sort=True);

### Aggregation and representing uncertainty.

In [None]:
# seaborn's 'fmri' example dataset: load it and preview the first five rows.
fmri = sns.load_dataset('fmri')
fmri.head(n=5)

In [None]:
# Line plot of signal over timepoint with the default aggregation settings.
sns.relplot(data=fmri, x='timepoint', y='signal', kind='line');

In [None]:
# ci=None turns the confidence band off.
# (Author's note: ci=False is said to work as well; not verified here.)
sns.relplot(data=fmri, x='timepoint', y='signal', kind='line', ci=None);

In [None]:
# ci='sd' — "sd" stands for standard deviation.
sns.relplot(data=fmri, x='timepoint', y='signal', kind='line', ci='sd');

In [None]:
# estimator=None disables aggregation. It must be None; estimator=False cannot be used.
sns.relplot(data=fmri, x='timepoint', y='signal', kind='line', estimator=None);

### Plotting subsets of data with semantic mapping.

In [None]:
# One coloured line per event type.
sns.relplot(data=fmri, x='timepoint', y='signal', kind='line', hue='event');

In [None]:
# Colour by region and line style by event.
sns.relplot(
    data=fmri, x='timepoint', y='signal', kind='line',
    hue='region', style='event',
);

In [None]:
# Distinguish the event levels with markers on solid lines instead of dashes.
sns.relplot(
    data=fmri, x='timepoint', y='signal', kind='line',
    hue='region', style='event',
    markers=True, dashes=False,
);

In [None]:
# Restrict to the 'cue' event and draw one unaggregated line per subject
# (units='subject', estimator=None), coloured by region.
sns.relplot(
    data=fmri.query("event=='cue'"),
    x='timepoint', y='signal', kind='line',
    hue='region', units='subject', estimator=None,
);

In [None]:
# seaborn's 'dots' example dataset, restricted to rows where align == 'dots'.
dots = sns.load_dataset('dots').query("align=='dots'")
# Preview only the first rows, as done for the other datasets in this notebook,
# instead of dumping the whole frame.
dots.head()

In [None]:
# Firing rate over time: colour by coherence, line style by choice.
sns.relplot(
    data=dots, x='time', y='firing_rate', kind='line',
    hue='coherence', style='choice',
);

In [None]:
# Firing rate over time: line width by coherence, line style by choice.
# NOTE(review): hue_norm is passed without a hue variable, so it presumably has
# no effect here — confirm whether size_norm (or hue='coherence') was intended.
sns.relplot(
    data=dots, x='time', y='firing_rate', kind='line',
    size='coherence', style='choice',
    hue_norm=LogNorm(),
);

In [None]:
# Random walk indexed by 500 consecutive dates; autofmt_xdate() tidies the date tick labels.
df = pd.DataFrame({
    'time': pd.date_range("2017-1-1", periods=500),
    'value': np.cumsum(np.random.randn(500)),
})
g = sns.relplot(data=df, x="time", y="value", kind="line")
g.fig.autofmt_xdate()

In [None]:
# One panel per meal time, coloured by smoker status.
sns.relplot(data=tips, x='total_bill', y='tip', hue='smoker', col='time');

In [None]:
# Grid of line plots: columns by region, rows by event, one colour per subject.
sns.relplot(
    data=fmri, x='timepoint', y='signal', kind='line',
    hue='subject', row='event', col='region',
    height=5,
);

In [None]:
# Frontal region only: one small panel per subject, wrapped four to a row.
sns.relplot(
    data=fmri.query("region=='frontal'"),
    x='timepoint', y='signal',
    kind='line', estimator=None,
    hue='event', style='event',
    col='subject', col_wrap=4,
    height=3, aspect=.75, linewidth=2.5,
);

## Plotting categorical data.

### Categorical Scatterplots - Swarmplots

In [None]:
# Switch to the 'ticks' theme with colour codes enabled for the categorical plots.
sns.set(color_codes=True, style='ticks')

In [None]:
# Default catplot of total bill per day.
sns.catplot(data=tips, x='day', y='total_bill');

In [None]:
# Same plot with jitter switched off.
sns.catplot(data=tips, x='day', y='total_bill', jitter=False);

In [None]:
# Axes-level swarm plot of total bill per day.
sns.swarmplot(data=tips, x='day', y='total_bill');

In [None]:
# Total bill per day, coloured by sex.
sns.catplot(data=tips, x='day', y='total_bill', hue='sex');

In [None]:
# Swarm plot with the numeric party size used as the categorical axis.
sns.catplot(data=tips, x='size', y='total_bill', kind='swarm');

In [None]:
# Fix the category order on the x axis explicitly: 'No' first, then 'Yes'.
sns.catplot(data=tips, x='smoker', y='tip', order=['No', 'Yes']);

In [None]:
# Horizontal swarm: the categorical variable goes on y, coloured by meal time.
sns.catplot(data=tips, x='total_bill', y='day', kind='swarm', hue='time');

### Visualising distribution of observations within categories.
#### Boxplots

In [None]:
# Box plot of total bill per day using the 'magma' palette.
sns.catplot(data=tips, x='day', y='total_bill', kind='box', palette='magma');

In [None]:
# Box plot per day, split by smoker status.
sns.catplot(data=tips, x='day', y='total_bill', kind='box', hue='smoker');

In [None]:
# Add a boolean 'weekend' column (True for Sat/Sun) to tips, then colour the
# boxes by it. dodge=False keeps each box centred on its day.
tips['weekend'] = tips['day'].isin(['Sat', 'Sun'])
sns.catplot(
    data=tips, x='day', y='total_bill', kind='box',
    hue='weekend', dodge=False,
);

In [None]:
# Same plot with the default dodge behaviour, for comparison.
sns.catplot(data=tips, x='day', y='total_bill', kind='box', hue='weekend');

In [None]:
# seaborn's 'diamonds' example dataset: load it and preview the first five rows.
diamonds = sns.load_dataset('diamonds')
diamonds.head(n=5)

#### Boxenplot
These also show the shape of distribution.


In [None]:
# Boxen plot of price per colour grade, with rows sorted by colour first.
sns.catplot(
    data=diamonds.sort_values(by='color'),
    x='color', y='price', kind='boxen',
);

#### Violinplot


In [None]:
# Horizontal violins of total bill per day, split by sex via hue.
sns.catplot(data=tips, x='total_bill', y='day', kind='violin', hue='sex');

In [None]:
# Same violins with a narrower KDE bandwidth (bw=.15) and cut=0.
sns.catplot(
    data=tips,
    x="total_bill", y="day", hue="sex",
    kind="violin", cut=0, bw=.15,
);

In [None]:
# split=True draws the two hue levels as the two halves of a single violin.
sns.catplot(
    data=tips, x='day', y='total_bill', kind='violin',
    hue='sex', split=True,
);

In [None]:
# Split violins with inner="stick" and a pastel palette.
sns.catplot(
    data=tips,
    x="day", y="total_bill", hue="sex",
    kind="violin", split=True, inner="stick",
    palette="pastel",
);

In [None]:
# Empty violins (inner=None) with the raw observations overlaid as a swarm.
# swarmplot is used for the overlay instead of catplot, because catplot is a
# figure-level function; the overlay is drawn onto the catplot's axes (g.ax).
g = sns.catplot(
    data=tips, x="day", y="total_bill",
    kind="violin", inner=None, palette='pastel',
)
sns.swarmplot(data=tips, x="day", y="total_bill", ax=g.ax, color="k", size=3);

## Statistical estimation within categories.

### Barplots

In [None]:
# seaborn's 'titanic' example dataset: load it and preview the first five rows.
titanic = sns.load_dataset('titanic')
titanic.head(n=5)

In [None]:
# Bar plot of 'survived' by sex, split by passenger class.
sns.catplot(
    data=titanic, x='sex', y='survived', kind='bar',
    hue='class', palette='pastel',
);

In [None]:
# Number of passengers per deck, split by class.
sns.countplot(data=titanic, x='deck', hue='class', palette='pastel');

In [None]:
# Point plot of 'survived' by sex, one line per class.
sns.catplot(data=titanic, x='sex', y='survived', kind='point', hue='class');

In [None]:
# Point plot by class, with a distinct marker and line style per sex.
sns.catplot(
    data=titanic, x='class', y='survived', kind='point',
    hue='sex',
    markers=['^', 'o'], linestyles=['-', '-.'],
);

### Plotting wide-form data

In [None]:
# seaborn's 'iris' example dataset: load it and preview the first five rows.
iris = sns.load_dataset('iris')
iris.head(n=5)

In [None]:
# Wide-form input: the whole frame is passed with no x/y, drawn as horizontal boxes.
sns.boxplot(orient='h', data=iris);

In [None]:
# Summary statistics for the petal_length column.
iris.petal_length.describe()

In [None]:
# Long-form usage on the same frame: sepal length per species.
sns.violinplot(data=iris, x='species', y='sepal_length');

In [None]:
# Preview the first rows only, as done for the other datasets in this notebook,
# rather than dumping the entire frame.
tips.head()

In [None]:
# Swarm plots faceted into one column per meal time.
sns.catplot(
    data=tips,
    x="day", y="total_bill", hue="smoker",
    kind="swarm", col="time", aspect=.6,
);

In [None]:
# Horizontal box plots of fare by survival, one row per class. Rows with
# fare <= 0 are dropped because the x axis is then switched to a log scale.
g = sns.catplot(
    data=titanic.query("fare > 0"),
    x="fare", y="survived", row="class",
    kind="box", orient="h",
    height=1.5, aspect=4,
)
g.set(xscale="log");

## Visualising Distribution of Data

In [None]:
# Reset seaborn's settings with colour codes enabled for the distribution plots.
sns.set(color_codes=True)

### Plotting uni-variate distributions

In [None]:
# 100 draws from a standard normal; reused by the univariate plots that follow.
x = np.random.normal(loc=0.0, scale=1.0, size=100);

In [None]:
# distplot with its default settings on the sample x.
sns.distplot(x);

* ### Histograms

In [None]:
# Histogram only (kde=False), with a rug of the individual observations.
sns.distplot(x, kde=False, rug=True);

By default the number of bins is chosen automatically; the plot below sets it explicitly instead.

In [None]:
# Same histogram with the bin count set explicitly: bins=20 gives 20 bins.
sns.distplot(x, bins=20, kde=False, rug=True);

* ###  Kernel Density Estimation
KDE plots encode the density of observations on one axis, with the height along the other axis representing the density.

In [None]:
# Density curve plus rug, with the histogram switched off.
sns.distplot(x, rug=True, hist=False);

The kernel density estimate first replaces each observation with a normal (Gaussian) curve centred at its value.

In [None]:
# Build a kernel density estimate by hand on a small sample.
x = np.random.normal(loc=0, scale=1, size=30)
# Rule-of-thumb bandwidth: 1.06 * sigma * n^(-1/5).
bandwidth = 1.06 * np.std(x) * x.size ** (-1 / 5.)
# Grid on which every kernel is evaluated.
support = np.linspace(-4, 4, num=200)

kernels = []
for x_i in x:
    # One Gaussian bump per observation, centred on it with the shared bandwidth.
    kernel = stats.norm(loc=x_i, scale=bandwidth).pdf(support)
    plt.plot(support, kernel, color="r")
    kernels.append(kernel)

sns.rugplot(x, color=".2", linewidth=3);

Next these curves are summed up to compute the value of density at each point in the support grid; the resulting curve is then normalised so as to give 1 as the area under the curve. 

In [None]:
# Sum the per-observation kernels pointwise, then rescale so the curve
# integrates to 1 over `support` (trapezoidal rule).
density = np.sum(kernels, axis=0)
density = density / trapz(density, support)
plt.plot(support, density);

The sns.kdeplot function gives a more direct interface with easier access to more kde-related functions.

In [None]:
# The same estimate via seaborn's dedicated KDE function, with the area under the curve shaded.
sns.kdeplot(x, shade=True);

The **bandwidth (`bw`)** parameter controls how **tightly the estimate is fit to the data**, much like the bin size in a histogram. It corresponds to the width of the kernels we plotted above. The default behaviour tries to guess a good value using a common reference rule.

In [None]:
# Compare the default bandwidth (shaded) with a narrow one (0.2) and a wide one (2).
sns.kdeplot(x, shade=True);
sns.kdeplot(x, bw=.2, label='bw:0.2');
sns.kdeplot(x, bw=2, label='bw:2');

As you can see above, the nature of the Gaussian KDE(green) process means that estimation extends past the largest and smallest values in the dataset. It’s possible to control how far past the extreme values the curve is drawn with the **cut parameter**; however, this only influences **how the curve is drawn and not how it is fit**:

In [None]:
# Effect of `cut` on how far the drawn curve extends; the rug marks the observations.
sns.kdeplot(x, cut=0, shade=True);  # cut=0: the curve stops at the most extreme observations.
sns.kdeplot(x, cut=1);              # positive cut: the curve extends past the most extreme observations.
sns.kdeplot(x, cut=2);              # larger cut: the curve extends further still.
sns.kdeplot(x, cut=-1);             # negative cut: the curve stops short of the most extreme observations.
sns.rugplot(x);

* ### Fitting parametric distributions

In [None]:
# Gamma-distributed sample (shape 6) with a gamma distribution passed as `fit`.
x = np.random.gamma(shape=6, size=200)
sns.distplot(x, fit=stats.gamma);

## Plotting Bivariate Distributions

In [None]:
# Correlated 2-D normal sample: means (0, 1), unit variances, covariance 0.5.
mean = [0, 1]
cov = [(1, .5), (.5, 1)]
data = np.random.multivariate_normal(mean, cov, size=200)
df = pd.DataFrame(data, columns=["x", "y"])
df.head(n=5)

### Scatterplots
Plotting two different variables on separate axes can be done easily with a scatterplot; it's similar to a rug plot in two dimensions.

sns.jointplot() plots the variables on multiple panels showing its distribution as well as correlation.

In [None]:
# Default jointplot of x against y.
sns.jointplot(data=df, x='x', y='y');

### Hexbin plot
It is to bivariate data what histogram is to univariate data. 


In [None]:
# Larger sample (1000 points) for the hexbin plot, drawn with the "white"
# axes style applied only inside the `with` block.
x, y = np.transpose(np.random.multivariate_normal(mean, cov, size=1000))
with sns.axes_style("white"):
    sns.jointplot(x=x, y=y, color="k", kind="hex");

### Kernel Density Estimation


In [None]:
# Jointplot with kind='kde'.
sns.jointplot(data=df, x='x', y='y', kind='kde');

We can also draw a two dimensional KDE plot.

In [None]:
# Bivariate KDE contours on a square figure, with a rug for each variable on the same axes.
# NOTE(review): x and y are passed positionally to kdeplot, following the older
# seaborn call style used in this notebook — confirm against the installed version.
f, ax = plt.subplots(figsize=(6, 6))
sns.kdeplot(df["x"], df["y"], ax=ax)
sns.rugplot(df["x"], ax=ax, color="g")
sns.rugplot(df["y"], ax=ax, vertical=True);

In [None]:
# Same figure with the density contours filled in (shade=True).
f, ax = plt.subplots(figsize=(6, 6))
sns.kdeplot(df["x"], df["y"], shade=True, ax=ax)
sns.rugplot(df["x"], ax=ax)
sns.rugplot(df["y"], ax=ax, vertical=True);

To show the bivariate density more continuously, we can increase the number of contour levels.

In [None]:
# Filled bivariate KDE with 60 contour levels for a near-continuous look.
f, ax = plt.subplots(figsize=(6, 6))
cmap = sns.cubehelix_palette(as_cmap=True, dark=0, light=1, reverse=True)
# Draw on the axes created above explicitly, like the neighbouring KDE cells,
# instead of relying on matplotlib's implicit "current axes".
sns.kdeplot(df.x, df.y, cmap=cmap, n_levels=60, shade=True, ax=ax);

jointplot() returns a JointGrid after plotting; for more flexibility we can use the JointGrid directly.

In [None]:
# Start from a KDE jointplot, overlay white '+' markers on the joint axes,
# make the first collection on those axes fully transparent, and relabel the axes.
g = sns.jointplot(data=df, x="x", y="y", color="m", kind="kde")
g.plot_joint(plt.scatter, marker="+", c="w", s=30, linewidth=1)
g.ax_joint.collections[0].set_alpha(0)
g.set_axis_labels("$X$", "$Y$");

### Visualising Pairwise Relationships
To visualise multiple pairwise bivariate distribution we can use the pairplot function

In [None]:
# Pairwise plots of the iris columns.
sns.pairplot(data=iris);

Adding the hue semantic changes histogram to kde for comparison between multiple distributions.

In [None]:
# Same grid, coloured by species.
sns.pairplot(data=iris, hue='species');

Similar to how jointplot is built on top of JointGrid, pairplot is built on top of PairGrid.

In [None]:
# Use PairGrid directly: KDE curves on the diagonal, filled 6-level bivariate
# KDEs on the off-diagonal panels.
g = sns.PairGrid(data=iris)
g.map_diag(sns.kdeplot)
g.map_offdiag(sns.kdeplot, shade=True, n_levels=6);

## Visualising Linear Relationships


In [None]:
# Reset seaborn's settings with colour codes enabled for the regression plots.
sns.set(color_codes=True)

## Functions to draw linear regression models
Two main functions in seaborn are used to visualize a linear relationship as determined through regression. These functions, regplot() and lmplot() are closely related, and share much of their core functionality. It is important to understand the ways they differ, however, so that you can quickly choose the correct tool for particular job.

In [None]:
# Axes-level regression plot of tip on total bill.
sns.regplot(data=tips, x='total_bill', y='tip');

In [None]:
# The same regression drawn with lmplot.
sns.lmplot(data=tips, x='total_bill', y='tip');

 **regplot()** accepts the x and y variables in a variety of formats including simple numpy arrays, pandas Series objects, or as references to variables in a pandas DataFrame object passed to data. In contrast, **lmplot()** has data as a required parameter and the x and y variables must be specified as strings.(Tidy data)

It’s possible to fit a linear regression when one of the variables takes discrete(categories) values, however, the simple scatterplot produced by this kind of dataset is often not optimal:

In [None]:
# Regression with a discrete predictor (party size).
sns.lmplot(data=tips, x='size', y='tip');

One option is to add some random noise (“jitter”) to the discrete values to make the distribution of those values more clear. Note that jitter is applied only to the scatterplot data and does not influence the regression line fit itself:

In [None]:
# Same regression with a little horizontal jitter (0.05) on the scatter points.
sns.lmplot(data=tips, x='size', y='tip', x_jitter=0.05);

A second option is to collapse over the observations in each discrete bin to plot an estimate of central tendency along with a confidence interval:

In [None]:
# Collapse the observations at each party size to their mean (x_estimator=np.mean).
sns.lmplot(data=tips, x='size', y='tip', x_estimator=np.mean);

## Fitting Various types of Models

In [None]:
# seaborn's 'anscombe' example dataset; the cells below filter it on its `dataset` column.
anscombe = sns.load_dataset('anscombe')
# The whole frame is displayed here rather than just its head.
anscombe

In [None]:
# Linear fit on dataset I.
sns.lmplot(data=anscombe.query("dataset=='I'"), x='x', y='y');

In [None]:
# Dataset I again, without the confidence band.
sns.lmplot(
    data=anscombe.query("dataset=='I'"),
    x='x', y='y', ci=None,
);

In [None]:
# Linear fit on dataset II, with larger scatter markers (s=80) and no confidence band.
sns.lmplot(
    data=anscombe.query("dataset=='II'"),
    x='x', y='y',
    scatter_kws={'s': 80}, ci=None,
);

In the presence of higher-order relationships in the data, **regplot() and lmplot()** can fit a polynomial regression model to explore non-linear trends.

In [None]:
# Second-order polynomial fit (order=2) on dataset II.
sns.lmplot(
    data=anscombe.query("dataset=='II'"),
    x='x', y='y',
    ci=None, order=2,
);

A different problem is posed by outliers that deviate for some reason other than the main relationship in study.

In [None]:
# Ordinary linear fit on dataset III.
sns.lmplot(
    data=anscombe.query("dataset=='III'"),
    x='x', y='y', ci=None,
);

In the presence of outliers we can use **robust regression**, which uses a different loss function that downweights relatively large residuals.

In [None]:
# Robust regression on dataset III.
sns.lmplot(
    data=anscombe.query("dataset=='III'"),
    x='x', y='y',
    ci=None, robust=True,
);

When the y variable is binary, simple linear regression returns implausible predictions.

In [None]:
# Add a boolean 'big_tip' column to tips (tip is more than 15% of the bill),
# then fit a plain linear regression on it, with slight vertical jitter on the points.
tips["big_tip"] = tips["tip"].div(tips["total_bill"]).gt(.15)
sns.lmplot(data=tips, x='total_bill', y='big_tip', y_jitter=0.03);

The solution in this case is to fit a **logistic regression**, such that the regression line shows the estimated **probability of y = 1 for a given value of x**.

In [None]:
# Logistic regression on the binary big_tip outcome.
sns.lmplot(
    data=tips, x='total_bill', y='big_tip',
    y_jitter=0.03, logistic=True,
);

An altogether different approach is to fit a nonparametric regression using a lowess smoother. This approach has the fewest assumptions, although it is computationally intensive and so currently confidence intervals are not computed at all.

In [None]:
# Non-parametric lowess smoother on the same data.
sns.lmplot(
    data=tips, x='total_bill', y='big_tip',
    y_jitter=0.03, lowess=True,
);

The residplot() function can be a useful tool for checking whether the simple regression model is appropriate for a dataset. It fits and removes a simple linear regression and then plots the residual values for each observation. Ideally, these values should be randomly scattered around y = 0:

In [None]:
# Residual plot for dataset I.
sns.residplot(
    data=anscombe.query("dataset == 'I'"),
    x="x", y="y",
    scatter_kws={"s": 80},
);

In [None]:
# Residual plot for dataset II.
sns.residplot(
    data=anscombe.query("dataset == 'II'"),
    x="x", y="y",
    scatter_kws={"s": 80},
);

## Conditioning on other variables

In [None]:
# Separate regression per smoker group, distinguished by colour.
sns.lmplot(data=tips, x='total_bill', y='tip', hue='smoker');

In [None]:
# Same split, with a distinct marker per group and the 'magma' palette.
sns.lmplot(
    data=tips, x="total_bill", y="tip", hue="smoker",
    palette="magma", markers=["o", "x"],
);

In [None]:
# Add a facet column per meal time.
sns.lmplot(data=tips, x="total_bill", y="tip", hue="smoker", col="time");

In [None]:
# Full grid: rows by sex, columns by meal time, colour by smoker.
sns.lmplot(
    data=tips, x="total_bill", y="tip", hue="smoker",
    row="sex", col="time",
);

## Controlling the size and shape of plot

Earlier we noted that the default plots made by regplot() and lmplot() look the same but sit on axes that have a different size and shape. This is because regplot() is an “axes-level” function that draws onto a specific axes. This means that you can make multi-panel figures yourself and control exactly where the regression plot goes. If no axes object is explicitly provided, it simply uses the “currently active” axes, which is why the default plot has the same size and shape as most other matplotlib functions. To control the size, you need to create a figure object yourself.

In [None]:
# Control the size of a regplot by creating the figure and axes first and passing ax.
f, ax = plt.subplots(figsize=(5, 6))
sns.regplot(data=tips, x="total_bill", y="tip", ax=ax);

In contrast, the size and shape of the lmplot() figure is controlled through the FacetGrid interface using the height and aspect parameters, which apply to each facet in the plot, not to the overall figure itself:

In [None]:
# One facet per day, wrapped two to a row, each facet with height 3.
sns.lmplot(
    data=tips, x="total_bill", y="tip",
    col="day", col_wrap=2, height=3,
);

In [None]:
# One facet per day in a single row, each with aspect ratio 0.5.
sns.lmplot(
    data=tips, x="total_bill", y="tip",
    col="day", aspect=.5,
);

## Plotting a regression in other contexts

In [None]:
# Regression drawn through jointplot (kind="reg").
sns.jointplot(data=tips, x="total_bill", y="tip", kind="reg");

In [None]:
# Regression of tip on each of total_bill and size, side by side.
sns.pairplot(
    data=tips,
    x_vars=["total_bill", "size"], y_vars=["tip"],
    kind="reg", height=5, aspect=.8,
);

In [None]:
# Same pair of regressions, split by smoker status.
sns.pairplot(
    data=tips,
    x_vars=["total_bill", "size"], y_vars=["tip"],
    hue="smoker", kind="reg", height=5, aspect=.8,
);

## Building Structured multi-plots grids
* Tidy dataframe is required
* Useful for huge data intake


In [None]:
# Use the 'ticks' theme for the grid examples below.
sns.set(style='ticks')

## Conditional small multiples

The FacetGrid class is useful when you want to visualize the distribution of a variable or the relationship between multiple variables separately within subsets of your dataset. A FacetGrid can be drawn with up to three dimensions: row, col, and hue. The first two have obvious correspondence with the resulting array of axes; think of the hue variable as a third dimension along a depth axis, where different levels are plotted with different colors.

In [None]:
# Create the grid only (one column per meal time); nothing is mapped onto it yet.
g = sns.FacetGrid(data=tips, col='time');

##### To plot: 
Provide the FacetGrid object with a plotting function and the names of the variables to plot.
Example: comparison of tips given during lunch and dinner.

In [None]:
# Histogram of tips in each meal-time facet.
g = sns.FacetGrid(data=tips, col='time')
g.map(plt.hist, 'tip');

In [None]:
# The same plot written as a single chained expression.
(
    sns.FacetGrid(data=tips, col='time')
    .map(plt.hist, 'tip')
);

In [None]:
# Scatter of tip against total bill per sex, coloured by smoker, with a legend.
(
    sns.FacetGrid(data=tips, col='sex', hue='smoker')
    .map(plt.scatter, 'total_bill', 'tip', alpha=0.7)
    .add_legend()
);

In [None]:
# Smoker-by-time grid with margin titles; regplot is used for the scatter only
# (fit_reg=False), with horizontal jitter on the points.
(
    sns.FacetGrid(data=tips, row='smoker', col='time', margin_titles=True)
    .map(sns.regplot, 'size', 'total_bill',
         color='.3', fit_reg=False, x_jitter=.1)
);

In [None]:
# Narrow facets (height 4, aspect 0.5), one per day, each a bar plot of total bill by sex.
(
    sns.FacetGrid(data=tips, col='day', aspect=.5, height=4)
    .map(sns.barplot, 'sex', 'total_bill', palette='magma')
);

In [None]:
# Day labels in the order returned by value_counts(); used as row_order in the next cell.
ordered_days = tips.day.value_counts().index
ordered_days

In [None]:
# One wide, short row per day in the order computed above, each showing the
# density curve and rug of total_bill.
(
    sns.FacetGrid(data=tips, row='day', row_order=ordered_days,
                  aspect=4, height=1.7)
    .map(sns.distplot, 'total_bill', rug=True, hist=False)
);

In [None]:
# Single-panel grid coloured by meal time with the 'magma' palette.
(
    sns.FacetGrid(data=tips, hue='time', height=7, palette='magma')
    .map(plt.scatter, 'total_bill', 'tip')
    .add_legend()
);

In [None]:
# hue_kws supplies one marker per hue level.
(
    sns.FacetGrid(data=tips, hue='sex', height=5.5, palette='Set1',
                  hue_kws={'marker': ['^', 'v']})
    .map(plt.scatter, 'total_bill', 'tip')
    .add_legend()
);

When a variable has many levels, we can plot it along the columns but **wrap** the facets so that they span multiple rows. The **row** keyword cannot be used together with wrapping.

In [None]:
# seaborn's 'attention' example dataset, restricted to subjects 1-12.
attend = sns.load_dataset('attention', index_col=0).query("subject <= 12")
# Preview only the first rows, as done for the other datasets in this notebook,
# instead of dumping the whole frame.
attend.head()

In [None]:
# One small facet per subject, wrapped four to a row, with a shared y range of 0-10.
(
    sns.FacetGrid(data=attend, col='subject', col_wrap=4,
                  ylim=(0, 10), height=2)
    .map(sns.pointplot, 'solutions', 'score',
         color='.3', order=[1, 2, 3], ci=None)
);