In [None]:
%%javascript
// Load the table-of-contents helper script from a remote URL and execute it in the notebook page.
// NOTE(review): relies on `$` (presumably jQuery) being provided by the notebook front end and on
// network access -- confirm that it works in the target environment.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

# Notebook setup

In [None]:
# only for the notebook
%matplotlib inline
# only in the ipython shell
# %matplotlib
%config InlineBackend.figure_format='svg' #Makes the plots a vector graphic in the inline display
import matplotlib.pyplot as plt
# Make the size and fonts larger for this presentation
plt.rcParams['figure.figsize'] = (10, 8)
plt.rcParams['font.size'] = 16
plt.rcParams['lines.linewidth'] = 2
import numpy as np

# `Uncertainties` module

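The `uncertainties` module offers easy error propagation that works with almost all `numpy` functions. It uses linear error propagation (the derivatives are computed automatically), so independent errors are added in quadrature; the result can be cross-checked with a Monte-Carlo simulation, as done below.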

Install the `uncertainties` module with `pip install uncertainties`.

In [None]:
from uncertainties import ufloat

In [None]:
x = ufloat(5, 1)
y = ufloat(3, 1)

x + y

In [None]:
# Monte-Carlo cross-check of the propagated uncertainty of x + y.
# Draw all samples with one vectorized call each (the `size` argument)
# instead of 100000 scalar calls inside a Python list comprehension.
n_samples = 100000
x = np.random.normal(5, 1, size=n_samples)
y = np.random.normal(3, 1, size=n_samples)
print(x.mean(), x.std())
print(y.mean(), y.std())
# For independent samples the spread of the sum should be close to sqrt(1**2 + 1**2).
z = x + y
print(z.mean(), z.std())

Correlations are respected

In [None]:
x = ufloat(3, 1)
y = ufloat(3, 1)

print(x - y)
print(x - x) # error is zero!

print(x == y)


To use numpy functions, there is an implementation of many `numpy` functions in `uncertainties` called `unumpy`

In [None]:
from uncertainties import unumpy as unp

In [None]:
x = [1, 2, 3, 4, 5]
err = [0.1, 0.3, 0.1, 0.8, 1.0]

y = unp.uarray(x, err)

unp.cos(unp.exp(y))

Access the central value and the uncertainty with `n` and `s`:

In [None]:
x = ufloat(5, 1)
print(x.n)
print(x.s)

In `unumpy` with `nominal_values` and `std_devs`

In [None]:
x = unp.uarray([1, 2, 3], [0.3, 0.3, 0.1])
print(unp.nominal_values(x))
print(unp.std_devs(x))

... or make a shortcut

In [None]:
from uncertainties.unumpy import (nominal_values as noms,
                                  std_devs as stds)

print(noms(x))
print(stds(x))

## Correlated values

In [None]:
x = np.array([90, 60, 45, 100, 15, 23, 52, 30, 71, 88])
y = np.array([90, 71, 65, 100, 45, 60, 75, 85, 100, 80])

plt.plot(x, y, 'ro')
plt.xlabel('x')
plt.ylabel('y')
plt.show()

The values look linearly correlated; this guess can be checked with the correlation coefficient:

$$ r = \frac{\mathrm{cov}(x, y)}{\sigma_x \sigma_y}\:,\quad -1 \leq r \leq 1 $$

In [None]:
# Pearson correlation coefficient, computed by hand from the centred samples.
x_mean, y_mean = np.mean(x), np.mean(y)
dx, dy = x - x_mean, y - y_mean

# r = sum(dx * dy) / sqrt(sum(dx^2) * sum(dy^2))
numerator = np.sum(dx * dy)
denominator = np.sqrt(np.sum(dx**2) * np.sum(dy**2))
corr_coeff = numerator / denominator
print(corr_coeff)

Generate correlation of values with `correlated_values`:

In [None]:
from uncertainties import correlated_values

values = [1, 2]

cov = [[0.5, 0.25],
       [0.25, 0.2]]

x, y = correlated_values(values, cov) 

In [None]:
 x + y

BUT: Cannot use ufloats in plots!

In [None]:
x = np.linspace(0, 10)
y = unp.uarray(np.linspace(0, 5), 1)

# Deliberate failure demo: matplotlib needs plain floats and cannot convert
# ufloat objects. The conversion is expected to fail with a TypeError; catch
# and display it so that "Restart & Run All" does not stop at this cell.
try:
    plt.plot(x, y, 'rx')
except TypeError as exc:
    print('Plotting ufloats failed:', exc)

In [None]:
x = np.linspace(0, 10)
y = unp.uarray(np.linspace(0, 5), 1)

plt.errorbar(x, unp.nominal_values(y), yerr=unp.std_devs(y), fmt='rx')

# `Pandas` (where the real magic happens)

![Pandas](https://pandas.pydata.org/pandas-docs/stable/_static/pandas.svg)

- __Great__ data analysis toolkit
- makes io easy, presents complicated data in a readable format
- inbuilt plotting routines from `matplotlib`

In [None]:
import pandas as pd

In [None]:
# Some example data: a 10x10 array of uniform random numbers in [-1, 1)
data = np.random.uniform(-1, 1, size=(10, 10))

In [None]:
pd.DataFrame(data=data)

Let's give the columns some names:

In [None]:
# Name the columns col1 ... colN, one label per column of `data`.
labels = ['col{}'.format(num) for num in range(1, len(data.T) + 1)]
pd.DataFrame(data, columns=labels)

and the rows (not recommended)

In [None]:
# Name the rows row1 ... rowM, one label per row of `data`.
r_labels = ['row{}'.format(num) for num in range(1, len(data) + 1)]
pd.DataFrame(data, index=r_labels, columns=labels)

Easily access the columns / rows

In [None]:
df = pd.DataFrame(data=data, columns = labels, index = r_labels)
print(df.col1,'\n')
print(df['col1'], '\n')
print(df.loc['row1'], '\n')

In [None]:
print(list(df['col1']), '\n')
print(list(df.loc['row1']), '\n')

In [None]:
print(dict(df['col1']), '\n')
print(dict(df.loc['row1']), '\n')

Get column / row names

In [None]:
df.keys()

In [None]:
df.index

Access a row by its integer position with `iloc` (recommended, because it is MUCH faster for large data sets)

In [None]:
df.iloc[0]

Use first or last n lines with `head` or `tail`

In [None]:
df.head(5)

In [None]:
df.tail(5)

Delete columns or rows

In [None]:
df.drop('row1', axis=0)

In [None]:
df.drop('col1', axis=1)

insert new column

In [None]:
# Insert a demo column of ones at position 6.
# `DataFrame.insert` mutates `df` in place and raises a ValueError when the
# column already exists, so guard the call to keep this cell re-runnable.
# The column length follows the frame instead of a hard-coded 10.
if 'col6.5' not in df.columns:
    df.insert(6, column='col6.5', value=np.ones(len(df)))
df

In [None]:
df = df.drop('col6.5', axis=1)
df

You can also automagically read `.csv` files for instance or automagically read numpy recarrays

In [None]:
df_csv = pd.read_csv('data.csv') #from numpy notebook
df_csv

You can also create a `DataFrame` from a dictionary of lists, a list of dictionaries, `.hdf5` files etc. $\Rightarrow$ read the docs: <https://pandas.pydata.org/>

Let's read in some data saved as a `numpy recarray`

In [None]:
data_rec = np.load('data_recarray.npy')
data_rec

Data like this can be processed quickly making use of `numpy`'s powers, but human readability is a mess

let's convert to a pandas `DataFrame`

In [None]:
df_rec = pd.DataFrame(data=data_rec)
df_rec

The field names stored in the `dtype` of the `recarray` (the last line of its printed representation, which describes the different datatypes) are automagically used as the column labels!

Display all columns, just to easily see what's in it:

In [None]:
# Show up to 100 columns / rows before pandas truncates the output.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
# `None` means "do not truncate cell contents". The old spelling `-1` is
# deprecated since pandas 1.0 and rejected by newer pandas versions.
pd.set_option('display.max_colwidth', None)

In [None]:
df_rec.head(10)

Marvellous!

## Built in plot functions

In [None]:
df_rec.plot('posX', 'posY', label='particle tracks')
plt.xlabel(r'$x$')
plt.ylabel(r'$y$')
plt.grid()

In [None]:
# df_rec.plot.scatter('momDirX', 'momDirY')
# plt.xlabel(r'$x$')
# plt.ylabel(r'$y$')
# plt.grid()

In [None]:
df_rec.hist('posZ', density=True, )

## `numpy` magic

i.e. all `numpy array` methods work

In [None]:
df_rec['edep'].mean()

In [None]:
df_rec['edep'].std()

In [None]:
df_rec['edep'].sum()

In [None]:
# A boolean mask passed to .loc needs exactly one entry per row, so a
# five-element list only works for a frame with exactly five rows.
# Build a full-length mask that selects rows 0, 1 and 4 (the same rows the
# short list picked).
mask = np.zeros(len(df_rec), dtype=bool)
mask[[0, 1, 4]] = True
df_rec.loc[mask]

In [None]:
df_rec.loc[[1, 0, 3]] #if indices are strings, use .iloc[] for numbers

Fast sorting

In [None]:
df_rec.nlargest(10, 'edep')

In [None]:
%%timeit
df_rec.nsmallest(10, 'edep')

In [None]:
%%timeit
df_rec.sort_values('PDGEncoding')

In [None]:
df_rec.sort_values('PDGEncoding')

Use `numpy` mathematical functions for single columns / rows or the whole `DataFrame`:

In [None]:
# Apply the numpy ufunc directly to the selected columns: this is vectorized,
# whereas `apply(..., axis=1)` calls the lambda once per row in Python.
np.cos(df[['col1', 'col7']])

In [None]:
# numpy ufuncs accept a whole DataFrame and return a DataFrame with the same
# index and columns, so no `apply` / lambda is needed.
np.exp(df)

... and it's lightning fast!!! making use of `numpy`'s powers

Save `DataFrames` as binary objects in a `.pickle` file

In [None]:
import pickle

# Serialize the DataFrame to a binary pickle file.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('df_rec.pickle', 'wb') as f:
    pickle.dump(df_rec, f)

In [None]:
# Read the DataFrame back from the pickle file.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# you created yourself or otherwise trust.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('df_rec.pickle', 'rb') as f:
    df_rec_reloaded = pickle.load(f)

In [None]:
df_rec_reloaded

Quick statistical summaries:

In [None]:
df_rec_reloaded.describe()

# `Sympy` symbolic python

![sympy](https://www.sympy.org/static/images/logo.png)

Can calculate __analytically__ derivatives, integrals etc. like in `mathematica` or `matlab`

Many scientific code projects make use of it: <https://www.sympy.org/en/index.html>

BUT: Use with caution, for complicated integrals which are only defined under certain conditions or piecewise the results can be inconsistent (Happened to me a couple of times)

In [None]:
import sympy

In [None]:
x, y, z = sympy.var('x y z')

x + y + z

In [None]:
f = x + y**3 - sympy.cos(z)**2

print(f.diff(x))
print(f.diff(y))
print(f.diff(z))
print(f.diff(z, z, z))

Can generate $\LaTeX$ code of the formulas, example gaussian error propagation:

In [None]:
def error(f, err_vars=None):
    r"""Return LaTeX code for the Gaussian error propagation of ``f``.

    Builds ``sqrt(sum_v (df/dv)**2 * sigma_v**2)`` over the chosen variables
    and renders it as LaTeX.

    Parameters
    ----------
    f : sympy expression
        Expression whose uncertainty is propagated.
    err_vars : iterable of sympy symbols, optional
        Variables that carry an uncertainty. Defaults to all free symbols
        of ``f``.

    Returns
    -------
    str
        LaTeX source of the propagated uncertainty, in which the uncertainty
        of a variable ``v`` is written as ``\sigma_{v}``.
    """
    from sympy import Symbol, latex, sqrt

    # Identity check: `== None` would invoke the argument's own `__eq__`.
    if err_vars is None:
        err_vars = f.free_symbols

    s = 0
    latex_names = dict()
    for v in err_vars:
        # Placeholder symbol for the uncertainty of v; it is printed as
        # \sigma_{v} through the `symbol_names` mapping below.
        err = Symbol('latex_std_' + v.name)
        s += f.diff(v)**2 * err**2
        latex_names[err] = '\\sigma_{' + latex(v) + '}'

    return latex(sqrt(s), symbol_names=latex_names)

E, q, r = sympy.var('E_x q r')

f = E + q**2 * r

print(f)
print(error(f))

Many functions to simplify or manipulate formulas

$ \sigma_f = \sqrt{\sigma_{E_{x}}^{2} + 4 \sigma_{q}^{2} q^{2} r^{2} + \sigma_{r}^{2} q^{4}}$

In [None]:
from sympy import symbols, simplify, expand, factor, collect, cancel, apart, diff, cos, exp, integrate, limit, sin, Function, oo

In [None]:
x, y, z, t = symbols('x y z t')
k, m, n = symbols('k m n', integer=True)
f, g, h = symbols('f g h', cls=Function)

In [None]:
simplify((x**3 + x**2 - x - 1)/(x**2 + 2*x + 1))

In [None]:
expand((x+1)**2)

In [None]:
expand((x+2)*(x-3))

In [None]:
expand((x+1)*(x-2) - (x-1)*x)

In [None]:
factor(x**2*z + 4*x*y*z + 4*y**2*z)

In [None]:
expr = x*y + x - 3 + 2*x**2 - z*x**2 + x**3
collect(expr, x)

In [None]:
cancel((x**2 + 2*x + 1)/(x**2 + x))

Analytical partial fraction decomposition

In [None]:
expr = (4*x**3 + 21*x**2 + 10*x + 12)/(x**4 + 5*x**3 + 5*x**2 + 4*x)
apart(expr)

Many inbuilt mathematical functions like `log`, `sin`, `cos` etc

In [None]:
diff(cos(x), x)

Integrate with or without limits

In [None]:
integrate(cos(x), x)

In [None]:
integrate(exp(-x), (x, 0, oo))

In [None]:
integrate(exp(-x**2), x)

In [None]:
integrate(exp(-x**2 -y**2), x, (y, -oo, oo))

Find limiting behaviour $\lim_{x\to0} \frac{\sin(x)}{x}$

In [None]:
limit(sin(x)/x, x, 0)

Calculate Taylor series

In [None]:
expr = exp(sin(x))
expr.series(x, 0, 10)

In [None]:
expr.series(x, 0, 10).removeO()

Substitutions

In [None]:
expr.subs(x, y**7 + z/x)