In [None]:
# Presentation-only tweak: enlarges the font of code lines and DataFrame tables.
# Please ignore!
from IPython.display import display, HTML

display(HTML(
    "<style>\n"
    ".cm-line { font-size: large !important; }\n"
    ".dataframe tbody tr th { font-size: large !important; }\n"
    ".dataframe tbody tr { font-size: large !important; }\n"
    ".dataframe thead th { font-size: large !important; }\n"
    ".dataframe thead { font-size: large !important; }\n"
    "</style>"
))

# Let's install numpy and pandas

In [None]:
# Execute this cell...  (or use conda)
!pip3 install numpy pandas

# Numpy  (https://numpy.org)

- a library for scientific computing
- n-dimensional arrays (lists of lists of lists)
- fast and memory-optimised, written in C
- corner-stone of many other libraries
- many scientific functions

![image.png](attachment:image.png)

## Importing Numpy

In [None]:
# everybody imports it like this:
import numpy as np
np.__version__

# Lists

In [None]:
# A plain Python list first, for comparison with numpy -- very much the same... right?
py_list = list(range(1, 6))

# show the whole list
print("The list:", py_list)

# indexing is zero-based, so index 2 is the third element
print("Third Element:", py_list[2])


## in numpy...

In [None]:
# Build a numpy array from the Python list defined in the previous cell
np_arr = np.array(py_list)

# print it...
print("The array:", np_arr)

# access: same zero-based indexing as for the list
print("Third Element:", np_arr[2])

### Business as usual, right?

# Not quite! Advanced indexing and slicing!

(numpy allows multi-indexing)


![image.png](attachment:image.png)

*Source: https://betterprogramming.pub/numpy-illustrated-the-visual-guide-to-numpy-3b1d4976de1d*

## Lists of Lists

In [None]:
# A 3x3 "matrix" represented as nested Python lists
twoD_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

# print it...
print("List of Lists:\n", twoD_list)

# access: chained indexing -- first pick the row (index 2), then the element (index 1)
print("Second of third:", twoD_list[2][1])

# But what if we want a sub-matrix?
# How about slicing the 2nd and 3rd element of all rows?
# Nested lists cannot be sliced across rows in one step,
# so every row has to be sliced separately.
sub = [row[1:] for row in twoD_list]
print("Sub-lists\n", sub)  # each row[1:] is itself a new list

## 2D Arrays instead

In [None]:
# Convert the nested list into a 2D numpy array
twoD_arr = np.array(twoD_list)

# print it... (labelled as an array: this is an ndarray, not a list)
print("2D-Array:\n", twoD_arr)

# access via multi-index: one pair of brackets, row and column separated by a comma
print("Second of third:", twoD_arr[2, 1])

# How about slicing out a sub-matrix?
# Each axis takes an index of the form start[:end[:step]];
# here: all rows, and the columns from index 1 onwards
print("Sub-matrix:\n", twoD_arr[:, 1:])

## Let's practice!
`start[:end[:step]]`   
Shorthand: `[::]` = `[:]` = `[0:len(arr):1]` (note: an end of `-1` would stop *before* the last element)  
But: can also use lists e.g. `arr[[1,3,5]]`

In [None]:
# Build a 7x7 array holding the numbers 0..48, filled row by row
s_by_s = np.arange(7 * 7).reshape(7, 7)
print(s_by_s)

In [None]:
# Exercise: from s_by_s select
#   - the two rows that start with 14 and 21
#   - every second column, starting from column 1

# What else?

## More levels?

In [None]:
# Easier list / matrix creation: 27 consecutive integers arranged as 3 blocks of 3x3
threeD_arr = np.arange(27).reshape((3, 3, 3))
threeD_arr

In [None]:
# What will this produce?
# (hint: ":" keeps the whole first axis, 1 picks a single index on the second axis)
threeD_arr[:,1]

In [None]:
# and this?
# (hint: the first two axes are kept whole, 2 picks a single index on the last axis)
threeD_arr[:,:,2]

## Conveniently apply functions on arrays

In [None]:
# construct a 2D array: the 20 multiples of 5 from 0 to 95, stored as 8-bit integers
twenty_ints = np.arange(0, 100, 5, dtype=np.int8)
# -1 lets numpy work out the number of columns itself (20 values / 4 rows = 5)
twoD_arr = twenty_ints.reshape((4, -1))
twoD_arr

In [None]:
twoD_arr.mean()  # no axis given: mean over all elements -> a single number

In [None]:
twoD_arr.mean(axis=0)  # axis 0 (the rows) is collapsed: one mean per column

In [None]:
twoD_arr.mean(axis=1) # axis 1 (the columns) is collapsed: one mean per row

## How about using booleans to address?

In [None]:
# Create a random 5x5 array of integers in [0, 100).
# A seeded generator makes the numbers identical on every run, so the
# outputs below can be compared and discussed reliably.
rng = np.random.default_rng(42)
# NOTE: the name `random` is kept because the following cells refer to it.
random = rng.integers(0, 100, size=(5, 5))
random

In [None]:
# Element-wise comparison: gives a boolean array of the same shape (True where the value is above 50)
random > 50

In [None]:
# Boolean array as index: only the positions where the mask is True are overwritten.
# This changes `random` in place, so re-running the cells above gives a fresh array.
random[random > 50] = 0
random

## Many more functions available

- Array creation `np.empty()`, `np.zeros()`, `np.ones()`
- Binary operations
- Logic functions
- String operations
- Date operations

- Data types + sizes (e.g. `np.uint8`, `np.float32`) 
- Random sampling
- Linear algebra & matrices

See also https://numpy.org/doc/stable/reference/index.html

## Questions ?

# Pandas  (https://pandas.pydata.org/)

- data series and data frames (think: tables with rows and columns)
- VERY widespread, especially in data science
- explore, filter, merge, clean, modify, plot data

"Excel + SQL + ..." (but for ~~nerds~~ programmers)

In [None]:
import pandas as pd
pd.__version__

## Let's import some example data...

In [None]:
# Load the example table from a CSV file (the path is relative to the working directory).
# Later cells use the columns District, Bundesland, Area, Population and Towns.
df = pd.read_csv("data/Bezirke.csv")
df

# Exploring Data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column
df.describe()

## Single column statistics

In [None]:
# Attribute access (df.Population) picks a single column; describe() then summarises just that column
df.Population.describe()

## Plotting Data

In [None]:
# Scatter plot of two columns; assigning to "_" hides the textual repr of the returned plot object
_ = df.plot.scatter(x="Area", y="Population")

In [None]:
# Same kind of plot with a different x column; c sets the colour of the points
_ = df.plot.scatter(x="Towns", y="Population", c="orange")

In [None]:
# Histogram of the Population column.
# NOTE(review): the table is loaded from Bezirke.csv and has a District column, so each
# row is presumably one district -- title and axis label therefore talk about district
# populations rather than "town sizes". Confirm against the data.
_ = df.Population.plot.hist(title="Histogram of district populations", xlabel="Population")

## Boxplots (for statistics)

In [None]:
# Box plot of the Area column; assigning to "_" hides the textual repr of the returned plot object
_ = df.Area.plot.box()

## Project and Select (think of Databases)

In [None]:
# Project (select columns): indexing with a LIST of column names
# returns a DataFrame containing just those columns, in the given order
df[["District", "Area"]]

## Select a few lines

In [None]:
# .loc slices by index LABEL and includes both end points:
# with the default 0, 1, 2, ... index created by read_csv this returns six rows (labels 0 to 5)
df.loc[0:5] 

Note: `.iloc[]` also exists, but we won't need it here... 

## Filtering Data based on attributes


In [None]:
# Select where ... : comparing a column with a value gives a boolean Series,
# True for every row whose Bundesland equals "Niederösterreich"
df.Bundesland == "Niederösterreich"

In [None]:
# Indexing the DataFrame with a boolean Series keeps only the rows where it is True
df[df.Bundesland == "Niederösterreich"]

# Apply Aggregation Functions to DF

In [None]:
# Column-wise sum; numeric_only=True leaves out the non-numeric (text) columns
df.sum(numeric_only=True)

In [None]:
# Column-wise mean; numeric_only=True leaves out the non-numeric (text) columns
df.mean(numeric_only=True)

# GroupBy

In [None]:
# Group the rows by Bundesland, then sum the Area values within each group
df.groupby(by="Bundesland").Area.sum()

In [None]:
# Summary statistics of Area within each Bundesland (one result row per group)
df.groupby(by="Bundesland").Area.describe()

In [None]:
# Number of non-missing Area values in each Bundesland group
df.groupby(by="Bundesland").Area.count()

# Apply functions & create new columns

In [None]:
# New column from vectorised arithmetic: the whole Area column is multiplied at once.
# NOTE(review): the factor 100 presumably converts km² to hectares -- confirm the unit of Area.
# Assigning to a column name adds the column to df itself (or overwrites it if it exists).
df["AreaHektar"] = df.Area * 100
df

In [None]:
# Same values as `df.Area * 100`, but computed by calling the lambda once per element.
# .apply is the general tool for custom functions; for simple arithmetic the
# direct column expression is shorter.
df["AreaHektar"] = df.Area.apply(lambda x : x * 100)
df

# Once you know the basics...

- Check out plotting with [Seaborn](https://seaborn.pydata.org/) / [Plotly](https://plotly.com/python/) / [hvPlot](https://hvplot.holoviz.org/)
- statistics (using [scipy](https://scipy.org/))
- pivoting, melting, ...
- merging data frames, using (live) data sources
- apply complex functions to dataframes
- query data

# Questions ?