# Array, Matrices, and Data Handling (with Numpy and Pandas)

## Part 2 (More Numpy + Introducing Pandas)

In this part, we learn some commands and functions useful for your programming. There are too many functions to cover in this lecture, so we will learn a core set of functions. Then, we add one more library to Python, Pandas, which is useful for data analytics.

## Contents

- Functions and Methods/Properties
- Convenient functions
- Special Arrays
- NaN and Inf
- Sorting and Extreme Values

In [1]:
import numpy as np

In [2]:
#linspace(a,b,n) produces a set of n points with an equal distance between a and b.

x=np.linspace(0,10,21)

In [3]:
x

array([ 0. ,  0.5,  1. ,  1.5,  2. ,  2.5,  3. ,  3.5,  4. ,  4.5,  5. ,
        5.5,  6. ,  6.5,  7. ,  7.5,  8. ,  8.5,  9. ,  9.5, 10. ])

In [4]:
#logspace(a,b,n) is similar to linspace(a,b,n), except that it makes an array of n points between 10**a and 10**b (evenly spaced on a log scale)

logx = np.logspace(0,1,11)

In [5]:
logx

array([ 1.        ,  1.25892541,  1.58489319,  1.99526231,  2.51188643,
        3.16227766,  3.98107171,  5.01187234,  6.30957344,  7.94328235,
       10.        ])

## arange(a,b,s)

arange(a,b,s) creates an array of numbers between a and b (b not included) with a step size $s$. So, the main difference between linspace and arange lies in how the spacing is specified: linspace takes the number of points, whereas arange takes the distance between points. If only one input is provided for arange, say arange(b), this refers to arange(0,b,1).

In [6]:
x = np.arange(11.5)

In [7]:
x

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.])

In [8]:
x = np.arange(12)

In [9]:
x

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [10]:
x=np.arange(1,10,.3)
x

array([1. , 1.3, 1.6, 1.9, 2.2, 2.5, 2.8, 3.1, 3.4, 3.7, 4. , 4.3, 4.6,
       4.9, 5.2, 5.5, 5.8, 6.1, 6.4, 6.7, 7. , 7.3, 7.6, 7.9, 8.2, 8.5,
       8.8, 9.1, 9.4, 9.7])

# You can create a mesh grid

In [11]:
x = np.arange(5)
y = np.arange(3)

In [12]:
X,Y = np.meshgrid(x,y)

In [13]:
X

array([[0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4]])

In [14]:
Y

array([[0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1],
       [2, 2, 2, 2, 2]])

In [15]:
np.r_[0:10:.5] # arange equivalent

array([0. , 0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5, 6. ,
       6.5, 7. , 7.5, 8. , 8.5, 9. , 9.5])

In [16]:
np.r_[0:10:0.3]

array([0. , 0.3, 0.6, 0.9, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7, 3. , 3.3, 3.6,
       3.9, 4.2, 4.5, 4.8, 5.1, 5.4, 5.7, 6. , 6.3, 6.6, 6.9, 7.2, 7.5,
       7.8, 8.1, 8.4, 8.7, 9. , 9.3, 9.6, 9.9])

In [17]:
# Additional related commands to check
# c_, ix_, mgrid, ogrid,...

In [18]:
# Can we generate random (normal distribution) array?

x = np.random.randn(2,10)
x

array([[-0.50673698, -0.25009892, -0.01920054,  0.52970537,  0.61354357,
         0.96921989,  1.10491053, -0.27216524, -2.02621729, -1.06230206],
       [-0.67587975, -4.70754784,  1.43732375,  1.67784936,  0.53910777,
         0.11571398, -0.41932509,  1.68539043, -0.37220132, -0.21367025]])

## You can create random samples from various other distributions:
https://numpy.org/doc/1.16/reference/routines.random.html

In [19]:
#random permutation of x drawn above
np.random.permutation(x)

array([[-0.67587975, -4.70754784,  1.43732375,  1.67784936,  0.53910777,
         0.11571398, -0.41932509,  1.68539043, -0.37220132, -0.21367025],
       [-0.50673698, -0.25009892, -0.01920054,  0.52970537,  0.61354357,
         0.96921989,  1.10491053, -0.27216524, -2.02621729, -1.06230206]])

In [20]:
np.sum(x)

-1.8525806217517637

In [21]:
x.sum()

-1.8525806217517637

In [22]:
np.sum(x,1)

array([-0.91934168, -0.93323894])

In [23]:
x.sum(1)

array([-0.91934168, -0.93323894])

In [24]:
np.sum(x,0)

array([-1.18261673, -4.95764676,  1.41812321,  2.20755474,  1.15265134,
        1.08493387,  0.68558544,  1.41322519, -2.39841861, -1.27597231])

In [25]:
np.cumsum(x,0)

array([[-0.50673698, -0.25009892, -0.01920054,  0.52970537,  0.61354357,
         0.96921989,  1.10491053, -0.27216524, -2.02621729, -1.06230206],
       [-1.18261673, -4.95764676,  1.41812321,  2.20755474,  1.15265134,
         1.08493387,  0.68558544,  1.41322519, -2.39841861, -1.27597231]])

In [26]:
np.cumsum(x,1)

array([[-0.50673698, -0.7568359 , -0.77603645, -0.24633107,  0.3672125 ,
         1.33643238,  2.44134291,  2.16917767,  0.14296038, -0.91934168],
       [-0.67587975, -5.38342758, -3.94610383, -2.26825447, -1.7291467 ,
        -1.61343271, -2.0327578 , -0.34736738, -0.71956869, -0.93323894]])

In [27]:
# As methods,...
x.cumsum(1)

array([[-0.50673698, -0.7568359 , -0.77603645, -0.24633107,  0.3672125 ,
         1.33643238,  2.44134291,  2.16917767,  0.14296038, -0.91934168],
       [-0.67587975, -5.38342758, -3.94610383, -2.26825447, -1.7291467 ,
        -1.61343271, -2.0327578 , -0.34736738, -0.71956869, -0.93323894]])

In [28]:
x.cumsum(0)

array([[-0.50673698, -0.25009892, -0.01920054,  0.52970537,  0.61354357,
         0.96921989,  1.10491053, -0.27216524, -2.02621729, -1.06230206],
       [-1.18261673, -4.95764676,  1.41812321,  2.20755474,  1.15265134,
         1.08493387,  0.68558544,  1.41322519, -2.39841861, -1.27597231]])

In [29]:
x

array([[-0.50673698, -0.25009892, -0.01920054,  0.52970537,  0.61354357,
         0.96921989,  1.10491053, -0.27216524, -2.02621729, -1.06230206],
       [-0.67587975, -4.70754784,  1.43732375,  1.67784936,  0.53910777,
         0.11571398, -0.41932509,  1.68539043, -0.37220132, -0.21367025]])

In [30]:
np.diff(x)

array([[ 0.25663806,  0.23089838,  0.54890592,  0.08383819,  0.35567632,
         0.13569064, -1.37707577, -1.75405205,  0.96391524],
       [-4.03166809,  6.14487159,  0.24052561, -1.13874159, -0.42339379,
        -0.53503907,  2.10471552, -2.05759174,  0.15853107]])

In [31]:
np.diff(x,axis=0)

array([[-0.16914277, -4.45744891,  1.45652429,  1.14814399, -0.07443579,
        -0.8535059 , -1.52423562,  1.95755567,  1.65401598,  0.84863181]])

In [32]:
np.diff(x,1,axis=0)

array([[-0.16914277, -4.45744891,  1.45652429,  1.14814399, -0.07443579,
        -0.8535059 , -1.52423562,  1.95755567,  1.65401598,  0.84863181]])

## Sorting?

In [33]:
x = np.random.randn(5,3)

In [34]:
x

array([[-1.99571195, -1.44261688,  2.11716732],
       [ 0.58139515,  0.23030174,  0.73083982],
       [ 1.10361846, -0.25402342, -0.62177902],
       [-0.16136988,  0.16125194, -0.39747914],
       [-1.38981832, -0.90989114, -0.97725964]])

In [35]:
np.sort(x)

array([[-1.99571195, -1.44261688,  2.11716732],
       [ 0.23030174,  0.58139515,  0.73083982],
       [-0.62177902, -0.25402342,  1.10361846],
       [-0.39747914, -0.16136988,  0.16125194],
       [-1.38981832, -0.97725964, -0.90989114]])

In [36]:
np.sort(x,0)

array([[-1.99571195, -1.44261688, -0.97725964],
       [-1.38981832, -0.90989114, -0.62177902],
       [-0.16136988, -0.25402342, -0.39747914],
       [ 0.58139515,  0.16125194,  0.73083982],
       [ 1.10361846,  0.23030174,  2.11716732]])

In [37]:
np.sort(x,axis=None)

array([-1.99571195, -1.44261688, -1.38981832, -0.97725964, -0.90989114,
       -0.62177902, -0.39747914, -0.25402342, -0.16136988,  0.16125194,
        0.23030174,  0.58139515,  0.73083982,  1.10361846,  2.11716732])

## Nan Functions

In [38]:
x = np.random.randn(5)
x

array([-2.46492526, -0.24485475,  1.96747993,  0.6757862 , -0.59534472])

In [39]:
x[1]= np.nan #You can add a nan number using this way.
x

array([-2.46492526,         nan,  1.96747993,  0.6757862 , -0.59534472])

In [40]:
np.sum(x)

nan

In [41]:
np.nansum(x)

-0.417003845039024

In [42]:
sum(x[np.logical_not(np.isnan(x))]) # A verbose, alternative way

-0.417003845039024

In [43]:
np.isnan(x) # What does 'isnan' do?

array([False,  True, False, False, False])

In [44]:
np.mean(x)

nan

In [45]:
np.nanmean(x)

-0.104250961259756

In [46]:
np.nancumsum(x)

array([-2.46492526, -2.46492526, -0.49744533,  0.17834087, -0.41700385])

Also check nanmax, nanargmax, nanmin, nanargmin, etc.

# There are useful functions that generate arrays we often need to build

## ones
## zeros
## empty
## eye, identity



In [47]:
np.ones((3,3))

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [48]:
np.zeros((3,3))

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [49]:
np.empty((2,2))

array([[-2.46492526,  1.96747993],
       [ 0.6757862 , -0.59534472]])

In [50]:
np.eye(5)

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])

## Reshaping arrays
You can reshape or resize ndarray objects. Reshape function provides another view on the same data points, whereas Resize function produces a new object.


In [51]:
X = np.arange(16)

In [52]:
X

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])

In [53]:
X[1] =20 # Remember: you can modify individual elements of an array in place.
X

array([ 0, 20,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])

In [54]:
X.shape

(16,)

In [55]:
X = np.reshape(X,(2,8))
X

array([[ 0, 20,  2,  3,  4,  5,  6,  7],
       [ 8,  9, 10, 11, 12, 13, 14, 15]])

In [56]:
X2 = np.resize(X,(3,3))
X2

array([[ 0, 20,  2],
       [ 3,  4,  5],
       [ 6,  7,  8]])

In [57]:
X3 = np.resize(X,(2,5))
X3

array([[ 0, 20,  2,  3,  4],
       [ 5,  6,  7,  8,  9]])

In [58]:
X4 = np.resize(X,(5,5))
X4

array([[ 0, 20,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15,  0, 20,  2,  3],
       [ 4,  5,  6,  7,  8]])

# Linear algebra functions

Note: Not all, but some functions use the following format (numpy.linalg.XXX)

## T or transpose
## diag
## triu, tril
## linalg.svd

## lstsq
## cholesky
## det
## eig
## inv
## trace
## kron
## matrix_rank

In [59]:
X

array([[ 0, 20,  2,  3,  4,  5,  6,  7],
       [ 8,  9, 10, 11, 12, 13, 14, 15]])

In [60]:
X.T # transpose

array([[ 0,  8],
       [20,  9],
       [ 2, 10],
       [ 3, 11],
       [ 4, 12],
       [ 5, 13],
       [ 6, 14],
       [ 7, 15]])

In [61]:
#Alternatively,
X.transpose()

array([[ 0,  8],
       [20,  9],
       [ 2, 10],
       [ 3, 11],
       [ 4, 12],
       [ 5, 13],
       [ 6, 14],
       [ 7, 15]])

In [62]:
np.diag(X4) # Diagonal elements

array([ 0,  6, 12,  2,  8])

In [63]:
np.triu(X4) #Upper triangular matrix

array([[ 0, 20,  2,  3,  4],
       [ 0,  6,  7,  8,  9],
       [ 0,  0, 12, 13, 14],
       [ 0,  0,  0,  2,  3],
       [ 0,  0,  0,  0,  8]])

In [64]:
np.tril(X) #Lower triangular matrix

array([[0, 0, 0, 0, 0, 0, 0, 0],
       [8, 9, 0, 0, 0, 0, 0, 0]])

In [65]:
X = X.reshape((4,4))
X

array([[ 0, 20,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [66]:
y =np.reshape(np.random.randn(128,1),(32,4))

In [67]:
u, s, vh = np.linalg.svd(y) # singular value decomposition X=USV'

In [68]:
np.size(u)

1024

In [69]:
s

array([6.86400175, 5.84625514, 4.90807011, 4.39638034])

In [70]:
vh

array([[ 0.48434657,  0.85941367,  0.14922706,  0.06743757],
       [-0.29774866,  0.33727254, -0.7770301 , -0.44024673],
       [-0.1541849 ,  0.0560803 ,  0.5481064 , -0.82015937],
       [ 0.80807249, -0.3801452 , -0.27117315, -0.35912895]])

In [71]:
x = np.array([[1,.5],[.5,1]])
x

array([[1. , 0.5],
       [0.5, 1. ]])

In [72]:
xInv = np.linalg.inv(x) #Inverse matrix

In [73]:
xInv

array([[ 1.33333333, -0.66666667],
       [-0.66666667,  1.33333333]])

# An Example using Numpy: Regression

Let's take a look at the following linear equation system.

\begin{align}
 b_1 + b_2 = 5 \\
 b_1 + b_2 = 3 \\
 b_1 + b_2 = 7 \\
 b_2 = 1
 \end{align}

In matrix form,

\begin{equation*}
\begin{bmatrix}
1 & 1 \\
1 & 1 \\
1 & 1 \\
0 & 1
\end{bmatrix}
\begin{bmatrix}
b_1 \\
b_2 \\
\end{bmatrix} =
\begin{bmatrix}
5 \\
3 \\
7 \\
1
\end{bmatrix}
\end{equation*}
or
$Xb = y$

As you can see, this system has no solution. But what if we think of $X$ as input data and $y$ as outcome data? For instance, the first column of $X$ is an indicator that shows whether a company makes positive profits, and the second column of $X$ indicates whether the company pays a dividend. Suppose that $y$ is the current stock price of the four companies.

The dimension of $X$ is $4 \times 2$. So, we can reduce the dimension by multiplying both sides by $X^T$ (transpose):

\begin{align}
X^T X b = X^T y ⟹ b = (X^T X)^{-1} X^T y
\end{align}

This is the punchline of regression, which is a form of projection to find a solution from this type of equation systems.

Let's code this in the below.

In [74]:
X = np.array([1,1,1,1,1,1,0,1])

In [75]:
X = np.reshape(X,(4,2)) # Alternatively, X = X.reshape([4,2])
X

array([[1, 1],
       [1, 1],
       [1, 1],
       [0, 1]])

In [76]:
y = np.array([5,3,7,1])
y = y.reshape([4,1])
y

array([[5],
       [3],
       [7],
       [1]])

Because it is annoying to keep using np.linalg.inv if we need to keep using the inverse function, we can use 'from numpy.linalg import inv'.

In [77]:
from numpy.linalg import inv

# Normal-equation solution b = (X^T X)^{-1} X^T y, as derived in the text above.
# X is the 4x2 input matrix and y the 4x1 outcome vector built in the previous cells.
b = inv(X.T @ X) @ (X.T @ y)

print(b)

[[4.]
 [1.]]


# An Example with Numpy: Black-Scholes Model of European Option Pricing (Monte Carlo Simulation Method)
The underlying stock (asset) price at time $T$ is described by the following equation:

$ S_T = S_0 \exp \left( (r- \frac{1}{2}\sigma^2)T + \sigma \sqrt{T} z \right)$, where $z$ is a standard normal random variable (so that $\sqrt{T} z$ has the distribution of a Brownian motion at time $T$) under the risk-neutral ($Q$) measure, and $r$ is the constant risk-free rate.

Then, the value today (time $0$) of a derivative asset such as a European call option with strike price $K$ and expiry $T$ under the no-arbitrage condition is

\begin{equation}
C_0 = e^{-rT} E_0^Q \left( max(S_{T}-K,0)\right)
\end{equation}

There are several ways to solve the above problem, but a simulation-based method uses the ideas that i) the expected value is analogous to the sample average (analogy principle), and ii) the computer can simulate many sample paths to compute such averages. That is, we simulate $S$ up to time $T$ for $I$ times and then compute the average of the option payoff:

\begin{equation}
C_0 \approx e^{-rT} \frac{1}{I} \sum_{i=1}^{I}( max(S_{T}(i)-K,0))
\end{equation}



In [78]:
from math import sqrt, log

# Model inputs: initial stock price, strike, expiry, risk-free rate, volatility
S0 = 100.0
K = 101.0
T = 1.0
r = 0.02
sigma = 0.15

# Number of simulated terminal prices
I = 500000

# One standard-normal draw per simulated path
z = np.random.standard_normal(I)

# Terminal stock price S_T for every draw (formula stated in the text above)
ST = S0 * np.exp((r - 0.5 * sigma**2) * T + sigma * sqrt(T) * z)

# Call payoff max(S_T - K, 0) on each path
hT = np.maximum(ST - K, 0)

# Discounted sample average of the payoffs approximates C_0
C0 = np.exp(-r * T) * hT.mean()

print('Value of the European Call option %5.3f.' % C0)



Value of the European Call option 6.464.


Compare with the actual formula

In [79]:
from scipy.stats import norm

def call(S, K, T, r, sigma):
    """Return the closed-form (Black-Scholes) price of a European call.

    Parameters
    ----------
    S : stock price
    K : strike price
    T : maturity
    r : continuously compounded rate
    sigma : volatility of stock price

    Each argument may be a scalar or a NumPy array; arrays are combined
    element-wise under the usual broadcasting rules, so a whole grid of
    prices can be computed in one call.
    """
    # NumPy ufuncs (instead of math.log / math.sqrt) accept both scalars
    # and arrays and do not depend on an import made in another cell.
    vol_sqrt_t = sigma * np.sqrt(T)
    d1 = (np.log(S / K) + (r + sigma**2 / 2.0) * T) / vol_sqrt_t
    d2 = d1 - vol_sqrt_t
    return S * norm.cdf(d1) - K * np.exp(-r * T) * norm.cdf(d2)

def main():
    """Print the value returned by ``call`` for S=100, K=101, T=1.0, r=0.02, sigma=0.15."""
    price = call(S=100, K=101, T=1.0, r=0.02, sigma=0.15)
    print(price)

In [80]:
main()

6.461925840199832


Question: Can we use the same method to price American-style derivatives, in which the exercise timing is also to be chosen?



\begin{equation}
V_0 = \sup_{\tau \in \{0,\Delta t, 2 \Delta t,...,T \}} E_0^Q \left( e^{-r\tau} F_{\tau}(S_{\tau})\right)
\end{equation}

 After all, we can think recursively of the structure of this problem by solving backwards.

At each given point of time $t$ and the price of the underlying asset $s$,

\begin{equation}
V_t(s) = max (F_t(s),C_t(s)),
\end{equation}
where $C_t(s) = E_t^Q(e^{-r \Delta t} V_{t+\Delta t}(S_{t+\Delta t})|S_t=s)$ is the continuation value of the option given $S_t = s$

Let's try solving for this after we learn function and loops.



Question: Can we extend the above model to include a stochastic and time-varying volatility? e.g., Heston model or k-regime volatility model

Yes. We can try an intuitive one. For instance, there are two possible volatility values $\sigma_{low}$ and $\sigma_{high}$ with the latter being higher than the former. In most cases, stock volatility is low, but from time to time, volatility is high and persistent for a while.



# Pandas

## Data Structures

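- Series, DataFrames, and Panels (note: Panel was deprecated and has been removed from recent pandas versions; Series and DataFrame are the two core structures)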
- A Series behaves similar to a NumPy array.
- We can set up a Series via a list, tuple, array, or a dictionary.
- A Series has another column, called an index, which makes important differences.


In [81]:
import pandas as pd

In [82]:
a = np.array([0.1, 1.2, 2.3, 3.4, 4.5])
a

array([0.1, 1.2, 2.3, 3.4, 4.5])

In [83]:
s=pd.Series(a)
s

0    0.1
1    1.2
2    2.3
3    3.4
4    4.5
dtype: float64

In [84]:
s = pd.Series([0.1, 1.2, 2.3, 3.4, 4.5], index = ['a','b','c','d','e'])

In [85]:
s

a    0.1
b    1.2
c    2.3
d    3.4
e    4.5
dtype: float64

In [86]:
s['a']

0.1

In [87]:
s[0]

0.1

In [88]:
s.iloc[0]

0.1

In [89]:
s.loc['a']

0.1

In [90]:
s[['a','d']]

a    0.1
d    3.4
dtype: float64

In [91]:
s.iloc[:3]

a    0.1
b    1.2
c    2.3
dtype: float64

In [92]:
s.loc[['a','d']]

a    0.1
d    3.4
dtype: float64

In [93]:
s.iloc[[0,3]]

a    0.1
d    3.4
dtype: float64

In [94]:
s1 = pd.Series([0.1, 1.2, 2.3, 3.4, 4.5], index = ['a','b','c','a','b'])

In [95]:
s1.loc['a']

a    0.1
a    3.4
dtype: float64

In [96]:
s1.describe() # Create summary statistics

count    5.000000
mean     2.300000
std      1.739253
min      0.100000
25%      1.200000
50%      2.300000
75%      3.400000
max      4.500000
dtype: float64

In [97]:
s2 = pd.Series(np.arange(1.0,4.0),index=['a','b','c'])
s2

a    1.0
b    2.0
c    3.0
dtype: float64

In [98]:
s3 = pd.Series(np.arange(1.0,4.0),index=['c','d','e'])

In [99]:
s4 = s2 + s3

In [100]:
s4

a    NaN
b    NaN
c    4.0
d    NaN
e    NaN
dtype: float64

In [101]:
s4.dropna()

c    4.0
dtype: float64

In [102]:
# You can drop a specific element using drop()
s4.drop('a')

b    NaN
c    4.0
d    NaN
e    NaN
dtype: float64

In [103]:
s4.drop(['d','e'])

a    NaN
b    NaN
c    4.0
dtype: float64

In [104]:
# You can fill all null values in a series with a specific value

s4.fillna(0.0)

a    0.0
b    0.0
c    4.0
d    0.0
e    0.0
dtype: float64

In [105]:
# Series.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat joins the two Series end-to-end and returns a new Series.
pd.concat([s4, pd.Series([5,4,3,2,1],index=['f','g','h','k','l'])])

  s4.append(pd.Series([5,4,3,2,1],index=['f','g','h','k','l']))


a    NaN
b    NaN
c    4.0
d    NaN
e    NaN
f    5.0
g    4.0
h    3.0
k    2.0
l    1.0
dtype: float64

In [106]:
# update

s1 = pd.Series(np.arange(1.0,4.0),index=['a','b','c'])
s1

a    1.0
b    2.0
c    3.0
dtype: float64

In [107]:
s2 = pd.Series(-1.0*np.arange(1.0,4.0),index=['c','d','e'])

In [108]:
s1.update(s2)

In [109]:
s1

a    1.0
b    2.0
c   -1.0
dtype: float64

You can create a series using dictionaries.

In [110]:
s = pd.Series({'a':0.1 ,'b': 1.2, 'c': 2.3, 'd':3.4, 'e': 4.5})

In [111]:
s*3

a     0.3
b     3.6
c     6.9
d    10.2
e    13.5
dtype: float64

In [112]:
2*s-2

a   -1.8
b    0.4
c    2.6
d    4.8
e    7.0
dtype: float64

In [113]:
s1 = pd.Series(np.arange(10.0,20.0))

In [114]:
s1.describe()

count    10.00000
mean     14.50000
std       3.02765
min      10.00000
25%      12.25000
50%      14.50000
75%      16.75000
max      19.00000
dtype: float64

In [115]:
sumstat = s1.describe()

In [116]:
sumstat['max']

19.0

In [117]:
# range() hands the loop variable its next value on every pass, so a manual
# increment inside the body would have no effect on the iteration.
for it in range(10):
    print(it)

0
1
2
3
4
5
6
7
8
9
