### Broadcasting
* conceptually duplicates (stretches) a smaller array so we can add and subtract arrays that aren't the same shape
* "The term broadcasting describes how numpy treats arrays with different shapes during arithmetic operations. Subject to certain constraints, the smaller array is “broadcast” across the larger array so that they have compatible shapes." - scipy
* NumPy does not actually duplicate the smaller array; instead, it makes memory and computationally efficient use of existing structures in memory that in effect achieve the same result.

In [36]:
import numpy as np

In [9]:
# one-dimensional case: the scalar 1 is stretched across all four elements
array = np.array([2] * 4)
array + 1

array([3, 3, 3, 3])

In [10]:
# two-dimensional case: the scalar is added to every cell of the 2x2 array
array = np.array([[1, 1], [1, 1]])
array + 1

array([[2, 2],
       [2, 2]])

* The dimensions are considered in reverse order, starting with the trailing dimension; for example, looking at columns before rows in a two-dimensional case.

In [27]:
# two rows of three values each
array = np.array([[1, 1, 3], [2, 2, 3]])

# shape is reported as (rows, columns): here 2 rows and 3 columns
print(array.shape)

(2, 3)


* consider a case where we add a 1 by 2 row vector to a 2 by 2 array

In [46]:
# 2x2 matrix whose rows are [1, 1] and [2, 2]
array = np.array([[1, 1], [2, 2]])

# turn the flat vector [1, 5] into a single row: shape (2,) -> (1, 2)
array1 = np.reshape(np.array([1, 5]), (1, 2))

print(array.shape, array1.shape, sep="\n")

(2, 2)
(1, 2)


In [38]:
array

array([[1, 1],
       [2, 2]])

In [40]:
array1

array([[1, 5]])

In [47]:
# the (1, 2) row vector is added to each row of the 2x2 array
# the trailing (second) dimensions match, so array1 is stretched
# along the first dimension, i.e. it is reused for every row
# think of it as "hopping" down the rows, adding the same vector each time
array + array1

array([[2, 6],
       [3, 7]])

In [41]:
# remember: summing with axis = 1 collapses the column axis,
# so we hop across the columns of each row
# and add those values together, giving one total per row
# axis 1 is the same trailing axis that lined up in the broadcast above
np.sum(array, axis = 1)

array([2, 4])

In [42]:
# same 2x2 matrix as in the row-vector example
array = np.array([[1, 1], [2, 2]])

# this time the flat vector [1, 5] becomes a single column: shape (2,) -> (2, 1)
array1 = np.reshape(np.array([1, 5]), (2, 1))

print(array.shape, array1.shape, sep="\n")

(2, 2)
(2, 1)


In [44]:
array

array([[1, 1],
       [2, 2]])

In [45]:
array1

array([[1],
       [5]])

In [43]:
# if we flip the dimensions of array1 to (2, 1), the behavior changes
# now the first dimension (rows) aligns
# so we "hop" across the columns and broadcast the column vector
array + array1

array([[2, 2],
       [7, 7]])

In [15]:
array

array([[1, 1],
       [2, 2]])

In [14]:
# when we sum over our first axis here (axis = 0)
# we hop down the rows of each column and add them together
# similar to how broadcasting works, we are hopping along
# one axis and reusing the same operation at every step
np.sum(array, axis = 0)

array([3, 3])

* broadcasting can only be performed when, for each dimension (compared starting from the trailing one), the sizes are equal or one of them is 1

In [49]:
# shapes (2,) and (3,): the sizes differ and neither is 1, so broadcasting fails
array = np.array([1,2])
array1 = np.array([1,2,3])

# the failure is the point of this cell, so catch it and show the message
# instead of letting the exception stop a Restart & Run All
try:
    array + array1
except ValueError as err:
    print("ValueError: {}".format(err))

ValueError: operands could not be broadcast together with shapes (2,) (3,) 

In [12]:
array = np.array([
    [1,1,1,1],
    [1,1,1,1]
])

# a (2, 2) array can't just be expanded or doubled to fit (2, 4):
# the trailing dimensions are 4 and 2, which are neither equal nor 1
array1 = np.array([
    [1,1],
    [1,1]
])

# the failure is the point of this cell, so catch it and show the message
# instead of letting the exception stop a Restart & Run All
try:
    array + array1
except ValueError as err:
    print("ValueError: {}".format(err))

ValueError: operands could not be broadcast together with shapes (2,4) (2,2) 

In [15]:
# (2, 4) target array of ones
array = np.array([[1, 1, 1, 1], [1, 1, 1, 1]])

# a (2, 2) block can't just be expanded or doubled to match (2, 4) ...
array1 = np.array([[1, 1], [1, 1]])

# ... but reshaped into a single (1, 4) row it lines up with the four columns
array1 = np.reshape(array1, (1, 4))

array + array1

array([[2, 2, 2, 2],
       [2, 2, 2, 2]])

In [48]:
array = np.array([
    [1,1,1,1],
    [1,1,1,1]
])

# a (2, 1) column vector; it is not used in the sum below
array1 = np.array([
    [1],
    [1]
])

# broadcasting a scalar does the same thing as visiting every item
# and adding 5 to it by hand, which is what this explicit version spells out
looped = np.array([[item + 5 for item in row] for row in array])
assert (looped == array + 5).all()

# keep the broadcast sum as the last expression so the cell displays it
array + 5

array([[6, 6, 6, 6],
       [6, 6, 6, 6]])

* play around with it

### Apply Pandas

In [1]:
import pandas as pd

In [53]:
# two rows, three columns of integers
data = [[1, 2, 3], [4, 5, 6]]

# name the columns while building the frame
df = pd.DataFrame(data, columns=["a", "b", "c"])
df

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6


In [62]:
# with axis=0, apply hands each column to the function as a pandas Series
l = lambda column: type(column)
df.apply(l, axis=0)

a    <class 'pandas.core.series.Series'>
b    <class 'pandas.core.series.Series'>
c    <class 'pandas.core.series.Series'>
dtype: object

In [64]:
# each column arrives as a Series, so bracket notation reaches its elements
# here we pick the entry labelled 0 (the first row) out of every column
# think of this as slice 1
l = lambda column: column[0]
df.apply(l, axis=0)

a    1
b    2
c    3
dtype: int64

In [65]:
# and slice two: the entry labelled 1 (the second row) of every column
l = lambda column: column[1]
df.apply(l, axis=0)

a    4
b    5
c    6
dtype: int64

In [66]:
# now with rows: axis=1 hands each row to the function, again as a Series
l = lambda row: type(row)
df.apply(l, axis=1)

0    <class 'pandas.core.series.Series'>
1    <class 'pandas.core.series.Series'>
dtype: object

In [67]:
# a row Series is indexed by column name, so this picks "a" out of every row
l = lambda row: row["a"]
df.apply(l, axis=1)

0    1
1    4
dtype: int64

In [68]:
# same idea for column "b": one value per row
l = lambda row: row["b"]
df.apply(l, axis=1)

0    2
1    5
dtype: int64

In [70]:
# let's see it in action: two numeric columns plus a string label column
data = [[1, 2, "A"], [4, 5, "B"], [1, 2, "C"]]

df = pd.DataFrame(data, columns=["a", "b", "c"])
df

Unnamed: 0,a,b,c
0,1,2,A
1,4,5,B
2,1,2,C


In [71]:
def f(x):
    """Describe a label: "A" and "B" get their own text, anything else gets "this is c"."""
    if x == "A":
        return "this is a"
    if x == "B":
        return "this is b"
    # fallback for every other value, not only "C"
    return "this is c"

In [84]:
# apply f to every value of column "c" and store the results in a new column "new_col"
df["new_col"] = df["c"].apply(f)
df

Unnamed: 0,a,b,c,new_col
0,1,2,A,this is a
1,4,5,B,this is b
2,1,2,C,this is c


In [90]:
# element-wise apply on one column: append " x" to every value in "c"
l = lambda value: value + " x"
df["c"].apply(l)

0    A x
1    B x
2    C x
Name: c, dtype: object

### Piping

In [19]:
# three rows, two integer columns
data = [[1, 2], [4, 5], [1, 2]]

df = pd.DataFrame(data, columns=["a", "b"])
df

Unnamed: 0,a,b
0,1,2
1,4,5
2,1,2


In [20]:
# each lambda builds a new frame and the chained result is not assigned,
# so df itself is displayed unchanged
df.pipe(lambda frame: frame + 2).pipe(lambda frame: frame - 5)
df

Unnamed: 0,a,b
0,1,2
1,4,5
2,1,2


In [29]:
# pipe passes the DataFrame itself as the first argument of the function,
# so each function can be thought of as one step that transforms the frame
data = [
    [1,2],
    [4,5],
    [1,2]
]

df = pd.DataFrame(data)
df.columns = ["a", "b"]

def fa(x):
    """Return a new frame equal to ``x`` with every value in column "a" set to 5."""
    return x.assign(a=5)

def fb(x):
    """Return a new frame equal to ``x`` with every value in column "b" set to 10."""
    return x.assign(b=10)


# fa and fb return new frames instead of mutating their input,
# so the piped result has to be kept to see the change
df = df.pipe(fa).pipe(fb)
df

Unnamed: 0,a,b
0,5,10
1,5,10
2,5,10


In [31]:
# rebuild the starting frame
data = [[1, 2], [4, 5], [1, 2]]
df = pd.DataFrame(data, columns=["a", "b"])

# same steps as the pipe chain, written as plain nested function calls
df = fb(fa(df))
df

Unnamed: 0,a,b
0,5,10
1,5,10
2,5,10


In [None]:
# pipe on single columns

In [106]:
def f(x, i):
    """Return ``x`` with ``i`` subtracted from it."""
    shifted = x - i
    return shifted

In [108]:
# extra keyword arguments given to pipe are forwarded to the function, so this evaluates f(df, i=10)
# NOTE(review): the output shown below equals the [1,2],[4,5],[1,2] values minus 10, so it appears
# to come from a df other than the 5/10 frame built in the preceding cell -- re-run top to bottom to confirm
df.pipe(f,i = 10)

Unnamed: 0,a,b
0,-9,-8
1,-6,-5
2,-9,-8


### Rolling Functions

* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rolling.html

In [114]:
# let's see it in action: six periods of values for a single id
data = [
    [1, "A", "1"], [4, "A", "2"], [1, "A", "3"],
    [1, "A", "4"], [4, "A", "5"], [1, "A", "6"],
]

df = pd.DataFrame(data, columns=["val", "id", "period"])
df

Unnamed: 0,val,id,period
0,1,A,1
1,4,A,2
2,1,A,3
3,1,A,4
4,4,A,5
5,1,A,6


In [116]:
df["val"].rolling(2).sum()

0    NaN
1    5.0
2    5.0
3    2.0
4    5.0
5    5.0
Name: val, dtype: float64

In [117]:
df["val"].rolling(3).sum()

0    NaN
1    NaN
2    6.0
3    6.0
4    6.0
5    6.0
Name: val, dtype: float64

In [118]:
df["val"].rolling(2).mean()

0    NaN
1    2.5
2    2.5
3    1.0
4    2.5
5    2.5
Name: val, dtype: float64

In [119]:
df["val"].rolling(4).mean()

0     NaN
1     NaN
2     NaN
3    1.75
4    2.50
5    1.75
Name: val, dtype: float64

In [120]:
df["val"].rolling(2).median()

0    NaN
1    2.5
2    2.5
3    1.0
4    2.5
5    2.5
Name: val, dtype: float64

In [121]:
df["rolling_mean"] = df["val"].rolling(2).mean()

In [122]:
df

Unnamed: 0,val,id,period,rolling_mean
0,1,A,1,
1,4,A,2,2.5
2,1,A,3,2.5
3,1,A,4,1.0
4,4,A,5,2.5
5,1,A,6,2.5


In [125]:
# add one rolling-mean column per window size, named after the window
periods = [1, 2, 3, 4]
for p in periods:
    name = "rolling_mean_p_{}".format(p)
    df[name] = df["val"].rolling(window=p).mean()

In [126]:
df

Unnamed: 0,val,id,period,rolling_mean,rolling_mean_p_1,rolling_mean_p_2,rolling_mean_p_3,rolling_mean_p_4
0,1,A,1,,1.0,,,
1,4,A,2,2.5,4.0,2.5,,
2,1,A,3,2.5,1.0,2.5,2.0,
3,1,A,4,1.0,1.0,1.0,2.0,1.75
4,4,A,5,2.5,4.0,2.5,2.0,2.5
5,1,A,6,2.5,1.0,2.5,2.0,1.75


### Window Functions

In [27]:
# let's see it in action: six periods each for ids "A" and "B"
data = [
    [1, "A", "1"], [4, "A", "2"], [1, "A", "3"],
    [1, "A", "4"], [4, "A", "5"], [1, "A", "6"],
    [11, "B", "1"], [4, "B", "2"], [15, "B", "3"],
    [8, "B", "4"], [7, "B", "5"], [4, "B", "6"],
]

df = pd.DataFrame(data, columns=["val", "id", "period"])
df

Unnamed: 0,val,id,period
0,1,A,1
1,4,A,2
2,1,A,3
3,1,A,4
4,4,A,5
5,1,A,6
6,11,B,1
7,4,B,2
8,15,B,3
9,8,B,4


In [33]:
# rank val inside each id group; method="min" gives tied values the lowest rank they share
df["rank"] = df.groupby("id")["val"].rank(method="min")
df.sort_values(by=["id", "rank"])

Unnamed: 0,val,id,period,rank
0,1,A,1,1.0
2,1,A,3,1.0
3,1,A,4,1.0
5,1,A,6,1.0
1,4,A,2,5.0
4,4,A,5,5.0
7,4,B,2,1.0
11,4,B,6,1.0
10,7,B,5,3.0
9,8,B,4,4.0


In [7]:
df.groupby('id')['val'].rolling(2).sum().reset_index()

Unnamed: 0,id,level_1,val
0,A,0,
1,A,1,5.0
2,A,2,5.0
3,A,3,2.0
4,A,4,5.0
5,A,5,5.0
6,B,6,
7,B,7,15.0
8,B,8,19.0
9,B,9,23.0


### Time Series in Pandas
* https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html

In [3]:
import pandas as pd

In [17]:
# (date string, metric value) pairs; the dates are still plain strings at this point
data = [
    ["2016-01-01", 10], ["2016-02-01", 20], ["2017-05-01", 15],
    ["2018-01-01", 25], ["2019-01-01", 31], ["2020-01-01", 42],
]

df = pd.DataFrame(data=data, columns=["date", "metric"])

In [18]:
df.dtypes

date      object
metric     int64
dtype: object

In [19]:
df["date"] = pd.to_datetime(df["date"])

In [20]:
df.dtypes

date      datetime64[ns]
metric             int64
dtype: object

In [21]:
df

Unnamed: 0,date,metric
0,2016-01-01,10
1,2016-02-01,20
2,2017-05-01,15
3,2018-01-01,25
4,2019-01-01,31
5,2020-01-01,42


In [22]:
df = df.set_index("date")

In [23]:
df

Unnamed: 0_level_0,metric
date,Unnamed: 1_level_1
2016-01-01,10
2016-02-01,20
2017-05-01,15
2018-01-01,25
2019-01-01,31
2020-01-01,42


In [24]:
df.index

DatetimeIndex(['2016-01-01', '2016-02-01', '2017-05-01', '2018-01-01',
               '2019-01-01', '2020-01-01'],
              dtype='datetime64[ns]', name='date', freq=None)

In [26]:
# partial string indexing on the DatetimeIndex: select every row dated in 2016
# row selection by label goes through .loc; plain df["2016"] is treated as a column lookup
df.loc["2016"]

Unnamed: 0_level_0,metric
date,Unnamed: 1_level_1
2016-01-01,10
2016-02-01,20


### Raw Strings

In [8]:
print("hello there \nthis will do a return")

hello there 
this will do a return


In [7]:
print(r"hello there \nthis will not do a return")

hello there \nthis will not do a return


In [4]:
print("\t this will tab")

	 this will tab


In [6]:
print(r"\t this will not tab")

\t this will not tab


### Printing in Jupyter

In [None]:
x = 2
y = 2

x
y